1/*
2 * kmp_barrier.cpp
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_wait_release.h"
18#include "kmp_stats.h"
19#include "kmp_itt.h"
20
21#if KMP_MIC
22#include <immintrin.h>
23#define USE_NGO_STORES 1
24#endif // KMP_MIC
25
26#if KMP_MIC && USE_NGO_STORES
27// ICV copying
28#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
29#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
32#else
33#define ngo_load(src) ((void)0)
34#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
35#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
36#define ngo_sync() ((void)0)
37#endif /* KMP_MIC && USE_NGO_STORES */
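// On KMP_MIC with USE_NGO_STORES, the ngo_* macros above broadcast a cache line of ICVs (with
// the b_go flag piggybacked in the same line) using non-globally-ordered 512-bit streaming
// stores, and ngo_sync() issues a locked instruction as the fence that makes them visible.
// On every other target they reduce to copy_icvs()/KMP_MEMCPY plus a no-op sync.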
38
39void __kmp_print_structure(void); // Forward declaration
40
41// ---------------------------- Barrier Algorithms ----------------------------
42
43// Linear Barrier
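// In the linear gather, every worker simply flags its own b_arrived and the master polls each
// worker's flag in turn (folding reduce_data along the way when a reduce callback is given);
// in the linear release, the master bumps every worker's b_go flag one by one. The master does
// O(nproc) serial work, which is what the tree/hyper/hierarchical variants below improve on.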
44static void
45__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46 void (*reduce)(void *, void *)
47 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48{
49    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
50    register kmp_team_t *team = this_thr->th.th_team;
51 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52 register kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55 gtid, team->t.t_id, tid, bt));
56 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57
58#if USE_ITT_BUILD && USE_ITT_NOTIFY
59 // Barrier imbalance - save arrive time to the thread
60    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
61        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62 }
63#endif
64 // We now perform a linear reduction to signal that all of the threads have arrived.
65 if (!KMP_MASTER_TID(tid)) {
66 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
67 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
68 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70 // Mark arrival to master thread
71 /* After performing this write, a worker thread may not assume that the team is valid
72 any more - it could be deallocated by the master thread at any time. */
73 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74 flag.release();
75 } else {
76 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77 register int nproc = this_thr->th.th_team_nproc;
78 register int i;
79        // No need to worry about the sleep bit or atomic update here, since this is the team-level setting
80 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81
82 // Collect all the worker team member threads.
83 for (i=1; i<nproc; ++i) {
84#if KMP_CACHE_MANAGE
85 // Prefetch next thread's arrived count
86 if (i+1 < nproc)
87 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88#endif /* KMP_CACHE_MANAGE */
89 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
91 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93
94 // Wait for worker thread to arrive
95 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96 flag.wait(this_thr, FALSE
97 USE_ITT_BUILD_ARG(itt_sync_obj) );
98#if USE_ITT_BUILD && USE_ITT_NOTIFY
99 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
100            if (__kmp_forkjoin_frames_mode == 2) {
101                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102 other_threads[i]->th.th_bar_min_time);
103 }
104#endif
105 if (reduce) {
106 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108 (*reduce)(this_thr->th.th_local.reduce_data,
109 other_threads[i]->th.th_local.reduce_data);
110 }
111 }
112        // No need to worry about the sleep bit or atomic update here, since this is the team-level setting
113 team_bar->b_arrived = new_state;
114 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
115 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116 }
117 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118 gtid, team->t.t_id, tid, bt));
119}
120
121static void
122__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123 int propagate_icvs
124 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125{
126    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
127    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128 register kmp_team_t *team;
129
130 if (KMP_MASTER_TID(tid)) {
131 register unsigned int i;
132 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133 register kmp_info_t **other_threads;
134
135 team = __kmp_threads[gtid]->th.th_team;
136 KMP_DEBUG_ASSERT(team != NULL);
137 other_threads = team->t.t_threads;
138
139 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140 gtid, team->t.t_id, tid, bt));
141
142 if (nproc > 1) {
143#if KMP_BARRIER_ICV_PUSH
144            {
145 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
146 if (propagate_icvs) {
147 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
148 for (i=1; i<nproc; ++i) {
149 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
150 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
151 &team->t.t_implicit_task_taskdata[0].td_icvs);
152 }
153 ngo_sync();
154                }
155            }
156#endif // KMP_BARRIER_ICV_PUSH
157
158 // Now, release all of the worker threads
159 for (i=1; i<nproc; ++i) {
160#if KMP_CACHE_MANAGE
161 // Prefetch next thread's go flag
162 if (i+1 < nproc)
163 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
164#endif /* KMP_CACHE_MANAGE */
165 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
166 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
167 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
168 &other_threads[i]->th.th_bar[bt].bb.b_go,
169 other_threads[i]->th.th_bar[bt].bb.b_go,
170 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
171 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
172 flag.release();
173 }
174 }
175 } else { // Wait for the MASTER thread to release us
176 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
177 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
178 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
179 flag.wait(this_thr, TRUE
180 USE_ITT_BUILD_ARG(itt_sync_obj) );
181#if USE_ITT_BUILD && USE_ITT_NOTIFY
182 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
183 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
184 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
185 // Cancel wait on previous parallel region...
186 __kmp_itt_task_starting(itt_sync_obj);
187
188 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
189 return;
190
191 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
192 if (itt_sync_obj != NULL)
193 // Call prepare as early as possible for "new" barrier
194 __kmp_itt_task_finished(itt_sync_obj);
195 } else
196#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
197 // Early exit for reaping threads releasing forkjoin barrier
198 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
199 return;
200 // The worker thread may now assume that the team is valid.
201#ifdef KMP_DEBUG
202 tid = __kmp_tid_from_gtid(gtid);
203 team = __kmp_threads[gtid]->th.th_team;
204#endif
205 KMP_DEBUG_ASSERT(team != NULL);
206 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
207 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
208 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
209 KMP_MB(); // Flush all pending memory write invalidates.
210 }
211 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
212 gtid, team->t.t_id, tid, bt));
213}
214
215// Tree barrier
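// The tree barrier embeds a branch_factor-ary tree in the tid space: the children of tid are
// (tid << branch_bits) + 1 .. (tid << branch_bits) + branch_factor, and the parent of a worker
// is (tid - 1) >> branch_bits. For illustration, with branch_bits == 2 (branch_factor == 4) and
// nproc == 16, tid 0 gathers from 1-4, tid 1 from 5-8, tid 3 from 13-15 (capped by nproc), and
// tid 7 reports its arrival to parent (7 - 1) >> 2 == 1; the release walks the same tree top-down.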
216static void
217__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
218 void (*reduce)(void *, void *)
219 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
220{
221    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
222    register kmp_team_t *team = this_thr->th.th_team;
223 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
224 register kmp_info_t **other_threads = team->t.t_threads;
225 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
226 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
227 register kmp_uint32 branch_factor = 1 << branch_bits;
228 register kmp_uint32 child;
229 register kmp_uint32 child_tid;
230 register kmp_uint new_state;
231
232 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
233 gtid, team->t.t_id, tid, bt));
234 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
235
236#if USE_ITT_BUILD && USE_ITT_NOTIFY
237 // Barrier imbalance - save arrive time to the thread
238    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
239        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
240 }
241#endif
242 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
243 child_tid = (tid << branch_bits) + 1;
244 if (child_tid < nproc) {
245 // Parent threads wait for all their children to arrive
246 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
247 child = 1;
248 do {
249 register kmp_info_t *child_thr = other_threads[child_tid];
250 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
251#if KMP_CACHE_MANAGE
252 // Prefetch next thread's arrived count
253 if (child+1 <= branch_factor && child_tid+1 < nproc)
254 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
255#endif /* KMP_CACHE_MANAGE */
256 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
257 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
258 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
259 &child_bar->b_arrived, new_state));
260 // Wait for child to arrive
261 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
262 flag.wait(this_thr, FALSE
263 USE_ITT_BUILD_ARG(itt_sync_obj) );
264#if USE_ITT_BUILD && USE_ITT_NOTIFY
265 // Barrier imbalance - write min of the thread time and a child time to the thread.
266            if (__kmp_forkjoin_frames_mode == 2) {
267                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
268 child_thr->th.th_bar_min_time);
269 }
270#endif
271 if (reduce) {
272 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
273 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
274 team->t.t_id, child_tid));
275 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
276 }
277 child++;
278 child_tid++;
279 }
280 while (child <= branch_factor && child_tid < nproc);
281 }
282
283 if (!KMP_MASTER_TID(tid)) { // Worker threads
284 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
285
286 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
287 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
288 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
289 &thr_bar->b_arrived, thr_bar->b_arrived,
290 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
291
292 // Mark arrival to parent thread
293 /* After performing this write, a worker thread may not assume that the team is valid
294 any more - it could be deallocated by the master thread at any time. */
295 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
296 flag.release();
297 } else {
298 // Need to update the team arrived pointer if we are the master thread
299 if (nproc > 1) // New value was already computed above
300 team->t.t_bar[bt].b_arrived = new_state;
301 else
302 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
303 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
304 gtid, team->t.t_id, tid, team->t.t_id,
305 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
306 }
307 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
308 gtid, team->t.t_id, tid, bt));
309}
310
311static void
312__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
313 int propagate_icvs
314 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
315{
316    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
317    register kmp_team_t *team;
318 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
319 register kmp_uint32 nproc;
320 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
321 register kmp_uint32 branch_factor = 1 << branch_bits;
322 register kmp_uint32 child;
323 register kmp_uint32 child_tid;
324
325 // Perform a tree release for all of the threads that have been gathered
326 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
327 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
328 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
329 // Wait for parent thread to release us
330 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
331 flag.wait(this_thr, TRUE
332 USE_ITT_BUILD_ARG(itt_sync_obj) );
333#if USE_ITT_BUILD && USE_ITT_NOTIFY
334 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
335 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
336 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
337 // Cancel wait on previous parallel region...
338 __kmp_itt_task_starting(itt_sync_obj);
339
340 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
341 return;
342
343 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
344 if (itt_sync_obj != NULL)
345 // Call prepare as early as possible for "new" barrier
346 __kmp_itt_task_finished(itt_sync_obj);
347 } else
348#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
349 // Early exit for reaping threads releasing forkjoin barrier
350 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
351 return;
352
353 // The worker thread may now assume that the team is valid.
354 team = __kmp_threads[gtid]->th.th_team;
355 KMP_DEBUG_ASSERT(team != NULL);
356 tid = __kmp_tid_from_gtid(gtid);
357
358 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
359 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
360 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
361 KMP_MB(); // Flush all pending memory write invalidates.
362 } else {
363 team = __kmp_threads[gtid]->th.th_team;
364 KMP_DEBUG_ASSERT(team != NULL);
365 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
366 gtid, team->t.t_id, tid, bt));
367 }
368 nproc = this_thr->th.th_team_nproc;
369 child_tid = (tid << branch_bits) + 1;
370
371 if (child_tid < nproc) {
372 register kmp_info_t **other_threads = team->t.t_threads;
373 child = 1;
374 // Parent threads release all their children
375 do {
376 register kmp_info_t *child_thr = other_threads[child_tid];
377 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
378#if KMP_CACHE_MANAGE
379 // Prefetch next thread's go count
380 if (child+1 <= branch_factor && child_tid+1 < nproc)
381 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
382#endif /* KMP_CACHE_MANAGE */
383
384#if KMP_BARRIER_ICV_PUSH
385            {
386 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
387 if (propagate_icvs) {
388 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
389 team, child_tid, FALSE);
390 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
391 &team->t.t_implicit_task_taskdata[0].td_icvs);
392 }
393            }
394#endif // KMP_BARRIER_ICV_PUSH
395 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
396 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
397 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
398 child_tid, &child_bar->b_go, child_bar->b_go,
399 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
400 // Release child from barrier
401 kmp_flag_64 flag(&child_bar->b_go, child_thr);
402 flag.release();
403 child++;
404 child_tid++;
405 }
406 while (child <= branch_factor && child_tid < nproc);
407 }
408 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
409 gtid, team->t.t_id, tid, bt));
410}
411
412
413// Hyper Barrier
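// The hypercube barrier runs in rounds (level = 0, branch_bits, 2*branch_bits, ...). In a round,
// a thread whose digit ((tid >> level) & (branch_factor - 1)) is nonzero signals its partner
// tid & ~((1 << (level + branch_bits)) - 1) and stops gathering, while the remaining threads
// collect from children at tid + k*(1 << level), k = 1 .. branch_factor-1. For illustration,
// with branch_bits == 1 and nproc == 8 the arrivals flow 1->0, 3->2, 5->4, 7->6, then 2->0 and
// 6->4, then 4->0: roughly log(nproc)/branch_bits rounds instead of one O(nproc) scan.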
414static void
415__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
416 void (*reduce)(void *, void *)
417 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
418{
419    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
420    register kmp_team_t *team = this_thr->th.th_team;
421 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
422 register kmp_info_t **other_threads = team->t.t_threads;
423 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
424 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
425 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
426 register kmp_uint32 branch_factor = 1 << branch_bits;
427 register kmp_uint32 offset;
428 register kmp_uint32 level;
429
430 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
431 gtid, team->t.t_id, tid, bt));
432
433 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
434
435#if USE_ITT_BUILD && USE_ITT_NOTIFY
436 // Barrier imbalance - save arrive time to the thread
437    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
438        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
439 }
440#endif
441 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
442 arrived, and reduce any required data as we go. */
443 kmp_flag_64 p_flag(&thr_bar->b_arrived);
444 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
445 {
446 register kmp_uint32 child;
447 register kmp_uint32 child_tid;
448
449 if (((tid >> level) & (branch_factor - 1)) != 0) {
450 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
451
452 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
453 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
454 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
455 &thr_bar->b_arrived, thr_bar->b_arrived,
456 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
457 // Mark arrival to parent thread
458 /* After performing this write (in the last iteration of the enclosing for loop),
459 a worker thread may not assume that the team is valid any more - it could be
460 deallocated by the master thread at any time. */
461 p_flag.set_waiter(other_threads[parent_tid]);
462 p_flag.release();
463 break;
464 }
465
466 // Parent threads wait for children to arrive
467 if (new_state == KMP_BARRIER_UNUSED_STATE)
468 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
469 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
470 child++, child_tid+=(1 << level))
471 {
472 register kmp_info_t *child_thr = other_threads[child_tid];
473 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474#if KMP_CACHE_MANAGE
475 register kmp_uint32 next_child_tid = child_tid + (1 << level);
476 // Prefetch next thread's arrived count
477 if (child+1 < branch_factor && next_child_tid < num_threads)
478 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
479#endif /* KMP_CACHE_MANAGE */
480 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
481 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
482 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
483 &child_bar->b_arrived, new_state));
484 // Wait for child to arrive
485 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
486 c_flag.wait(this_thr, FALSE
487 USE_ITT_BUILD_ARG(itt_sync_obj) );
488#if USE_ITT_BUILD && USE_ITT_NOTIFY
489 // Barrier imbalance - write min of the thread time and a child time to the thread.
490            if (__kmp_forkjoin_frames_mode == 2) {
491                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
492 child_thr->th.th_bar_min_time);
493 }
494#endif
495 if (reduce) {
496 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
497 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
498 team->t.t_id, child_tid));
499 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
500 }
501 }
502 }
503
504 if (KMP_MASTER_TID(tid)) {
505 // Need to update the team arrived pointer if we are the master thread
506 if (new_state == KMP_BARRIER_UNUSED_STATE)
507 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
508 else
509 team->t.t_bar[bt].b_arrived = new_state;
510 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
511 gtid, team->t.t_id, tid, team->t.t_id,
512 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
513 }
514 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
515 gtid, team->t.t_id, tid, bt));
516}
517
518// The reverse versions seem to beat the forward versions overall
519#define KMP_REVERSE_HYPER_BAR
520static void
521__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
522 int propagate_icvs
523 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
524{
525    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
526    register kmp_team_t *team;
527 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
528 register kmp_info_t **other_threads;
529 register kmp_uint32 num_threads;
530 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
531 register kmp_uint32 branch_factor = 1 << branch_bits;
532 register kmp_uint32 child;
533 register kmp_uint32 child_tid;
534 register kmp_uint32 offset;
535 register kmp_uint32 level;
536
537 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
538 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
539 order of the corresponding gather, otherwise threads are released in the same order. */
540 if (KMP_MASTER_TID(tid)) { // master
541 team = __kmp_threads[gtid]->th.th_team;
542 KMP_DEBUG_ASSERT(team != NULL);
543 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
544 gtid, team->t.t_id, tid, bt));
545#if KMP_BARRIER_ICV_PUSH
546 if (propagate_icvs) { // master already has ICVs in final destination; copy
547 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
548 }
549#endif
550 }
551 else { // Handle fork barrier workers who aren't part of a team yet
552 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
553 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
554 // Wait for parent thread to release us
555 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
556 flag.wait(this_thr, TRUE
557 USE_ITT_BUILD_ARG(itt_sync_obj) );
558#if USE_ITT_BUILD && USE_ITT_NOTIFY
559 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
560 // In fork barrier where we could not get the object reliably
561 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
562 // Cancel wait on previous parallel region...
563 __kmp_itt_task_starting(itt_sync_obj);
564
565 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
566 return;
567
568 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
569 if (itt_sync_obj != NULL)
570 // Call prepare as early as possible for "new" barrier
571 __kmp_itt_task_finished(itt_sync_obj);
572 } else
573#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
574 // Early exit for reaping threads releasing forkjoin barrier
575 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
576 return;
577
578 // The worker thread may now assume that the team is valid.
579 team = __kmp_threads[gtid]->th.th_team;
580 KMP_DEBUG_ASSERT(team != NULL);
581 tid = __kmp_tid_from_gtid(gtid);
582
583 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
584 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
585 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
586 KMP_MB(); // Flush all pending memory write invalidates.
587 }
588 num_threads = this_thr->th.th_team_nproc;
589 other_threads = team->t.t_threads;
590
591#ifdef KMP_REVERSE_HYPER_BAR
592 // Count up to correct level for parent
593 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
594 level+=branch_bits, offset<<=branch_bits);
595
596 // Now go down from there
597 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
598 level-=branch_bits, offset>>=branch_bits)
599#else
600 // Go down the tree, level by level
601 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
602#endif // KMP_REVERSE_HYPER_BAR
603 {
604#ifdef KMP_REVERSE_HYPER_BAR
605 /* Now go in reverse order through the children, highest to lowest.
606 Initial setting of child is conservative here. */
607 child = num_threads >> ((level==0)?level:level-1);
608 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
609 child>=1; child--, child_tid-=(1<<level))
610#else
611 if (((tid >> level) & (branch_factor - 1)) != 0)
612 // No need to go lower than this, since this is the level parent would be notified
613 break;
614 // Iterate through children on this level of the tree
615 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
616 child++, child_tid+=(1<<level))
617#endif // KMP_REVERSE_HYPER_BAR
618 {
619 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
620 else {
621 register kmp_info_t *child_thr = other_threads[child_tid];
622 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
623#if KMP_CACHE_MANAGE
624 register kmp_uint32 next_child_tid = child_tid - (1 << level);
625 // Prefetch next thread's go count
626# ifdef KMP_REVERSE_HYPER_BAR
627 if (child-1 >= 1 && next_child_tid < num_threads)
628# else
629 if (child+1 < branch_factor && next_child_tid < num_threads)
630# endif // KMP_REVERSE_HYPER_BAR
631 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
632#endif /* KMP_CACHE_MANAGE */
633
634#if KMP_BARRIER_ICV_PUSH
635 if (propagate_icvs) // push my fixed ICVs to my child
636 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
637#endif // KMP_BARRIER_ICV_PUSH
638
639 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
640 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
641 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
642 child_tid, &child_bar->b_go, child_bar->b_go,
643 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
644 // Release child from barrier
645 kmp_flag_64 flag(&child_bar->b_go, child_thr);
646 flag.release();
647 }
648 }
649 }
650#if KMP_BARRIER_ICV_PUSH
651 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
652 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
653 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
654 }
655#endif
656 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
657 gtid, team->t.t_id, tid, bt));
658}
659
660// Hierarchical Barrier
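// The hierarchical barrier arranges the team according to the machine hierarchy (see
// __kmp_get_hierarchy). With infinite blocktime in a non-nested team it uses the "oncore"
// scheme: leaf (on-core) children check in by setting single bytes of their parent's 64-bit
// flag word, so the parent waits on one location rather than one flag per child, and a parent
// can release all of its leaf children with a single store to its own b_go. Higher levels,
// nested teams, and the finite-blocktime path fall back to the per-thread b_arrived/b_go flags
// used by the other barrier algorithms.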
661
662// Initialize thread barrier data
663/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
664 minimum amount of initialization required based on how the team has changed. Returns true if
665 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
666 team size increases, threads already in the team will respond to on-core wakeup on their parent
667   thread, but threads newly added to the team will only be listening on their local b_go. */
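// Leaf check-in layout: leaf child k (1 <= k <= leaf_kids) of parent_tid has tid ==
// parent_tid + k and offset == 7 - (k - 1), i.e. the first child owns byte 7 of the parent's
// 64-bit flag word, the second byte 6, and so on; the parent's leaf_state has exactly those
// bytes set to 1. A leaf therefore signals arrival through a single byte of the parent's
// b_arrived word, and the parent clears all leaf bits at once with KMP_TEST_THEN_AND64.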
668static bool
669__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
670 int gtid, int tid, kmp_team_t *team)
671{
672 // Checks to determine if (re-)initialization is needed
673 bool uninitialized = thr_bar->team == NULL;
674 bool team_changed = team != thr_bar->team;
675 bool team_sz_changed = nproc != thr_bar->nproc;
676 bool tid_changed = tid != thr_bar->old_tid;
677 bool retval = false;
678
679 if (uninitialized || team_sz_changed) {
680 __kmp_get_hierarchy(nproc, thr_bar);
681 }
682
683 if (uninitialized || team_sz_changed || tid_changed) {
684 thr_bar->my_level = thr_bar->depth-1; // default for master
685 thr_bar->parent_tid = -1; // default for master
686 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
687 kmp_uint32 d=0;
688 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
689 kmp_uint32 rem;
690 if (d == thr_bar->depth-2) { // reached level right below the master
691 thr_bar->parent_tid = 0;
692 thr_bar->my_level = d;
693 break;
694 }
695 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
696 // thread is not a subtree root at next level, so this is max
697 thr_bar->parent_tid = tid - rem;
698 thr_bar->my_level = d;
699 break;
700 }
701 ++d;
702 }
703 }
704 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
705 thr_bar->old_tid = tid;
706 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
707 }
708 if (uninitialized || team_changed || tid_changed) {
709 thr_bar->team = team;
710 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711 retval = true;
712 }
713 if (uninitialized || team_sz_changed || tid_changed) {
714 thr_bar->nproc = nproc;
715 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
716 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
717 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
718 thr_bar->leaf_kids = nproc - tid - 1;
719 thr_bar->leaf_state = 0;
720 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
721 }
722 return retval;
723}
724
725static void
726__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
727 int gtid, int tid, void (*reduce) (void *, void *)
728 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
729{
730    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
731    register kmp_team_t *team = this_thr->th.th_team;
732 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
733 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
734 register kmp_info_t **other_threads = team->t.t_threads;
735 register kmp_uint64 new_state;
736
737    int level = team->t.t_level;
738 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
739 if (this_thr->th.th_teams_size.nteams > 1)
740 ++level; // level was not increased in teams construct for team_of_masters
741 if (level == 1) thr_bar->use_oncore_barrier = 1;
742    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
743
744 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
745 gtid, team->t.t_id, tid, bt));
746 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
747
748#if USE_ITT_BUILD && USE_ITT_NOTIFY
749 // Barrier imbalance - save arrive time to the thread
750 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
751 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
752 }
753#endif
754
755    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
756
757 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
758 register kmp_int32 child_tid;
759 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
760 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
761 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
762 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
763 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
764 flag.wait(this_thr, FALSE
765 USE_ITT_BUILD_ARG(itt_sync_obj) );
766 if (reduce) {
767 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
768 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
769 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
770 team->t.t_id, child_tid));
771 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
772 }
773 }
774 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
775 }
776 // Next, wait for higher level children on each child's b_arrived flag
777 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
778 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
779 if (last > nproc) last = nproc;
780 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
781 register kmp_info_t *child_thr = other_threads[child_tid];
782 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
783 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
784 "arrived(%p) == %u\n",
785 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
786 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
787 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
788 flag.wait(this_thr, FALSE
789 USE_ITT_BUILD_ARG(itt_sync_obj) );
790 if (reduce) {
791 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
792 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
793 team->t.t_id, child_tid));
794 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
795 }
796 }
797 }
798 }
799 else { // Blocktime is not infinite
800 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
801 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
802 if (last > nproc) last = nproc;
803 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
804 register kmp_info_t *child_thr = other_threads[child_tid];
805 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
806 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
807 "arrived(%p) == %u\n",
808 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
809 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
810 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
811 flag.wait(this_thr, FALSE
812 USE_ITT_BUILD_ARG(itt_sync_obj) );
813 if (reduce) {
814 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
815 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
816 team->t.t_id, child_tid));
817 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
818 }
819 }
820 }
821 }
822 }
823 // All subordinates are gathered; now release parent if not master thread
824
825 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
826 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
827 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
828 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
829 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
830 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
831 the team is valid any more - it could be deallocated by the master thread at any time. */
832 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
833 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
834 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
835 flag.release();
836 }
837 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
838 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
839 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
840 flag.set_waiter(other_threads[thr_bar->parent_tid]);
841 flag.release();
842 }
843 } else { // Master thread needs to update the team's b_arrived value
844 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
845 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
846 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
847 }
848 // Is the team access below unsafe or just technically invalid?
849 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
850 gtid, team->t.t_id, tid, bt));
851}
852
853static void
854__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
855 int propagate_icvs
856 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
857{
858    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
859    register kmp_team_t *team;
860 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
861 register kmp_uint32 nproc;
862 bool team_change = false; // indicates on-core barrier shouldn't be used
863
864 if (KMP_MASTER_TID(tid)) {
865 team = __kmp_threads[gtid]->th.th_team;
866 KMP_DEBUG_ASSERT(team != NULL);
867 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
868 gtid, team->t.t_id, tid, bt));
869 }
870 else { // Worker threads
871 // Wait for parent thread to release me
872 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
873 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
874 // Use traditional method of waiting on my own b_go flag
875 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
876 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
877 flag.wait(this_thr, TRUE
878 USE_ITT_BUILD_ARG(itt_sync_obj) );
879 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
880 }
881 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
882 // Wait on my "offset" bits on parent's b_go flag
883 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
884 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
885 bt, this_thr
886 USE_ITT_BUILD_ARG(itt_sync_obj) );
887 flag.wait(this_thr, TRUE
888 USE_ITT_BUILD_ARG(itt_sync_obj) );
889 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
890 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
891 }
892 else { // Reset my bits on parent's b_go flag
893 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
894 }
895 }
896 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
897 // Early exit for reaping threads releasing forkjoin barrier
898 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
899 return;
900 // The worker thread may now assume that the team is valid.
901 team = __kmp_threads[gtid]->th.th_team;
902 KMP_DEBUG_ASSERT(team != NULL);
903 tid = __kmp_tid_from_gtid(gtid);
904
905 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
906 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
907 KMP_MB(); // Flush all pending memory write invalidates.
908 }
909
910    int level = team->t.t_level;
911 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
912 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
913 ++level; // level was not increased in teams construct for team_of_workers
914 if( this_thr->th.th_teams_size.nteams > 1 )
915 ++level; // level was not increased in teams construct for team_of_masters
916 }
917 if (level == 1) thr_bar->use_oncore_barrier = 1;
918 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
919    nproc = this_thr->th.th_team_nproc;
920
921 // If the team size has increased, we still communicate with old leaves via oncore barrier.
922 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
923 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
924 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
925 // But if the entire team changes, we won't use oncore barrier at all
926 if (team_change) old_leaf_kids = 0;
927
928#if KMP_BARRIER_ICV_PUSH
929 if (propagate_icvs) {
930        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
931        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
932 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
933 }
934 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
935 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
936 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
937 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
938 &thr_bar->parent_bar->th_fixed_icvs);
939 // non-leaves will get ICVs piggybacked with b_go via NGO store
940 }
941 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
942 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
943 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
944 else // leaves copy parent's fixed ICVs directly to local ICV store
945 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
946 &thr_bar->parent_bar->th_fixed_icvs);
947 }
948 }
949#endif // KMP_BARRIER_ICV_PUSH
950
951 // Now, release my children
952 if (thr_bar->my_level) { // not a leaf
953 register kmp_int32 child_tid;
954 kmp_uint32 last;
955 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
956 if (KMP_MASTER_TID(tid)) { // do a flat release
957            // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
958 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
959 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
960 ngo_load(&thr_bar->th_fixed_icvs);
961 // This loops over all the threads skipping only the leaf nodes in the hierarchy
962 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
963 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
964 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
965 " go(%p): %u => %u\n",
966 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
967 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
968 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
969 // Use ngo store (if available) to both store ICVs and release child via child's b_go
970 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
971 }
972 ngo_sync();
973 }
974 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
975 // Now, release leaf children
976 if (thr_bar->leaf_kids) { // if there are any
977 // We test team_change on the off-chance that the level 1 team changed.
978 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
979 if (old_leaf_kids) { // release old leaf kids
980 thr_bar->b_go |= old_leaf_state;
981 }
982 // Release new leaf kids
983 last = tid+thr_bar->skip_per_level[1];
984 if (last > nproc) last = nproc;
985 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
986 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
987 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
988 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
989 " T#%d(%d:%d) go(%p): %u => %u\n",
990 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993 // Release child using child's b_go flag
994 kmp_flag_64 flag(&child_bar->b_go, child_thr);
995 flag.release();
996 }
997 }
998 else { // Release all children at once with leaf_state bits on my own b_go flag
999 thr_bar->b_go |= thr_bar->leaf_state;
1000 }
1001 }
1002 }
1003 else { // Blocktime is not infinite; do a simple hierarchical release
1004 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1005 last = tid+thr_bar->skip_per_level[d+1];
1006 kmp_uint32 skip = thr_bar->skip_per_level[d];
1007 if (last > nproc) last = nproc;
1008 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1009 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1010 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1011 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1012 " go(%p): %u => %u\n",
1013 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1014 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1015 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1016 // Release child using child's b_go flag
1017 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1018 flag.release();
1019 }
1020 }
1021 }
1022#if KMP_BARRIER_ICV_PUSH
1023 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1024 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1025#endif // KMP_BARRIER_ICV_PUSH
1026 }
1027 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1028 gtid, team->t.t_id, tid, bt));
1029}
1030
1031// ---------------------------- End of Barrier Algorithms ----------------------------
1032
1033// Internal function to do a barrier.
1034/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1035   If reduce is non-NULL, perform the reduction as part of the barrier (a split reduction barrier).
1036 Returns 0 if master thread, 1 if worker thread. */
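/* The gather and release phases each dispatch on the per-barrier-type pattern in
   __kmp_barrier_gather_pattern[bt] / __kmp_barrier_release_pattern[bt]: hyper, hierarchical,
   tree, or the linear default. The tree and hyper patterns require nonzero branch bits
   (a zero setting means the linear pattern should be used instead). */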
1037int
1038__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1039 void *reduce_data, void (*reduce)(void *, void *))
1040{
1041    KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
1042    register int tid = __kmp_tid_from_gtid(gtid);
1043 register kmp_info_t *this_thr = __kmp_threads[gtid];
1044 register kmp_team_t *team = this_thr->th.th_team;
1045 register int status = 0;
1046 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1047#if OMPT_SUPPORT
1048 ompt_task_id_t my_task_id;
1049 ompt_parallel_id_t my_parallel_id;
1050#endif
1051
1052 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1053 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1054
1055#if OMPT_SUPPORT
1056    if (ompt_status & ompt_status_track) {
1057#if OMPT_BLAME
1058        if (ompt_status == ompt_status_track_callback) {
1059 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1060 my_parallel_id = team->t.ompt_team_info.parallel_id;
1061
1062#if OMPT_TRACE
1063            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1064 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1065 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1066 my_parallel_id, my_task_id);
1067 }
1068 }
1069#endif
1070            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1071 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1072 my_parallel_id, my_task_id);
1073 }
1074        }
1075#endif
1076 // It is OK to report the barrier state after the barrier begin callback.
1077 // According to the OMPT specification, a compliant implementation may
1078 // even delay reporting this state until the barrier begins to wait.
1079 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1080    }
1081#endif
1082
1083    if (! team->t.t_serialized) {
1084#if USE_ITT_BUILD
1085 // This value will be used in itt notify events below.
1086 void *itt_sync_obj = NULL;
1087# if USE_ITT_NOTIFY
1088 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1089 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1090# endif
1091#endif /* USE_ITT_BUILD */
1092 if (__kmp_tasking_mode == tskm_extra_barrier) {
1093 __kmp_tasking_barrier(team, this_thr, gtid);
1094 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1095 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1096 }
1097
1098 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1099 the team struct is not guaranteed to exist. */
1100 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1101 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1102 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1103 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1104 }
1105
1106#if USE_ITT_BUILD
1107 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1108 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1109#endif /* USE_ITT_BUILD */
1110#if USE_DEBUGGER
1111 // Let the debugger know: the thread arrived to the barrier and waiting.
1112 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1113 team->t.t_bar[bt].b_master_arrived += 1;
1114 } else {
1115 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1116 } // if
1117#endif /* USE_DEBUGGER */
1118        if (reduce != NULL) {
1119 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1120 this_thr->th.th_local.reduce_data = reduce_data;
1121 }
1122 switch (__kmp_barrier_gather_pattern[bt]) {
1123 case bp_hyper_bar: {
1124 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1125 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1126 USE_ITT_BUILD_ARG(itt_sync_obj) );
1127 break;
1128 }
1129 case bp_hierarchical_bar: {
1130 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1131 USE_ITT_BUILD_ARG(itt_sync_obj));
1132 break;
1133 }
1134 case bp_tree_bar: {
1135 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1136 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1137 USE_ITT_BUILD_ARG(itt_sync_obj) );
1138 break;
1139 }
1140 default: {
1141 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1142 USE_ITT_BUILD_ARG(itt_sync_obj) );
1143 }
1144 }
1145
1146 KMP_MB();
1147
1148 if (KMP_MASTER_TID(tid)) {
1149 status = 0;
1150 if (__kmp_tasking_mode != tskm_immediate_exec) {
1151 __kmp_task_team_wait(this_thr, team
1152 USE_ITT_BUILD_ARG(itt_sync_obj) );
1153                __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1154            }
1155#if USE_DEBUGGER
1156 // Let the debugger know: All threads are arrived and starting leaving the barrier.
1157 team->t.t_bar[bt].b_team_arrived += 1;
1158#endif
1159
1160#if USE_ITT_BUILD
1161 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1162 before the final summation into the shared variable is done (final summation can be a
1163 long operation for array reductions). */
1164 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1165 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1166#endif /* USE_ITT_BUILD */
1167#if USE_ITT_BUILD && USE_ITT_NOTIFY
1168            // Barrier - report frame end (only if active_level == 1)
1169 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1170#if OMP_40_ENABLED
1171 this_thr->th.th_teams_microtask == NULL &&
1172#endif
1173 team->t.t_active_level == 1)
1174 {
1175                kmp_uint64 cur_time = __itt_get_timestamp();
1176                kmp_info_t **other_threads = team->t.t_threads;
1177                int nproc = this_thr->th.th_team_nproc;
1178 int i;
1179                switch(__kmp_forkjoin_frames_mode) {
1180 case 1:
1181 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1182 this_thr->th.th_frame_time = cur_time;
1183 break;
1184                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1185                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1186 break;
1187 case 3:
1188 if( __itt_metadata_add_ptr ) {
1189                        // Initialize with master's wait time
1190 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1191                        for (i=1; i<nproc; ++i) {
1192 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1193 }
1194 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1195 }
1196 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1197 this_thr->th.th_frame_time = cur_time;
1198 break;
1199 }
1200 }
1201#endif /* USE_ITT_BUILD */
1202 } else {
1203 status = 1;
1204#if USE_ITT_BUILD
1205 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1206 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1207#endif /* USE_ITT_BUILD */
1208 }
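         // Release phase: workers (status == 1) always release here; the master does so only for a
         // non-split barrier. For a split barrier the master's release is deferred to
         // __kmp_end_split_barrier().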
1209 if (status == 1 || ! is_split) {
1210 switch (__kmp_barrier_release_pattern[bt]) {
1211 case bp_hyper_bar: {
1212 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1213 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1214 USE_ITT_BUILD_ARG(itt_sync_obj) );
1215 break;
1216 }
1217 case bp_hierarchical_bar: {
1218 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1219 USE_ITT_BUILD_ARG(itt_sync_obj) );
1220 break;
1221 }
1222 case bp_tree_bar: {
1223 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1224 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1225 USE_ITT_BUILD_ARG(itt_sync_obj) );
1226 break;
1227 }
1228 default: {
1229 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1230 USE_ITT_BUILD_ARG(itt_sync_obj) );
1231 }
1232 }
1233 if (__kmp_tasking_mode != tskm_immediate_exec) {
1234 __kmp_task_team_sync(this_thr, team);
1235 }
1236 }
1237
1238#if USE_ITT_BUILD
1239 /* GEH: TODO: Move this under if-condition above and also include in
1240 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1241 of the threads for split barriers. */
1242 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1243 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1244#endif /* USE_ITT_BUILD */
1245 } else { // Team is serialized.
1246 status = 0;
1247 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001248#if OMP_41_ENABLED
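             // With OMP 4.1 proxy tasking, even a serialized team can still have a live task team
             // while proxy tasks are outstanding; wait for them and re-set up the task team before
             // leaving the barrier.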
1249 if ( this_thr->th.th_task_team != NULL ) {
1250 void *itt_sync_obj = NULL;
1251#if USE_ITT_NOTIFY
1252 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1253 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1254 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1255 }
1256#endif
1257
Jonathan Peytonfe9a1d72015-08-26 19:58:48 +00001258 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001259 __kmp_task_team_wait(this_thr, team
1260 USE_ITT_BUILD_ARG(itt_sync_obj));
1261 __kmp_task_team_setup(this_thr, team, 0, 0);
1262
1263#if USE_ITT_BUILD
1264 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1265 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1266#endif /* USE_ITT_BUILD */
1267 }
1268#else
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001269 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001270 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001271 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001272#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001273 }
1274 }
1275 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1276 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001277
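     // Notify any registered OMPT tool that the barrier has ended, then return the thread's
     // OMPT state to work_parallel.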
1278#if OMPT_SUPPORT
1279 if (ompt_status & ompt_status_track) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001280#if OMPT_BLAME
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001281 if ((ompt_status == ompt_status_track_callback) &&
1282 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1283 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1284 my_parallel_id, my_task_id);
1285 }
1286#endif
1287 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1288 }
1289#endif
1290
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001291 return status;
1292}
1293
1294
1295void
1296__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1297{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001298 KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001299 int tid = __kmp_tid_from_gtid(gtid);
1300 kmp_info_t *this_thr = __kmp_threads[gtid];
1301 kmp_team_t *team = this_thr->th.th_team;
1302
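     // Completes the release phase of a split barrier: the master runs the configured release
     // pattern that __kmp_barrier() skipped, freeing the workers that are still waiting.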
1303 if (!team->t.t_serialized) {
1304 if (KMP_MASTER_GTID(gtid)) {
1305 switch (__kmp_barrier_release_pattern[bt]) {
1306 case bp_hyper_bar: {
1307 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1308 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1309 USE_ITT_BUILD_ARG(NULL) );
1310 break;
1311 }
1312 case bp_hierarchical_bar: {
1313 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1314 USE_ITT_BUILD_ARG(NULL));
1315 break;
1316 }
1317 case bp_tree_bar: {
1318 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1319 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1320 USE_ITT_BUILD_ARG(NULL) );
1321 break;
1322 }
1323 default: {
1324 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1325 USE_ITT_BUILD_ARG(NULL) );
1326 }
1327 }
1328 if (__kmp_tasking_mode != tskm_immediate_exec) {
1329 __kmp_task_team_sync(this_thr, team);
1330 } // if
1331 }
1332 }
1333}
1334
1335
1336void
1337__kmp_join_barrier(int gtid)
1338{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001339 KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001340 register kmp_info_t *this_thr = __kmp_threads[gtid];
1341 register kmp_team_t *team;
1342 register kmp_uint nproc;
1343 kmp_info_t *master_thread;
1344 int tid;
1345#ifdef KMP_DEBUG
1346 int team_id;
1347#endif /* KMP_DEBUG */
1348#if USE_ITT_BUILD
1349 void *itt_sync_obj = NULL;
1350# if USE_ITT_NOTIFY
 1351    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless it is needed
1352 // Get object created at fork_barrier
1353 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1354# endif
1355#endif /* USE_ITT_BUILD */
1356 KMP_MB();
1357
1358 // Get current info
1359 team = this_thr->th.th_team;
1360 nproc = this_thr->th.th_team_nproc;
1361 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1362 tid = __kmp_tid_from_gtid(gtid);
1363#ifdef KMP_DEBUG
1364 team_id = team->t.t_id;
1365#endif /* KMP_DEBUG */
1366 master_thread = this_thr->th.th_team_master;
1367#ifdef KMP_DEBUG
1368 if (master_thread != team->t.t_threads[0]) {
1369 __kmp_print_structure();
1370 }
1371#endif /* KMP_DEBUG */
1372 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1373 KMP_MB();
1374
1375 // Verify state
1376 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1377 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1378 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1379 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1380 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1381
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001382#if OMPT_SUPPORT
1383#if OMPT_TRACE
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001384 if ((ompt_status == ompt_status_track_callback) &&
1385 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1386 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1387 team->t.ompt_team_info.parallel_id,
1388 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1389 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001390#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001391 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1392#endif
1393
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001394 if (__kmp_tasking_mode == tskm_extra_barrier) {
1395 __kmp_tasking_barrier(team, this_thr, gtid);
 1396        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1397 }
1398# ifdef KMP_DEBUG
1399 if (__kmp_tasking_mode != tskm_immediate_exec) {
1400 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001401 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001402 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001403 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001404 }
1405# endif /* KMP_DEBUG */
1406
 1407    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
 1408       team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
 1409       down the EPCC parallel benchmark by 2x. As a workaround, we do not perform the copy if
 1410       blocktime=infinite, since the values are not used by __kmp_wait_template() in that case. */
1411 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1412 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1413 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1414 }
1415
1416#if USE_ITT_BUILD
1417 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1418 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1419#endif /* USE_ITT_BUILD */
1420
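     // Gather phase of the join barrier: threads only signal arrival (the reduce callback is NULL),
     // using the configured gather pattern.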
1421 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1422 case bp_hyper_bar: {
1423 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1424 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1425 USE_ITT_BUILD_ARG(itt_sync_obj) );
1426 break;
1427 }
1428 case bp_hierarchical_bar: {
1429 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1430 USE_ITT_BUILD_ARG(itt_sync_obj) );
1431 break;
1432 }
1433 case bp_tree_bar: {
1434 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1435 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1436 USE_ITT_BUILD_ARG(itt_sync_obj) );
1437 break;
1438 }
1439 default: {
1440 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1441 USE_ITT_BUILD_ARG(itt_sync_obj) );
1442 }
1443 }
1444
1445 /* From this point on, the team data structure may be deallocated at any time by the
1446 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1447 data items that need to be referenced before the end of the barrier should be moved to
1448 the kmp_task_team_t structs. */
1449 if (KMP_MASTER_TID(tid)) {
1450 if (__kmp_tasking_mode != tskm_immediate_exec) {
1451 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1452 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1453 __kmp_task_team_wait(this_thr, team
1454 USE_ITT_BUILD_ARG(itt_sync_obj) );
1455 }
1456#if USE_ITT_BUILD
1457 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1458 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1459#endif /* USE_ITT_BUILD */
1460
1461# if USE_ITT_BUILD && USE_ITT_NOTIFY
1462 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001463 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1464#if OMP_40_ENABLED
1465 this_thr->th.th_teams_microtask == NULL &&
1466#endif
1467 team->t.t_active_level == 1)
1468 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001469 kmp_uint64 cur_time = __itt_get_timestamp();
1470 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001471 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001472 int nproc = this_thr->th.th_team_nproc;
1473 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001474 switch(__kmp_forkjoin_frames_mode) {
1475 case 1:
1476 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1477 break;
1478 case 2:
1479 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1480 break;
1481 case 3:
1482 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001483 // Initialize with master's wait time
1484 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001485 for (i=1; i<nproc; ++i) {
1486 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1487 }
1488 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1489 }
1490 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1491 this_thr->th.th_frame_time = cur_time;
1492 break;
1493 }
1494 }
1495# endif /* USE_ITT_BUILD */
1496 }
1497#if USE_ITT_BUILD
1498 else {
1499 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1500 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1501 }
1502#endif /* USE_ITT_BUILD */
1503
1504#if KMP_DEBUG
1505 if (KMP_MASTER_TID(tid)) {
1506 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1507 gtid, team_id, tid, nproc));
1508 }
1509#endif /* KMP_DEBUG */
1510
1511 // TODO now, mark worker threads as done so they may be disbanded
1512 KMP_MB(); // Flush all pending memory write invalidates.
1513 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001514
1515#if OMPT_SUPPORT
Jonathan Peyton48281512015-07-01 15:16:04 +00001516 if (ompt_status & ompt_status_track) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001517#if OMPT_TRACE
1518 if ((ompt_status == ompt_status_track_callback) &&
1519 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1520 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1521 team->t.ompt_team_info.parallel_id,
1522 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1523 }
1524#endif
1525
1526 // return to default state
1527 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1528 }
1529#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001530}
1531
1532
1533// TODO release worker threads' fork barriers as we are ready instead of all at once
1534void
1535__kmp_fork_barrier(int gtid, int tid)
1536{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001537 KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001538 kmp_info_t *this_thr = __kmp_threads[gtid];
1539 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1540#if USE_ITT_BUILD
1541 void * itt_sync_obj = NULL;
1542#endif /* USE_ITT_BUILD */
1543
1544 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1545 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1546
 1547    // The th_team pointer is only valid for the master thread here
1548 if (KMP_MASTER_TID(tid)) {
1549#if USE_ITT_BUILD && USE_ITT_NOTIFY
1550 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1551 // Create itt barrier object
1552 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1553 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1554 }
1555#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1556
1557#ifdef KMP_DEBUG
1558 register kmp_info_t **other_threads = team->t.t_threads;
1559 register int i;
1560
1561 // Verify state
1562 KMP_MB();
1563
1564 for(i=1; i<team->t.t_nproc; ++i) {
1565 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1566 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1567 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1568 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1569 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1570 & ~(KMP_BARRIER_SLEEP_STATE))
1571 == KMP_INIT_BARRIER_STATE);
1572 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1573 }
1574#endif
1575
1576 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001577 __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001578 }
1579
1580 /* The master thread may have changed its blocktime between the join barrier and the
1581 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1582 access it when the team struct is not guaranteed to exist. */
1583 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1584 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1585 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1586 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1587 }
1588 } // master
1589
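     // Release phase of the fork barrier: workers wait here until the master releases them;
     // the TRUE argument asks the release pattern to propagate ICVs to the workers.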
1590 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1591 case bp_hyper_bar: {
1592 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1593 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1594 USE_ITT_BUILD_ARG(itt_sync_obj) );
1595 break;
1596 }
1597 case bp_hierarchical_bar: {
1598 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1599 USE_ITT_BUILD_ARG(itt_sync_obj) );
1600 break;
1601 }
1602 case bp_tree_bar: {
1603 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1604 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1605 USE_ITT_BUILD_ARG(itt_sync_obj) );
1606 break;
1607 }
1608 default: {
1609 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1610 USE_ITT_BUILD_ARG(itt_sync_obj) );
1611 }
1612 }
1613
 1614    // Early exit for threads being reaped while releasing the fork/join barrier
1615 if (TCR_4(__kmp_global.g.g_done)) {
1616 if (this_thr->th.th_task_team != NULL) {
1617 if (KMP_MASTER_TID(tid)) {
1618 TCW_PTR(this_thr->th.th_task_team, NULL);
1619 }
1620 else {
1621 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1622 }
1623 }
1624
1625#if USE_ITT_BUILD && USE_ITT_NOTIFY
1626 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1627 if (!KMP_MASTER_TID(tid)) {
1628 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1629 if (itt_sync_obj)
1630 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1631 }
1632 }
1633#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1634 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1635 return;
1636 }
1637
1638 /* We can now assume that a valid team structure has been allocated by the master and
1639 propagated to all worker threads. The current thread, however, may not be part of the
1640 team, so we can't blindly assume that the team pointer is non-null. */
1641 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1642 KMP_DEBUG_ASSERT(team != NULL);
1643 tid = __kmp_tid_from_gtid(gtid);
1644
1645
1646#if KMP_BARRIER_ICV_PULL
1647 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1648 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1649 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1650 the fixed ICVs in the master's thread struct, because it is not always the case that the
1651 threads arrays have been allocated when __kmp_fork_call() is executed. */
Jonathan Peyton45be4502015-08-11 21:36:41 +00001652 {
1653 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1654 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1655 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1656 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1657 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1658 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1659 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1660 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001661 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001662#endif // KMP_BARRIER_ICV_PULL
1663
1664 if (__kmp_tasking_mode != tskm_immediate_exec) {
1665 __kmp_task_team_sync(this_thr, team);
1666 }
1667
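     // Apply any affinity adjustments now that the thread knows its team: balanced affinity is
     // recomputed when the team size has changed, and OMP 4.0 place binding moves the thread to
     // its new place if it differs from the current one.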
1668#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1669 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1670 if (proc_bind == proc_bind_intel) {
1671#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001672#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001673 // Call dynamic affinity settings
1674 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1675 __kmp_balanced_affinity(tid, team->t.t_nproc);
1676 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001677#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001678#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1679 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001680 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001681 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1682 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1683 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1684 }
1685 else {
1686 __kmp_affinity_set_place(gtid);
1687 }
1688 }
1689#endif
1690
1691#if USE_ITT_BUILD && USE_ITT_NOTIFY
1692 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1693 if (!KMP_MASTER_TID(tid)) {
1694 // Get correct barrier object
1695 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1696 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1697 } // (prepare called inside barrier_release)
1698 }
1699#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1700 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1701}
1702
1703
1704void
1705__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1706{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001707 KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001708
1709 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1710 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1711
1712 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1713 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1714 this data before this function is called. */
1715#if KMP_BARRIER_ICV_PULL
 1716    /* Copy the ICVs into th_fixed_icvs in the master's thread structure (which itself remains
 1717       untouched), where all worker threads can access them and make their own copies after the barrier. */
1718 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1719 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1720 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1721 0, team->t.t_threads[0], team));
1722#elif KMP_BARRIER_ICV_PUSH
1723 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1724 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1725 0, team->t.t_threads[0], team));
1726#else
1727 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1728 ngo_load(new_icvs);
1729 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001730 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001731 // TODO: GEH - pass in better source location info since usually NULL here
1732 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1733 f, team->t.t_threads[f], team));
1734 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1735 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1736 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1737 f, team->t.t_threads[f], team));
1738 }
1739 ngo_sync();
1740#endif // KMP_BARRIER_ICV_PULL
1741}