/*
 * kmp_barrier.cpp
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
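
/* Usage pattern for the ngo_* macros (as used by the release routines below):
   load the master's ICV cache line once, store it to each destination, then
   fence once at the end.  On non-KMP_MIC builds the macros fall back to plain
   copy_icvs()/KMP_MEMCPY and the load/sync collapse to no-ops.  Illustrative
   sketch of the loop shape used later in this file:

       ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
       for (i=1; i<nproc; ++i)
           ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                          &team->t.t_implicit_task_taskdata[0].td_icvs);
       ngo_sync();
*/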

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

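/* Each algorithm below is split into a "gather" phase and a "release" phase.
   In the gather phase every worker signals its arrival by bumping a b_arrived
   flag that some parent (or the master) is spin-waiting on, optionally folding
   its reduction data into the parent's as it goes; after that signalling write
   a worker may not touch the team structure again, since the master may
   already be deallocating it.  In the release phase the master (or each
   parent) bumps the workers' b_go flags, optionally pushing updated ICVs along
   with the wake-up. */
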
// Linear Barrier
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier
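/* In the tree gather each thread with tid t waits for its children, which are
   tids (t << branch_bits) + 1 .. (t << branch_bits) + branch_factor (bounded
   by nproc), and then signals its parent (t - 1) >> branch_bits.  For example,
   with branch_bits = 2 (branch_factor = 4), thread 1 gathers threads 5-8 and
   then reports its arrival to thread 0. */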
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
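/* In the hypercube gather a thread whose tid has a nonzero digit at the
   current level (((tid >> level) & (branch_factor-1)) != 0) signals the thread
   obtained by clearing that digit (tid & ~((1 << (level+branch_bits)) - 1))
   and stops; otherwise it collects its children at tid + (1<<level),
   tid + 2*(1<<level), ...  With branch_bits = 1 this is the classic pairwise
   exchange: odd tids report to tid-1 at level 0, tids that are 2 mod 4 report
   to tid & ~3 at level 1, and so on.  The release walks the same tree back
   down, in reverse order when KMP_REVERSE_HYPER_BAR is defined. */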
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go. */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

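/* The hierarchical barrier mirrors the machine topology recorded by
   __kmp_get_hierarchy(): skip_per_level[d] is the distance between consecutive
   subtree roots at depth d, so a non-root thread's parent at its level is the
   nearest lower tid that is a multiple of skip_per_level[my_level+1]
   (parent_tid = tid - tid % skip_per_level[my_level+1], or 0 for threads just
   below the master).  Level-0 children of a parent (its on-core "leaf kids")
   can additionally use the on-core signalling scheme described further below. */
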
// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}

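/* On-core (leaf) signalling packs one child per byte of the parent's 64-bit
   flag words: a leaf child with tid == parent_tid + 1 + i uses byte 7 - i (its
   thr_bar->offset), and the parent's leaf_state has exactly those bytes set.
   The gather below waits for b_arrived to reach old_value | leaf_state (every
   leaf byte set) and then clears the leaf bytes; the release sets the same
   bytes in the parent's own b_go, which the leaves watch via kmp_flag_oncore. */
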
static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

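/* In the release below a worker wakes up through one of two paths: the
   traditional path (spin on its own b_go; used when blocktime is finite, the
   barrier is nested, the thread is not a leaf, or its barrier data is not yet
   initialized), or the on-core path (spin on its byte of the parent's b_go via
   kmp_flag_oncore).  thr_bar->wait_flag records which path is in use
   (KMP_BARRIER_OWN_FLAG vs. KMP_BARRIER_PARENT_FLAG), and KMP_BARRIER_SWITCHING
   marks a thread that was moved from the parent's flag to its own b_go while
   waiting. */
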
static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    int level = team->t.t_level;
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
    nproc = this_thr->th.th_team_nproc;

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
   Returns 0 if master thread, 1 if worker thread. */
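/* Illustrative usage (the actual call sites live in other runtime files, e.g.
   the __kmpc_barrier entry point): a plain, non-split barrier is requested as
       __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
   a reduction barrier additionally passes reduce_size/reduce_data and the
   reduce callback so the gather phase can combine per-thread partial results. */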
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;
#endif

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
    if (ompt_status & ompt_status_track) {
#if OMPT_BLAME
        if (ompt_status == ompt_status_track_callback) {
            my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
            my_parallel_id = team->t.ompt_team_info.parallel_id;

#if OMPT_TRACE
            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
                if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
                    ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
                        my_parallel_id, my_task_id);
                }
            }
#endif
            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
                ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
                    my_parallel_id, my_task_id);
            }
        }
#endif
        // It is OK to report the barrier state after the barrier begin callback.
        // According to the OMPT specification, a compliant implementation may
        // even delay reporting this state until the barrier begins to wait.
        this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
    }
#endif

    if (! team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
        // Let the debugger know: the thread arrived to the barrier and waiting.
        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
            team->t.t_bar[bt].b_master_arrived += 1;
        } else {
            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
        } // if
#endif /* USE_DEBUGGER */
        if (reduce != NULL) {
            //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
            this_thr->th.th_local.reduce_data = reduce_data;
        }
        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
                __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
            }
#if USE_DEBUGGER
            // Let the debugger know: All threads are arrived and starting leaving the barrier.
            team->t.t_bar[bt].b_team_arrived += 1;
#endif

#if USE_ITT_BUILD
            /* TODO: In case of split reduction barrier, master thread may send acquired event early,
               before the final summation into the shared variable is done (final summation can be a
               long operation for array reductions). */
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end (only if active_level == 1)
            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
                this_thr->th.th_teams_microtask == NULL &&
#endif
                team->t.t_active_level == 1)
            {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                switch(__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if( __itt_metadata_add_ptr ) {
                        // Initialize with master's wait time
                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                        for (i=1; i<nproc; ++i) {
                            delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
        if (status == 1 || ! is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under if-condition above and also include in
           __kmp_end_split_barrier(). This will more accurately represent the actual release time
           of the threads for split barriers. */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_41_ENABLED
            if ( this_thr->th.th_task_team != NULL ) {
                void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
                }
#endif

                kmp_task_team_t * task_team;
                task_team = this_thr->th.th_task_team;
                KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj));
                __kmp_task_team_setup(this_thr, team, 0, 0);

#if USE_ITT_BUILD
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
            }
#else
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
    if (ompt_status & ompt_status_track) {
#if OMPT_BLAME
        if ((ompt_status == ompt_status_track_callback) &&
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1284 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1285 my_parallel_id, my_task_id);
1286 }
1287#endif
1288 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1289 }
1290#endif
1291
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001292 return status;
1293}
1294
1295
1296void
1297__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1298{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001299 KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001300 int tid = __kmp_tid_from_gtid(gtid);
1301 kmp_info_t *this_thr = __kmp_threads[gtid];
1302 kmp_team_t *team = this_thr->th.th_team;
1303
1304 if (!team->t.t_serialized) {
1305 if (KMP_MASTER_GTID(gtid)) {
1306 switch (__kmp_barrier_release_pattern[bt]) {
1307 case bp_hyper_bar: {
1308 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1309 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1310 USE_ITT_BUILD_ARG(NULL) );
1311 break;
1312 }
1313 case bp_hierarchical_bar: {
1314 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1315 USE_ITT_BUILD_ARG(NULL));
1316 break;
1317 }
1318 case bp_tree_bar: {
1319 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1320 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1321 USE_ITT_BUILD_ARG(NULL) );
1322 break;
1323 }
1324 default: {
1325 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1326 USE_ITT_BUILD_ARG(NULL) );
1327 }
1328 }
1329 if (__kmp_tasking_mode != tskm_immediate_exec) {
1330 __kmp_task_team_sync(this_thr, team);
1331 } // if
1332 }
1333 }
1334}
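
/* Illustrative usage sketch (an assumption about the callers, not code taken from the runtime):
   a caller that needs to do work between the gather and the release phases -- for example,
   combining partial reduction results -- can request the split form of the barrier and finish
   it explicitly. __kmp_barrier() returns 0 to the thread that must complete the barrier (the
   master, or any thread of a serialized team) and 1 to the workers, which stay parked in the
   release phase until __kmp_end_split_barrier() is called:

       if (__kmp_barrier(bs_plain_barrier, gtid, TRUE, reduce_size, reduce_data,
                         reduce_func) == 0) {
           // Gather phase is complete but the workers have not been released yet:
           // combine the partial results here, then release the team.
           __kmp_end_split_barrier(bs_plain_barrier, gtid);
       }
*/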
1335
1336
1337void
1338__kmp_join_barrier(int gtid)
1339{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001340 KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001341 register kmp_info_t *this_thr = __kmp_threads[gtid];
1342 register kmp_team_t *team;
1343 register kmp_uint nproc;
1344 kmp_info_t *master_thread;
1345 int tid;
1346#ifdef KMP_DEBUG
1347 int team_id;
1348#endif /* KMP_DEBUG */
1349#if USE_ITT_BUILD
1350 void *itt_sync_obj = NULL;
1351# if USE_ITT_NOTIFY
1352 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1353 // Get object created at fork_barrier
1354 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1355# endif
1356#endif /* USE_ITT_BUILD */
1357 KMP_MB();
1358
1359 // Get current info
1360 team = this_thr->th.th_team;
1361 nproc = this_thr->th.th_team_nproc;
1362 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1363 tid = __kmp_tid_from_gtid(gtid);
1364#ifdef KMP_DEBUG
1365 team_id = team->t.t_id;
1366#endif /* KMP_DEBUG */
1367 master_thread = this_thr->th.th_team_master;
1368#ifdef KMP_DEBUG
1369 if (master_thread != team->t.t_threads[0]) {
1370 __kmp_print_structure();
1371 }
1372#endif /* KMP_DEBUG */
1373 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1374 KMP_MB();
1375
1376 // Verify state
1377 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1378 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1379 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1380 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1381 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1382
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001383#if OMPT_SUPPORT
1384#if OMPT_TRACE
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001385 if ((ompt_status == ompt_status_track_callback) &&
1386 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1387 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1388 team->t.ompt_team_info.parallel_id,
1389 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1390 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001391#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001392 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1393#endif
1394
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001395 if (__kmp_tasking_mode == tskm_extra_barrier) {
1396 __kmp_tasking_barrier(team, this_thr, gtid);
1397 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1398 }
1399# ifdef KMP_DEBUG
1400 if (__kmp_tasking_mode != tskm_immediate_exec) {
1401 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001402 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001403 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001404 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001405 }
1406# endif /* KMP_DEBUG */
1407
1408 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1409 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1410 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1411 since the values are not used by __kmp_wait_template() in that case. */
1412 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1413 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1414 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1415 }
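    /* Note (assumption about the settings layer): KMP_MAX_BLOCKTIME is the value that a
       KMP_BLOCKTIME=infinite setting maps to, so running e.g.
           KMP_BLOCKTIME=infinite ./app
       should skip the two loads above on every join barrier, while any finite blocktime
       keeps them. */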
1416
1417#if USE_ITT_BUILD
1418 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1419 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1420#endif /* USE_ITT_BUILD */
1421
1422 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1423 case bp_hyper_bar: {
1424 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1425 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1426 USE_ITT_BUILD_ARG(itt_sync_obj) );
1427 break;
1428 }
1429 case bp_hierarchical_bar: {
1430 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1431 USE_ITT_BUILD_ARG(itt_sync_obj) );
1432 break;
1433 }
1434 case bp_tree_bar: {
1435 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1436 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1437 USE_ITT_BUILD_ARG(itt_sync_obj) );
1438 break;
1439 }
1440 default: {
1441 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1442 USE_ITT_BUILD_ARG(itt_sync_obj) );
1443 }
1444 }
1445
1446 /* From this point on, the team data structure may be deallocated at any time by the
1447 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1448 data items that need to be referenced before the end of the barrier should be moved to
1449 the kmp_task_team_t structs. */
1450 if (KMP_MASTER_TID(tid)) {
1451 if (__kmp_tasking_mode != tskm_immediate_exec) {
1452 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1453 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1454 __kmp_task_team_wait(this_thr, team
1455 USE_ITT_BUILD_ARG(itt_sync_obj) );
1456 }
1457#if USE_ITT_BUILD
1458 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1459 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1460#endif /* USE_ITT_BUILD */
1461
1462# if USE_ITT_BUILD && USE_ITT_NOTIFY
1463 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001464 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1465#if OMP_40_ENABLED
1466 this_thr->th.th_teams_microtask == NULL &&
1467#endif
1468 team->t.t_active_level == 1)
1469 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001470 kmp_uint64 cur_time = __itt_get_timestamp();
1471 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001472 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001473 int nproc = this_thr->th.th_team_nproc;
1474 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001475 switch(__kmp_forkjoin_frames_mode) {
1476 case 1:
1477 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1478 break;
1479 case 2:
1480 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1481 break;
1482 case 3:
1483 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001484 // Initialize with master's wait time
1485 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001486 for (i=1; i<nproc; ++i) {
1487 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1488 }
1489 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1490 }
1491 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1492 this_thr->th.th_frame_time = cur_time;
1493 break;
1494 }
1495 }
1496# endif /* USE_ITT_BUILD */
1497 }
1498#if USE_ITT_BUILD
1499 else {
1500 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1501 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1502 }
1503#endif /* USE_ITT_BUILD */
1504
1505#if KMP_DEBUG
1506 if (KMP_MASTER_TID(tid)) {
1507 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1508 gtid, team_id, tid, nproc));
1509 }
1510#endif /* KMP_DEBUG */
1511
1512 // TODO now, mark worker threads as done so they may be disbanded
1513 KMP_MB(); // Flush all pending memory write invalidates.
1514 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001515
1516#if OMPT_SUPPORT
Jonathan Peyton48281512015-07-01 15:16:04 +00001517 if (ompt_status & ompt_status_track) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001518#if OMPT_TRACE
1519 if ((ompt_status == ompt_status_track_callback) &&
1520 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1521 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1522 team->t.ompt_team_info.parallel_id,
1523 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1524 }
1525#endif
1526
1527 // return to default state
1528 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1529 }
1530#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001531}
1532
1533
1534// TODO release worker threads' fork barriers as we are ready instead of all at once
1535void
1536__kmp_fork_barrier(int gtid, int tid)
1537{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001538 KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001539 kmp_info_t *this_thr = __kmp_threads[gtid];
1540 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1541#if USE_ITT_BUILD
1542 void * itt_sync_obj = NULL;
1543#endif /* USE_ITT_BUILD */
1544
1545 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1546 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1547
1548 // th_team pointer only valid for master thread here
1549 if (KMP_MASTER_TID(tid)) {
1550#if USE_ITT_BUILD && USE_ITT_NOTIFY
1551 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1552 // Create itt barrier object
1553 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1554 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1555 }
1556#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1557
1558#ifdef KMP_DEBUG
1559 register kmp_info_t **other_threads = team->t.t_threads;
1560 register int i;
1561
1562 // Verify state
1563 KMP_MB();
1564
1565 for(i=1; i<team->t.t_nproc; ++i) {
1566 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1567 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1568 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1569 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1570 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1571 & ~(KMP_BARRIER_SLEEP_STATE))
1572 == KMP_INIT_BARRIER_STATE);
1573 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1574 }
1575#endif
1576
1577 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001578 __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001579 }
1580
1581 /* The master thread may have changed its blocktime between the join barrier and the
1582 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1583 access it when the team struct is not guaranteed to exist. */
1584 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1585 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1586 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1587 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1588 }
1589 } // master
1590
1591 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1592 case bp_hyper_bar: {
1593 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1594 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1595 USE_ITT_BUILD_ARG(itt_sync_obj) );
1596 break;
1597 }
1598 case bp_hierarchical_bar: {
1599 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1600 USE_ITT_BUILD_ARG(itt_sync_obj) );
1601 break;
1602 }
1603 case bp_tree_bar: {
1604 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1605 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1606 USE_ITT_BUILD_ARG(itt_sync_obj) );
1607 break;
1608 }
1609 default: {
1610 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1611 USE_ITT_BUILD_ARG(itt_sync_obj) );
1612 }
1613 }
1614
1615 // Early exit for reaping threads releasing forkjoin barrier
1616 if (TCR_4(__kmp_global.g.g_done)) {
1617 if (this_thr->th.th_task_team != NULL) {
1618 if (KMP_MASTER_TID(tid)) {
1619 TCW_PTR(this_thr->th.th_task_team, NULL);
1620 }
1621 else {
1622 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1623 }
1624 }
1625
1626#if USE_ITT_BUILD && USE_ITT_NOTIFY
1627 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1628 if (!KMP_MASTER_TID(tid)) {
1629 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1630 if (itt_sync_obj)
1631 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1632 }
1633 }
1634#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1635 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1636 return;
1637 }
1638
1639 /* We can now assume that a valid team structure has been allocated by the master and
1640 propagated to all worker threads. The current thread, however, may not be part of the
1641 team, so we can't blindly assume that the team pointer is non-null. */
1642 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1643 KMP_DEBUG_ASSERT(team != NULL);
1644 tid = __kmp_tid_from_gtid(gtid);
1645
1646
1647#if KMP_BARRIER_ICV_PULL
1648 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1649 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1650 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1651 the fixed ICVs in the master's thread struct, because it is not always the case that the
1652 threads arrays have been allocated when __kmp_fork_call() is executed. */
Jonathan Peyton45be4502015-08-11 21:36:41 +00001653 {
1654 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1655 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1656 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1657 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1658 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1659 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1660 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1661 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001662 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001663#endif // KMP_BARRIER_ICV_PULL
1664
1665 if (__kmp_tasking_mode != tskm_immediate_exec) {
1666 __kmp_task_team_sync(this_thr, team);
1667 }
1668
1669#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1670 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1671 if (proc_bind == proc_bind_intel) {
1672#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001673#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001674 // Call dynamic affinity settings
1675 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1676 __kmp_balanced_affinity(tid, team->t.t_nproc);
1677 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001678#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001679#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1680 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001681 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001682 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1683 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1684 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1685 }
1686 else {
1687 __kmp_affinity_set_place(gtid);
1688 }
1689 }
1690#endif
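    /* Note (assumed user-visible settings): the proc_bind_intel branch covers runs without an
       OpenMP 4.0 binding policy, where only KMP_AFFINITY=balanced triggers re-pinning here;
       an explicit policy such as OMP_PROC_BIND=spread instead takes the
       __kmp_affinity_set_place() path above when a thread's place has changed. */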
1691
1692#if USE_ITT_BUILD && USE_ITT_NOTIFY
1693 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1694 if (!KMP_MASTER_TID(tid)) {
1695 // Get correct barrier object
1696 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1697 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1698 } // (prepare called inside barrier_release)
1699 }
1700#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1701 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1702}
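
/* Big-picture sketch (user-code view, an illustration rather than runtime code): the fork and
   join barriers bracket every active parallel region. For a fragment such as

       #pragma omp parallel
       {
           work();   // workers start here once released from __kmp_fork_barrier()
       }             // all threads meet again in __kmp_join_barrier() at this brace

   the master returns from __kmp_join_barrier() knowing every worker has finished work(),
   while the workers loop back to wait in their next fork barrier. */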
1703
1704
1705void
1706__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1707{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001708 KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001709
1710 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1711 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1712
1713 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1714 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1715 this data before this function is called. */
1716#if KMP_BARRIER_ICV_PULL
1717 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1718 all of the worker threads can access them and make their own copies after the barrier. */
1719 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1720 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1721 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1722 0, team->t.t_threads[0], team));
1723#elif KMP_BARRIER_ICV_PUSH
1724 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1725 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1726 0, team->t.t_threads[0], team));
1727#else
1728 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1729 ngo_load(new_icvs);
1730 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001731 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001732 // TODO: GEH - pass in better source location info since usually NULL here
1733 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1734 f, team->t.t_threads[f], team));
1735 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1736 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1737 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1738 f, team->t.t_threads[f], team));
1739 }
1740 ngo_sync();
1741#endif // KMP_BARRIER_ICV_PULL
1742}
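
/* Note (assumption; copy_icvs() is defined elsewhere in the runtime): the copy performed above
   is expected to be a plain structure copy of the internal control variables, conceptually

       static inline void copy_icvs(kmp_internal_control_t *dst, kmp_internal_control_t *src) {
           *dst = *src;   // nproc, dynamic, blocktime, etc. travel together
       }

   so the PULL, PUSH, and linear strategies differ only in who performs this copy and when,
   not in what is copied. */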