1/*
2 * kmp_barrier.cpp
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_wait_release.h"
18#include "kmp_stats.h"
19#include "kmp_itt.h"
20
21#if KMP_MIC
22#include <immintrin.h>
23#define USE_NGO_STORES 1
24#endif // KMP_MIC
25
26#if KMP_MIC && USE_NGO_STORES
27// ICV copying
28#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
29#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
32#else
33#define ngo_load(src) ((void)0)
34#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
35#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
36#define ngo_sync() ((void)0)
37#endif /* KMP_MIC && USE_NGO_STORES */
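// Note on the ngo_* helpers above: on KMP_MIC builds they copy a whole 64-byte
// cache line (the ICVs, or the ICVs plus the b_go flag) with a single
// _mm512_storenrngo_pd streaming store, and ngo_sync() then publishes those
// non-globally-ordered stores via a locked add that serves as a store fence.
// On every other target they degrade to plain copy_icvs()/KMP_MEMCPY and a
// no-op sync.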
38
39void __kmp_print_structure(void); // Forward declaration
40
41// ---------------------------- Barrier Algorithms ----------------------------
42
43// Linear Barrier
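/* In the linear (flat) scheme every worker talks only to the master: each worker
   bumps its own b_arrived flag and the master spins on all nproc-1 of those flags
   in turn (gather), then the master bumps each worker's b_go flag (release).
   This costs O(nproc) work on the master but needs no tree bookkeeping, and it is
   the fallback pattern used when the branch bits for a fancier barrier are 0. */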
44static void
45__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46 void (*reduce)(void *, void *)
47 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48{
49 KMP_TIME_BLOCK(KMP_linear_gather);
50 register kmp_team_t *team = this_thr->th.th_team;
51 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52 register kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55 gtid, team->t.t_id, tid, bt));
56 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57
58#if USE_ITT_BUILD && USE_ITT_NOTIFY
59 // Barrier imbalance - save arrive time to the thread
60    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
61        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62 }
63#endif
64 // We now perform a linear reduction to signal that all of the threads have arrived.
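    // If a reduction callback was supplied, the master folds each worker's
    // reduce_data into its own immediately after observing that worker's
    // arrival, so the reduction piggybacks on the gather loop below.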
65 if (!KMP_MASTER_TID(tid)) {
66        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
67 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
68 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70 // Mark arrival to master thread
71 /* After performing this write, a worker thread may not assume that the team is valid
72 any more - it could be deallocated by the master thread at any time. */
73 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74 flag.release();
75 } else {
76 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77 register int nproc = this_thr->th.th_team_nproc;
78 register int i;
79 // Don't have to worry about sleep bit here or atomic since team setting
80 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81
82 // Collect all the worker team member threads.
83 for (i=1; i<nproc; ++i) {
84#if KMP_CACHE_MANAGE
85 // Prefetch next thread's arrived count
86 if (i+1 < nproc)
87 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88#endif /* KMP_CACHE_MANAGE */
89 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
91 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93
94 // Wait for worker thread to arrive
95 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96 flag.wait(this_thr, FALSE
97 USE_ITT_BUILD_ARG(itt_sync_obj) );
98#if USE_ITT_BUILD && USE_ITT_NOTIFY
99 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
100            if (__kmp_forkjoin_frames_mode == 2) {
101                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102 other_threads[i]->th.th_bar_min_time);
103 }
104#endif
105 if (reduce) {
106 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108 (*reduce)(this_thr->th.th_local.reduce_data,
109 other_threads[i]->th.th_local.reduce_data);
110 }
111 }
112 // Don't have to worry about sleep bit here or atomic since team setting
113 team_bar->b_arrived = new_state;
114 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
115 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116 }
117 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118 gtid, team->t.t_id, tid, bt));
119}
120
121static void
122__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123 int propagate_icvs
124 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125{
126 KMP_TIME_BLOCK(KMP_linear_release);
127 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128 register kmp_team_t *team;
129
130 if (KMP_MASTER_TID(tid)) {
131 register unsigned int i;
132 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133 register kmp_info_t **other_threads;
134
135 team = __kmp_threads[gtid]->th.th_team;
136 KMP_DEBUG_ASSERT(team != NULL);
137 other_threads = team->t.t_threads;
138
139 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140 gtid, team->t.t_id, tid, bt));
141
142 if (nproc > 1) {
143#if KMP_BARRIER_ICV_PUSH
144 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
145 if (propagate_icvs) {
146 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
147 for (i=1; i<nproc; ++i) {
148 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
149 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
150 &team->t.t_implicit_task_taskdata[0].td_icvs);
151 }
152 ngo_sync();
153 }
154 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
155#endif // KMP_BARRIER_ICV_PUSH
156
157 // Now, release all of the worker threads
158 for (i=1; i<nproc; ++i) {
159#if KMP_CACHE_MANAGE
160 // Prefetch next thread's go flag
161 if (i+1 < nproc)
162 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
163#endif /* KMP_CACHE_MANAGE */
164 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
165 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
166 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
167 &other_threads[i]->th.th_bar[bt].bb.b_go,
168 other_threads[i]->th.th_bar[bt].bb.b_go,
169 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
170 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
171 flag.release();
172 }
173 }
174 } else { // Wait for the MASTER thread to release us
175 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
176 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
177 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
178 flag.wait(this_thr, TRUE
179 USE_ITT_BUILD_ARG(itt_sync_obj) );
180#if USE_ITT_BUILD && USE_ITT_NOTIFY
181 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
182 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
183 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
184 // Cancel wait on previous parallel region...
185 __kmp_itt_task_starting(itt_sync_obj);
186
187 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
188 return;
189
190 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
191 if (itt_sync_obj != NULL)
192 // Call prepare as early as possible for "new" barrier
193 __kmp_itt_task_finished(itt_sync_obj);
194 } else
195#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
196 // Early exit for reaping threads releasing forkjoin barrier
197 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
198 return;
199 // The worker thread may now assume that the team is valid.
200#ifdef KMP_DEBUG
201 tid = __kmp_tid_from_gtid(gtid);
202 team = __kmp_threads[gtid]->th.th_team;
203#endif
204 KMP_DEBUG_ASSERT(team != NULL);
205 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
206 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
207 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
208 KMP_MB(); // Flush all pending memory write invalidates.
209 }
210 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
211 gtid, team->t.t_id, tid, bt));
212}
213
214// Tree barrier
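/* Tree indexing: with branch_bits b (branch_factor = 1<<b), the children of tid
   are tid*branch_factor+1 .. tid*branch_factor+branch_factor and the parent of a
   worker is (tid-1)>>b. For example, with b==2 the children of tid 1 are 5,6,7,8
   and the parent of tid 6 is (6-1)>>2 == 1. The gather walks up this tree toward
   tid 0; the release walks back down it. */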
215static void
216__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
217 void (*reduce)(void *, void *)
218 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
219{
220 KMP_TIME_BLOCK(KMP_tree_gather);
221 register kmp_team_t *team = this_thr->th.th_team;
222 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
223 register kmp_info_t **other_threads = team->t.t_threads;
224 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
225 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
226 register kmp_uint32 branch_factor = 1 << branch_bits;
227 register kmp_uint32 child;
228 register kmp_uint32 child_tid;
229 register kmp_uint new_state;
230
231 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
232 gtid, team->t.t_id, tid, bt));
233 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
234
235#if USE_ITT_BUILD && USE_ITT_NOTIFY
236 // Barrier imbalance - save arrive time to the thread
237    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
238        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
239 }
240#endif
241 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
242 child_tid = (tid << branch_bits) + 1;
243 if (child_tid < nproc) {
244 // Parent threads wait for all their children to arrive
245 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
246 child = 1;
247 do {
248 register kmp_info_t *child_thr = other_threads[child_tid];
249 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
250#if KMP_CACHE_MANAGE
251 // Prefetch next thread's arrived count
252 if (child+1 <= branch_factor && child_tid+1 < nproc)
253 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
254#endif /* KMP_CACHE_MANAGE */
255 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
256 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
257 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
258 &child_bar->b_arrived, new_state));
259 // Wait for child to arrive
260 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
261 flag.wait(this_thr, FALSE
262 USE_ITT_BUILD_ARG(itt_sync_obj) );
263#if USE_ITT_BUILD && USE_ITT_NOTIFY
264 // Barrier imbalance - write min of the thread time and a child time to the thread.
265            if (__kmp_forkjoin_frames_mode == 2) {
266                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
267 child_thr->th.th_bar_min_time);
268 }
269#endif
270 if (reduce) {
271 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
272 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
273 team->t.t_id, child_tid));
274 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
275 }
276 child++;
277 child_tid++;
278 }
279 while (child <= branch_factor && child_tid < nproc);
280 }
281
282 if (!KMP_MASTER_TID(tid)) { // Worker threads
283 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
284
285 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
286 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
287 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
288 &thr_bar->b_arrived, thr_bar->b_arrived,
289 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
290
291 // Mark arrival to parent thread
292 /* After performing this write, a worker thread may not assume that the team is valid
293 any more - it could be deallocated by the master thread at any time. */
294 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
295 flag.release();
296 } else {
297 // Need to update the team arrived pointer if we are the master thread
298 if (nproc > 1) // New value was already computed above
299 team->t.t_bar[bt].b_arrived = new_state;
300 else
301 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
302 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
303 gtid, team->t.t_id, tid, team->t.t_id,
304 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
305 }
306 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
307 gtid, team->t.t_id, tid, bt));
308}
309
310static void
311__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
312 int propagate_icvs
313 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
314{
315 KMP_TIME_BLOCK(KMP_tree_release);
316 register kmp_team_t *team;
317 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
318 register kmp_uint32 nproc;
319 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
320 register kmp_uint32 branch_factor = 1 << branch_bits;
321 register kmp_uint32 child;
322 register kmp_uint32 child_tid;
323
324 // Perform a tree release for all of the threads that have been gathered
325 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
326 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
327 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
328 // Wait for parent thread to release us
329 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
330 flag.wait(this_thr, TRUE
331 USE_ITT_BUILD_ARG(itt_sync_obj) );
332#if USE_ITT_BUILD && USE_ITT_NOTIFY
333 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
334 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
335 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
336 // Cancel wait on previous parallel region...
337 __kmp_itt_task_starting(itt_sync_obj);
338
339 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
340 return;
341
342 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
343 if (itt_sync_obj != NULL)
344 // Call prepare as early as possible for "new" barrier
345 __kmp_itt_task_finished(itt_sync_obj);
346 } else
347#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
348 // Early exit for reaping threads releasing forkjoin barrier
349 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
350 return;
351
352 // The worker thread may now assume that the team is valid.
353 team = __kmp_threads[gtid]->th.th_team;
354 KMP_DEBUG_ASSERT(team != NULL);
355 tid = __kmp_tid_from_gtid(gtid);
356
357 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
358 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
359 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
360 KMP_MB(); // Flush all pending memory write invalidates.
361 } else {
362 team = __kmp_threads[gtid]->th.th_team;
363 KMP_DEBUG_ASSERT(team != NULL);
364 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
365 gtid, team->t.t_id, tid, bt));
366 }
367 nproc = this_thr->th.th_team_nproc;
368 child_tid = (tid << branch_bits) + 1;
369
370 if (child_tid < nproc) {
371 register kmp_info_t **other_threads = team->t.t_threads;
372 child = 1;
373 // Parent threads release all their children
374 do {
375 register kmp_info_t *child_thr = other_threads[child_tid];
376 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
377#if KMP_CACHE_MANAGE
378 // Prefetch next thread's go count
379 if (child+1 <= branch_factor && child_tid+1 < nproc)
380 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
381#endif /* KMP_CACHE_MANAGE */
382
383#if KMP_BARRIER_ICV_PUSH
384 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
385 if (propagate_icvs) {
386 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
387 team, child_tid, FALSE);
388 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
389 &team->t.t_implicit_task_taskdata[0].td_icvs);
390 }
391 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
392#endif // KMP_BARRIER_ICV_PUSH
393            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
394 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
395 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
396 child_tid, &child_bar->b_go, child_bar->b_go,
397 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
398 // Release child from barrier
399 kmp_flag_64 flag(&child_bar->b_go, child_thr);
400 flag.release();
401 child++;
402 child_tid++;
403 }
404 while (child <= branch_factor && child_tid < nproc);
405 }
406 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
407 gtid, team->t.t_id, tid, bt));
408}
409
410
411// Hyper Barrier
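/* Hypercube-embedded tree: at level L, a thread whose digit ((tid>>L) & (branch_factor-1))
   is non-zero signals the parent tid & ~((1<<(L+branch_bits))-1) and drops out, while the
   remaining threads each gather up to branch_factor-1 children spaced 1<<L apart and move
   up a level. For example, with branch_bits==1 the level-0 pairs are (0,1),(2,3),... and
   the level-1 pairs are (0,2),(4,6),..., so the master completes the gather in roughly
   log2(nproc) rounds. */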
412static void
413__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
414 void (*reduce)(void *, void *)
415 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
416{
417 KMP_TIME_BLOCK(KMP_hyper_gather);
418 register kmp_team_t *team = this_thr->th.th_team;
419 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
420 register kmp_info_t **other_threads = team->t.t_threads;
421 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
422 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
423 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
424 register kmp_uint32 branch_factor = 1 << branch_bits;
425 register kmp_uint32 offset;
426 register kmp_uint32 level;
427
428 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
429 gtid, team->t.t_id, tid, bt));
430
431 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
432
433#if USE_ITT_BUILD && USE_ITT_NOTIFY
434 // Barrier imbalance - save arrive time to the thread
435    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
436        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
437 }
438#endif
439 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
440 arrived, and reduce any required data as we go. */
441 kmp_flag_64 p_flag(&thr_bar->b_arrived);
442 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
443 {
444 register kmp_uint32 child;
445 register kmp_uint32 child_tid;
446
447 if (((tid >> level) & (branch_factor - 1)) != 0) {
448 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
449
450 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
451 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
452 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
453 &thr_bar->b_arrived, thr_bar->b_arrived,
454 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
455 // Mark arrival to parent thread
456 /* After performing this write (in the last iteration of the enclosing for loop),
457 a worker thread may not assume that the team is valid any more - it could be
458 deallocated by the master thread at any time. */
459 p_flag.set_waiter(other_threads[parent_tid]);
460 p_flag.release();
461 break;
462 }
463
464 // Parent threads wait for children to arrive
465 if (new_state == KMP_BARRIER_UNUSED_STATE)
466 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
467 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
468 child++, child_tid+=(1 << level))
469 {
470 register kmp_info_t *child_thr = other_threads[child_tid];
471 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472#if KMP_CACHE_MANAGE
473 register kmp_uint32 next_child_tid = child_tid + (1 << level);
474 // Prefetch next thread's arrived count
475 if (child+1 < branch_factor && next_child_tid < num_threads)
476 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
477#endif /* KMP_CACHE_MANAGE */
478 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
479 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
480 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
481 &child_bar->b_arrived, new_state));
482 // Wait for child to arrive
483 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
484 c_flag.wait(this_thr, FALSE
485 USE_ITT_BUILD_ARG(itt_sync_obj) );
486#if USE_ITT_BUILD && USE_ITT_NOTIFY
487 // Barrier imbalance - write min of the thread time and a child time to the thread.
488            if (__kmp_forkjoin_frames_mode == 2) {
489                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
490 child_thr->th.th_bar_min_time);
491 }
492#endif
493 if (reduce) {
494 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
495 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
496 team->t.t_id, child_tid));
497 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
498 }
499 }
500 }
501
502 if (KMP_MASTER_TID(tid)) {
503 // Need to update the team arrived pointer if we are the master thread
504 if (new_state == KMP_BARRIER_UNUSED_STATE)
505 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
506 else
507 team->t.t_bar[bt].b_arrived = new_state;
508 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
509 gtid, team->t.t_id, tid, team->t.t_id,
510 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
511 }
512 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
513 gtid, team->t.t_id, tid, bt));
514}
515
516// The reverse versions seem to beat the forward versions overall
517#define KMP_REVERSE_HYPER_BAR
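// With KMP_REVERSE_HYPER_BAR defined, the release walks the hypercube levels from the
// top down and the children within a level from highest tid to lowest, i.e. the mirror
// image of the gather order above.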
518static void
519__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
520 int propagate_icvs
521 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
522{
523 KMP_TIME_BLOCK(KMP_hyper_release);
524 register kmp_team_t *team;
525 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
526 register kmp_info_t **other_threads;
527 register kmp_uint32 num_threads;
528 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
529 register kmp_uint32 branch_factor = 1 << branch_bits;
530 register kmp_uint32 child;
531 register kmp_uint32 child_tid;
532 register kmp_uint32 offset;
533 register kmp_uint32 level;
534
535 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
536 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
537 order of the corresponding gather, otherwise threads are released in the same order. */
538 if (KMP_MASTER_TID(tid)) { // master
539 team = __kmp_threads[gtid]->th.th_team;
540 KMP_DEBUG_ASSERT(team != NULL);
541 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
542 gtid, team->t.t_id, tid, bt));
543#if KMP_BARRIER_ICV_PUSH
544 if (propagate_icvs) { // master already has ICVs in final destination; copy
545 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
546 }
547#endif
548 }
549 else { // Handle fork barrier workers who aren't part of a team yet
550 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
551 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
552 // Wait for parent thread to release us
553 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
554 flag.wait(this_thr, TRUE
555 USE_ITT_BUILD_ARG(itt_sync_obj) );
556#if USE_ITT_BUILD && USE_ITT_NOTIFY
557 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
558 // In fork barrier where we could not get the object reliably
559 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
560 // Cancel wait on previous parallel region...
561 __kmp_itt_task_starting(itt_sync_obj);
562
563 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
564 return;
565
566 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
567 if (itt_sync_obj != NULL)
568 // Call prepare as early as possible for "new" barrier
569 __kmp_itt_task_finished(itt_sync_obj);
570 } else
571#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
572 // Early exit for reaping threads releasing forkjoin barrier
573 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
574 return;
575
576 // The worker thread may now assume that the team is valid.
577 team = __kmp_threads[gtid]->th.th_team;
578 KMP_DEBUG_ASSERT(team != NULL);
579 tid = __kmp_tid_from_gtid(gtid);
580
581 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
582 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
583 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
584 KMP_MB(); // Flush all pending memory write invalidates.
585 }
586 num_threads = this_thr->th.th_team_nproc;
587 other_threads = team->t.t_threads;
588
589#ifdef KMP_REVERSE_HYPER_BAR
590 // Count up to correct level for parent
591 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
592 level+=branch_bits, offset<<=branch_bits);
593
594 // Now go down from there
595 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
596 level-=branch_bits, offset>>=branch_bits)
597#else
598 // Go down the tree, level by level
599 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
600#endif // KMP_REVERSE_HYPER_BAR
601 {
602#ifdef KMP_REVERSE_HYPER_BAR
603 /* Now go in reverse order through the children, highest to lowest.
604 Initial setting of child is conservative here. */
605 child = num_threads >> ((level==0)?level:level-1);
606 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
607 child>=1; child--, child_tid-=(1<<level))
608#else
609 if (((tid >> level) & (branch_factor - 1)) != 0)
610            // No need to go lower than this, since this is the level at which the parent would be notified
611 break;
612 // Iterate through children on this level of the tree
613 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
614 child++, child_tid+=(1<<level))
615#endif // KMP_REVERSE_HYPER_BAR
616 {
617 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
618 else {
619 register kmp_info_t *child_thr = other_threads[child_tid];
620 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
621#if KMP_CACHE_MANAGE
622 register kmp_uint32 next_child_tid = child_tid - (1 << level);
623 // Prefetch next thread's go count
624# ifdef KMP_REVERSE_HYPER_BAR
625 if (child-1 >= 1 && next_child_tid < num_threads)
626# else
627 if (child+1 < branch_factor && next_child_tid < num_threads)
628# endif // KMP_REVERSE_HYPER_BAR
629 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
630#endif /* KMP_CACHE_MANAGE */
631
632#if KMP_BARRIER_ICV_PUSH
633 if (propagate_icvs) // push my fixed ICVs to my child
634 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
635#endif // KMP_BARRIER_ICV_PUSH
636
637                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
638 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
639 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
640 child_tid, &child_bar->b_go, child_bar->b_go,
641 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
642 // Release child from barrier
643 kmp_flag_64 flag(&child_bar->b_go, child_thr);
644 flag.release();
645 }
646 }
647 }
648#if KMP_BARRIER_ICV_PUSH
649 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
650 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
651 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
652 }
653#endif
654 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
655 gtid, team->t.t_id, tid, bt));
656}
657
658// Hierarchical Barrier
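/* The hierarchical barrier shapes the team after the machine hierarchy (see
   __kmp_get_hierarchy): each thread caches its level, its parent's tid, and the number
   of leaf children directly below it. When blocktime is infinite and the barrier is not
   nested, leaf threads check in and are released through single bytes ("offset" flags)
   of their parent's 64-bit b_arrived/b_go words, so a whole group of on-core children
   can be gathered or released with one 64-bit flag operation. */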
659
660// Initialize thread barrier data
661/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
662 minimum amount of initialization required based on how the team has changed. Returns true if
663 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
664 team size increases, threads already in the team will respond to on-core wakeup on their parent
665   thread, but threads newly added to the team will only be listening on their local b_go. */
666static bool
667__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
668 int gtid, int tid, kmp_team_t *team)
669{
670 // Checks to determine if (re-)initialization is needed
671 bool uninitialized = thr_bar->team == NULL;
672 bool team_changed = team != thr_bar->team;
673 bool team_sz_changed = nproc != thr_bar->nproc;
674 bool tid_changed = tid != thr_bar->old_tid;
675 bool retval = false;
676
677 if (uninitialized || team_sz_changed) {
678 __kmp_get_hierarchy(nproc, thr_bar);
679 }
680
681 if (uninitialized || team_sz_changed || tid_changed) {
682 thr_bar->my_level = thr_bar->depth-1; // default for master
683 thr_bar->parent_tid = -1; // default for master
684 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
685 kmp_uint32 d=0;
686 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
687 kmp_uint32 rem;
688 if (d == thr_bar->depth-2) { // reached level right below the master
689 thr_bar->parent_tid = 0;
690 thr_bar->my_level = d;
691 break;
692 }
693 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
694 // thread is not a subtree root at next level, so this is max
695 thr_bar->parent_tid = tid - rem;
696 thr_bar->my_level = d;
697 break;
698 }
699 ++d;
700 }
701 }
702 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
703 thr_bar->old_tid = tid;
704 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
705 }
706 if (uninitialized || team_changed || tid_changed) {
707 thr_bar->team = team;
708 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
709 retval = true;
710 }
711 if (uninitialized || team_sz_changed || tid_changed) {
712 thr_bar->nproc = nproc;
713 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
714 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
715 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
716 thr_bar->leaf_kids = nproc - tid - 1;
717 thr_bar->leaf_state = 0;
718 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
719 }
720 return retval;
721}
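/* Worked example of the leaf bookkeeping above, assuming a hierarchy in which tid 4 has
   three leaf children, tids 5-7: the children get offset 7-(5-4-1)==7, 7-(6-4-1)==6 and
   7-(7-4-1)==5, and tid 4's leaf_state has bytes 7, 6 and 5 set, so "all of my leaves
   have arrived" reduces to one 64-bit comparison against b_arrived combined with
   leaf_state. */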
722
723static void
724__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
725 int gtid, int tid, void (*reduce) (void *, void *)
726 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
727{
728 KMP_TIME_BLOCK(KMP_hier_gather);
729 register kmp_team_t *team = this_thr->th.th_team;
730 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
731 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
732 register kmp_info_t **other_threads = team->t.t_threads;
733 register kmp_uint64 new_state;
734
735    int level = team->t.t_level;
736 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
737 if (this_thr->th.th_teams_size.nteams > 1)
738 ++level; // level was not increased in teams construct for team_of_masters
739 if (level == 1) thr_bar->use_oncore_barrier = 1;
740    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
741
742 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
743 gtid, team->t.t_id, tid, bt));
744 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
745
746#if USE_ITT_BUILD && USE_ITT_NOTIFY
747 // Barrier imbalance - save arrive time to the thread
748 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
749 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
750 }
751#endif
752
753    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
754
755 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
756 register kmp_int32 child_tid;
757 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
758 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
759 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
760 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
761 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
762 flag.wait(this_thr, FALSE
763 USE_ITT_BUILD_ARG(itt_sync_obj) );
764 if (reduce) {
765 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
766 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
767 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
768 team->t.t_id, child_tid));
769 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
770 }
771 }
772 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
773 }
774 // Next, wait for higher level children on each child's b_arrived flag
775 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
776 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
777 if (last > nproc) last = nproc;
778 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
779 register kmp_info_t *child_thr = other_threads[child_tid];
780 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
781 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
782 "arrived(%p) == %u\n",
783 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
784 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
785 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
786 flag.wait(this_thr, FALSE
787 USE_ITT_BUILD_ARG(itt_sync_obj) );
788 if (reduce) {
789 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
790 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
791 team->t.t_id, child_tid));
792 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
793 }
794 }
795 }
796 }
797 else { // Blocktime is not infinite
798 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
799 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
800 if (last > nproc) last = nproc;
801 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
802 register kmp_info_t *child_thr = other_threads[child_tid];
803 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
804 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
805 "arrived(%p) == %u\n",
806 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
807 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
808 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
809 flag.wait(this_thr, FALSE
810 USE_ITT_BUILD_ARG(itt_sync_obj) );
811 if (reduce) {
812 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
813 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
814 team->t.t_id, child_tid));
815 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
816 }
817 }
818 }
819 }
820 }
821 // All subordinates are gathered; now release parent if not master thread
822
823 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
824 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
825 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
826 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
827 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
828 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
829 the team is valid any more - it could be deallocated by the master thread at any time. */
830 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
831 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
832 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
833 flag.release();
834 }
835 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
836 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
837 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
838 flag.set_waiter(other_threads[thr_bar->parent_tid]);
839 flag.release();
840 }
841 } else { // Master thread needs to update the team's b_arrived value
842 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
843 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
844 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
845 }
846 // Is the team access below unsafe or just technically invalid?
847 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
848 gtid, team->t.t_id, tid, bt));
849}
850
851static void
852__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
853 int propagate_icvs
854 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
855{
856 KMP_TIME_BLOCK(KMP_hier_release);
857 register kmp_team_t *team;
858 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
859 register kmp_uint32 nproc;
860 bool team_change = false; // indicates on-core barrier shouldn't be used
861
862 if (KMP_MASTER_TID(tid)) {
863 team = __kmp_threads[gtid]->th.th_team;
864 KMP_DEBUG_ASSERT(team != NULL);
865 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
866 gtid, team->t.t_id, tid, bt));
867 }
868 else { // Worker threads
869 // Wait for parent thread to release me
870 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
871 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
872 // Use traditional method of waiting on my own b_go flag
873 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
874 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
875 flag.wait(this_thr, TRUE
876 USE_ITT_BUILD_ARG(itt_sync_obj) );
877 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
878 }
879 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
880 // Wait on my "offset" bits on parent's b_go flag
881 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
882 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
883 bt, this_thr
884 USE_ITT_BUILD_ARG(itt_sync_obj) );
885 flag.wait(this_thr, TRUE
886 USE_ITT_BUILD_ARG(itt_sync_obj) );
887 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
888 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
889 }
890 else { // Reset my bits on parent's b_go flag
891 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
892 }
893 }
894 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
895 // Early exit for reaping threads releasing forkjoin barrier
896 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
897 return;
898 // The worker thread may now assume that the team is valid.
899 team = __kmp_threads[gtid]->th.th_team;
900 KMP_DEBUG_ASSERT(team != NULL);
901 tid = __kmp_tid_from_gtid(gtid);
902
903 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
904 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
905 KMP_MB(); // Flush all pending memory write invalidates.
906 }
907
908    int level = team->t.t_level;
909 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
910 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
911 ++level; // level was not increased in teams construct for team_of_workers
912 if( this_thr->th.th_teams_size.nteams > 1 )
913 ++level; // level was not increased in teams construct for team_of_masters
914 }
915 if (level == 1) thr_bar->use_oncore_barrier = 1;
916 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
917    nproc = this_thr->th.th_team_nproc;
918
919 // If the team size has increased, we still communicate with old leaves via oncore barrier.
920 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
921 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
922 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
923 // But if the entire team changes, we won't use oncore barrier at all
924 if (team_change) old_leaf_kids = 0;
925
926#if KMP_BARRIER_ICV_PUSH
927 if (propagate_icvs) {
928 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
929 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
930 }
931 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
932 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
933 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
934 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
935 &thr_bar->parent_bar->th_fixed_icvs);
936 // non-leaves will get ICVs piggybacked with b_go via NGO store
937 }
938 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
939 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
940 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
941 else // leaves copy parent's fixed ICVs directly to local ICV store
942 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
943 &thr_bar->parent_bar->th_fixed_icvs);
944 }
945 }
946#endif // KMP_BARRIER_ICV_PUSH
947
948 // Now, release my children
949 if (thr_bar->my_level) { // not a leaf
950 register kmp_int32 child_tid;
951 kmp_uint32 last;
952 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
953 if (KMP_MASTER_TID(tid)) { // do a flat release
954                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
955 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
956 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
957 ngo_load(&thr_bar->th_fixed_icvs);
958 // This loops over all the threads skipping only the leaf nodes in the hierarchy
959 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
960 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
961 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
962 " go(%p): %u => %u\n",
963 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
964 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
965 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
966 // Use ngo store (if available) to both store ICVs and release child via child's b_go
967 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
968 }
969 ngo_sync();
970 }
971 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
972 // Now, release leaf children
973 if (thr_bar->leaf_kids) { // if there are any
974 // We test team_change on the off-chance that the level 1 team changed.
975 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
976 if (old_leaf_kids) { // release old leaf kids
977 thr_bar->b_go |= old_leaf_state;
978 }
979 // Release new leaf kids
980 last = tid+thr_bar->skip_per_level[1];
981 if (last > nproc) last = nproc;
982 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
983 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
984 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
986 " T#%d(%d:%d) go(%p): %u => %u\n",
987 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
988 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
989 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
990 // Release child using child's b_go flag
991 kmp_flag_64 flag(&child_bar->b_go, child_thr);
992 flag.release();
993 }
994 }
995 else { // Release all children at once with leaf_state bits on my own b_go flag
996 thr_bar->b_go |= thr_bar->leaf_state;
997 }
998 }
999 }
1000 else { // Blocktime is not infinite; do a simple hierarchical release
1001 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1002 last = tid+thr_bar->skip_per_level[d+1];
1003 kmp_uint32 skip = thr_bar->skip_per_level[d];
1004 if (last > nproc) last = nproc;
1005 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1006 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1007 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1008 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1009 " go(%p): %u => %u\n",
1010 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1011 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1012 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1013 // Release child using child's b_go flag
1014 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1015 flag.release();
1016 }
1017 }
1018 }
1019#if KMP_BARRIER_ICV_PUSH
1020 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1021 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1022#endif // KMP_BARRIER_ICV_PUSH
1023 }
1024 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1025 gtid, team->t.t_id, tid, bt));
1026}
1027
1028// ---------------------------- End of Barrier Algorithms ----------------------------
1029
1030// Internal function to do a barrier.
1031/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1032 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1033 Returns 0 if master thread, 1 if worker thread. */
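/* Overall flow when the team is not serialized: every thread runs the gather phase picked
   by __kmp_barrier_gather_pattern[bt]; the master then waits for/sets up the task team and
   reports ITT frames; finally the release phase picked by __kmp_barrier_release_pattern[bt]
   runs. For a split barrier the master skips the release fan-out here and performs it later
   from __kmp_end_split_barrier(), while the workers drop straight into their release wait. */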
1034int
1035__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1036 void *reduce_data, void (*reduce)(void *, void *))
1037{
1038 KMP_TIME_BLOCK(KMP_barrier);
1039 register int tid = __kmp_tid_from_gtid(gtid);
1040 register kmp_info_t *this_thr = __kmp_threads[gtid];
1041 register kmp_team_t *team = this_thr->th.th_team;
1042 register int status = 0;
1043 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1044#if OMPT_SUPPORT
1045 ompt_task_id_t my_task_id;
1046 ompt_parallel_id_t my_parallel_id;
1047#endif
1048
1049 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1050 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1051
1052#if OMPT_SUPPORT
1053    if (ompt_status & ompt_status_track) {
1054#if OMPT_BLAME
1055        if (ompt_status == ompt_status_track_callback) {
1056 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1057 my_parallel_id = team->t.ompt_team_info.parallel_id;
1058
1059#if OMPT_TRACE
1060            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1061 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1062 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1063 my_parallel_id, my_task_id);
1064 }
1065 }
1066#endif
1067            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1068 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1069 my_parallel_id, my_task_id);
1070 }
1071        }
1072#endif
1073 // It is OK to report the barrier state after the barrier begin callback.
1074 // According to the OMPT specification, a compliant implementation may
1075 // even delay reporting this state until the barrier begins to wait.
1076 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1077    }
1078#endif
1079
1080    if (! team->t.t_serialized) {
1081#if USE_ITT_BUILD
1082 // This value will be used in itt notify events below.
1083 void *itt_sync_obj = NULL;
1084# if USE_ITT_NOTIFY
1085 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1086 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1087# endif
1088#endif /* USE_ITT_BUILD */
1089 if (__kmp_tasking_mode == tskm_extra_barrier) {
1090 __kmp_tasking_barrier(team, this_thr, gtid);
1091 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1092 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1093 }
1094
1095 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1096 the team struct is not guaranteed to exist. */
1097 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1098 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1099 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1100 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1101 }
1102
1103#if USE_ITT_BUILD
1104 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1105 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1106#endif /* USE_ITT_BUILD */
1107#if USE_DEBUGGER
1108        // Let the debugger know: the thread has arrived at the barrier and is waiting.
1109 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1110 team->t.t_bar[bt].b_master_arrived += 1;
1111 } else {
1112 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1113 } // if
1114#endif /* USE_DEBUGGER */
1115        if (reduce != NULL) {
1116 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1117 this_thr->th.th_local.reduce_data = reduce_data;
1118 }
1119 switch (__kmp_barrier_gather_pattern[bt]) {
1120 case bp_hyper_bar: {
1121 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1122 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1123 USE_ITT_BUILD_ARG(itt_sync_obj) );
1124 break;
1125 }
1126 case bp_hierarchical_bar: {
1127 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1128 USE_ITT_BUILD_ARG(itt_sync_obj));
1129 break;
1130 }
1131 case bp_tree_bar: {
1132 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1133 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1134 USE_ITT_BUILD_ARG(itt_sync_obj) );
1135 break;
1136 }
1137 default: {
1138 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1139 USE_ITT_BUILD_ARG(itt_sync_obj) );
1140 }
1141 }
1142
1143 KMP_MB();
1144
1145 if (KMP_MASTER_TID(tid)) {
1146 status = 0;
1147 if (__kmp_tasking_mode != tskm_immediate_exec) {
1148 __kmp_task_team_wait(this_thr, team
1149 USE_ITT_BUILD_ARG(itt_sync_obj) );
1150                __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1151            }
1152#if USE_DEBUGGER
1153            // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1154 team->t.t_bar[bt].b_team_arrived += 1;
1155#endif
1156
1157#if USE_ITT_BUILD
1158 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1159 before the final summation into the shared variable is done (final summation can be a
1160 long operation for array reductions). */
1161 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1162 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1163#endif /* USE_ITT_BUILD */
1164#if USE_ITT_BUILD && USE_ITT_NOTIFY
1165            // Barrier - report frame end (only if active_level == 1)
1166 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1167#if OMP_40_ENABLED
1168 this_thr->th.th_teams_microtask == NULL &&
1169#endif
1170 team->t.t_active_level == 1)
1171 {
1172                kmp_uint64 cur_time = __itt_get_timestamp();
1173                kmp_info_t **other_threads = team->t.t_threads;
1174                int nproc = this_thr->th.th_team_nproc;
1175 int i;
1176                switch(__kmp_forkjoin_frames_mode) {
1177 case 1:
1178 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1179 this_thr->th.th_frame_time = cur_time;
1180 break;
1181                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1182                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1183 break;
1184 case 3:
1185 if( __itt_metadata_add_ptr ) {
1186                        // Initialize with master's wait time
1187 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1188                        for (i=1; i<nproc; ++i) {
1189 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1190 }
1191 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1192 }
1193 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1194 this_thr->th.th_frame_time = cur_time;
1195 break;
1196 }
1197 }
1198#endif /* USE_ITT_BUILD */
1199 } else {
1200 status = 1;
1201#if USE_ITT_BUILD
1202 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1203 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1204#endif /* USE_ITT_BUILD */
1205 }
1206 if (status == 1 || ! is_split) {
1207 switch (__kmp_barrier_release_pattern[bt]) {
1208 case bp_hyper_bar: {
1209 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1210 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1211 USE_ITT_BUILD_ARG(itt_sync_obj) );
1212 break;
1213 }
1214 case bp_hierarchical_bar: {
1215 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1216 USE_ITT_BUILD_ARG(itt_sync_obj) );
1217 break;
1218 }
1219 case bp_tree_bar: {
1220 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1221 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1222 USE_ITT_BUILD_ARG(itt_sync_obj) );
1223 break;
1224 }
1225 default: {
1226 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1227 USE_ITT_BUILD_ARG(itt_sync_obj) );
1228 }
1229 }
1230 if (__kmp_tasking_mode != tskm_immediate_exec) {
1231 __kmp_task_team_sync(this_thr, team);
1232 }
1233 }
1234
1235#if USE_ITT_BUILD
1236 /* GEH: TODO: Move this under if-condition above and also include in
1237 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1238 of the threads for split barriers. */
1239 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1240 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1241#endif /* USE_ITT_BUILD */
1242 } else { // Team is serialized.
1243 status = 0;
1244 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001245#if OMP_41_ENABLED
1246 if ( this_thr->th.th_task_team != NULL ) {
1247 void *itt_sync_obj = NULL;
1248#if USE_ITT_NOTIFY
1249 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1250 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1251 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1252 }
1253#endif
1254
Jonathan Peytone8104ad2015-06-08 18:56:33 +00001255 kmp_task_team_t * task_team;
1256 task_team = this_thr->th.th_task_team;
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001257 KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
1258 __kmp_task_team_wait(this_thr, team
1259 USE_ITT_BUILD_ARG(itt_sync_obj));
1260 __kmp_task_team_setup(this_thr, team, 0, 0);
1261
1262#if USE_ITT_BUILD
1263 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1264 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1265#endif /* USE_ITT_BUILD */
1266 }
1267#else
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001268 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001269 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001270 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001271#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001272 }
1273 }
1274 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1275 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001276
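    // OMPT: if a tool is tracking callbacks, report the end of this barrier (my_parallel_id and
    // my_task_id are captured earlier in __kmp_barrier) and return the thread to the
    // work-parallel state.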
1277#if OMPT_SUPPORT
1278 if (ompt_status & ompt_status_track) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001279#if OMPT_BLAME
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001280 if ((ompt_status == ompt_status_track_callback) &&
1281 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1282 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1283 my_parallel_id, my_task_id);
1284 }
1285#endif
1286 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1287 }
1288#endif
1289
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001290 return status;
1291}
1292
1293
1294void
1295__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1296{
1297 KMP_TIME_BLOCK(KMP_end_split_barrier);
1298 int tid = __kmp_tid_from_gtid(gtid);
1299 kmp_info_t *this_thr = __kmp_threads[gtid];
1300 kmp_team_t *team = this_thr->th.th_team;
1301
1302 if (!team->t.t_serialized) {
1303 if (KMP_MASTER_GTID(gtid)) {
1304 switch (__kmp_barrier_release_pattern[bt]) {
1305 case bp_hyper_bar: {
1306 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1307 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1308 USE_ITT_BUILD_ARG(NULL) );
1309 break;
1310 }
1311 case bp_hierarchical_bar: {
1312 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1313 USE_ITT_BUILD_ARG(NULL));
1314 break;
1315 }
1316 case bp_tree_bar: {
1317 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1318 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1319 USE_ITT_BUILD_ARG(NULL) );
1320 break;
1321 }
1322 default: {
1323 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1324 USE_ITT_BUILD_ARG(NULL) );
1325 }
1326 }
1327 if (__kmp_tasking_mode != tskm_immediate_exec) {
1328 __kmp_task_team_sync(this_thr, team);
1329 } // if
1330 }
1331 }
1332}
1333
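// Illustrative sketch (not compiled): how the two halves of a split reduction barrier are
// expected to pair up. The gather half runs in __kmp_barrier() with is_split=TRUE, and the
// release half is deferred to __kmp_end_split_barrier() once the reduction has been combined.
// The caller shown here is an assumption (the real callers are the compiler-facing reduce
// entry points in kmp_csupport.c); it is only meant to make the pairing concrete.
//
//   // in the reduce entry point:
//   int retval = __kmp_barrier(bs_reduction_barrier, gtid, TRUE /* is_split */,
//                              reduce_size, reduce_data, reduce_func);
//   // ... the reduction into the shared result is performed here ...
//
//   // in the matching end-reduce entry point:
//   __kmp_end_split_barrier(bs_reduction_barrier, gtid);  // release the threads still waiting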
1334
1335void
1336__kmp_join_barrier(int gtid)
1337{
1338 KMP_TIME_BLOCK(KMP_join_barrier);
1339 register kmp_info_t *this_thr = __kmp_threads[gtid];
1340 register kmp_team_t *team;
1341 register kmp_uint nproc;
1342 kmp_info_t *master_thread;
1343 int tid;
1344#ifdef KMP_DEBUG
1345 int team_id;
1346#endif /* KMP_DEBUG */
1347#if USE_ITT_BUILD
1348 void *itt_sync_obj = NULL;
1349# if USE_ITT_NOTIFY
1350 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1351 // Get object created at fork_barrier
1352 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1353# endif
1354#endif /* USE_ITT_BUILD */
1355 KMP_MB();
1356
1357 // Get current info
1358 team = this_thr->th.th_team;
1359 nproc = this_thr->th.th_team_nproc;
1360 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1361 tid = __kmp_tid_from_gtid(gtid);
1362#ifdef KMP_DEBUG
1363 team_id = team->t.t_id;
1364#endif /* KMP_DEBUG */
1365 master_thread = this_thr->th.th_team_master;
1366#ifdef KMP_DEBUG
1367 if (master_thread != team->t.t_threads[0]) {
1368 __kmp_print_structure();
1369 }
1370#endif /* KMP_DEBUG */
1371 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1372 KMP_MB();
1373
1374 // Verify state
1375 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1376 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1377 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1378 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1379 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1380
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001381#if OMPT_SUPPORT
1382#if OMPT_TRACE
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001383 if ((ompt_status == ompt_status_track_callback) &&
1384 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1385 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1386 team->t.ompt_team_info.parallel_id,
1387 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1388 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001389#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001390 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1391#endif
1392
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001393 if (__kmp_tasking_mode == tskm_extra_barrier) {
1394 __kmp_tasking_barrier(team, this_thr, gtid);
 1395 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1396 }
1397# ifdef KMP_DEBUG
1398 if (__kmp_tasking_mode != tskm_immediate_exec) {
1399 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001400 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001401 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001402 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001403 }
1404# endif /* KMP_DEBUG */
1405
 1406 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
 1407 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
 1408 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
 1409 since the values are not used by __kmp_wait_template() in that case. */
1410 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1411 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1412 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1413 }
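    // The same copy is repeated in __kmp_fork_barrier(), because the master may change its
    // blocktime between the join barrier and the next fork barrier.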
1414
1415#if USE_ITT_BUILD
1416 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1417 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1418#endif /* USE_ITT_BUILD */
1419
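    // The gather algorithm and its branching factor are chosen per barrier type at library
    // initialization (__kmp_barrier_gather_pattern[] / __kmp_barrier_gather_branch_bits[]),
    // typically from the KMP_*_BARRIER_PATTERN environment variables; linear is the fallback.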
1420 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1421 case bp_hyper_bar: {
1422 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1423 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1424 USE_ITT_BUILD_ARG(itt_sync_obj) );
1425 break;
1426 }
1427 case bp_hierarchical_bar: {
1428 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1429 USE_ITT_BUILD_ARG(itt_sync_obj) );
1430 break;
1431 }
1432 case bp_tree_bar: {
1433 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1434 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1435 USE_ITT_BUILD_ARG(itt_sync_obj) );
1436 break;
1437 }
1438 default: {
1439 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1440 USE_ITT_BUILD_ARG(itt_sync_obj) );
1441 }
1442 }
1443
1444 /* From this point on, the team data structure may be deallocated at any time by the
1445 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1446 data items that need to be referenced before the end of the barrier should be moved to
1447 the kmp_task_team_t structs. */
1448 if (KMP_MASTER_TID(tid)) {
1449 if (__kmp_tasking_mode != tskm_immediate_exec) {
1450 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1451 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1452 __kmp_task_team_wait(this_thr, team
1453 USE_ITT_BUILD_ARG(itt_sync_obj) );
1454 }
1455#if USE_ITT_BUILD
1456 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1457 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1458#endif /* USE_ITT_BUILD */
1459
1460# if USE_ITT_BUILD && USE_ITT_NOTIFY
1461 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001462 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1463#if OMP_40_ENABLED
1464 this_thr->th.th_teams_microtask == NULL &&
1465#endif
1466 team->t.t_active_level == 1)
1467 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001468 kmp_uint64 cur_time = __itt_get_timestamp();
1469 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001470 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001471 int nproc = this_thr->th.th_team_nproc;
1472 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001473 switch(__kmp_forkjoin_frames_mode) {
1474 case 1:
1475 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1476 break;
1477 case 2:
1478 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1479 break;
1480 case 3:
1481 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001482 // Initialize with master's wait time
1483 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001484 for (i=1; i<nproc; ++i) {
1485 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1486 }
1487 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1488 }
1489 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1490 this_thr->th.th_frame_time = cur_time;
1491 break;
1492 }
1493 }
1494# endif /* USE_ITT_BUILD */
1495 }
1496#if USE_ITT_BUILD
1497 else {
1498 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1499 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1500 }
1501#endif /* USE_ITT_BUILD */
1502
1503#if KMP_DEBUG
1504 if (KMP_MASTER_TID(tid)) {
1505 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1506 gtid, team_id, tid, nproc));
1507 }
1508#endif /* KMP_DEBUG */
1509
 1510 // TODO: now mark worker threads as done so they may be disbanded
1511 KMP_MB(); // Flush all pending memory write invalidates.
1512 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001513
1514#if OMPT_SUPPORT
Jonathan Peyton48281512015-07-01 15:16:04 +00001515 if (ompt_status & ompt_status_track) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001516#if OMPT_TRACE
1517 if ((ompt_status == ompt_status_track_callback) &&
1518 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1519 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1520 team->t.ompt_team_info.parallel_id,
1521 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1522 }
1523#endif
1524
1525 // return to default state
1526 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1527 }
1528#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001529}
1530
1531
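// __kmp_fork_barrier() below pairs with __kmp_join_barrier() above: at the fork the master
// (tid 0) prepares task teams and ICVs and then releases the team, while the workers wait in
// the release algorithm for their b_go flag; at the join every thread gathers and the master
// may then deallocate the team. The callers are expected to be the worker main loop and the
// master's fork/join paths in kmp_runtime.c.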
1532// TODO: release worker threads' fork barriers as they become ready instead of all at once
1533void
1534__kmp_fork_barrier(int gtid, int tid)
1535{
1536 KMP_TIME_BLOCK(KMP_fork_barrier);
1537 kmp_info_t *this_thr = __kmp_threads[gtid];
1538 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1539#if USE_ITT_BUILD
1540 void * itt_sync_obj = NULL;
1541#endif /* USE_ITT_BUILD */
1542
1543 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1544 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1545
1546 // th_team pointer only valid for master thread here
1547 if (KMP_MASTER_TID(tid)) {
1548#if USE_ITT_BUILD && USE_ITT_NOTIFY
1549 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1550 // Create itt barrier object
1551 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1552 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1553 }
1554#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1555
1556#ifdef KMP_DEBUG
1557 register kmp_info_t **other_threads = team->t.t_threads;
1558 register int i;
1559
1560 // Verify state
1561 KMP_MB();
1562
1563 for(i=1; i<team->t.t_nproc; ++i) {
1564 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1565 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1566 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1567 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1568 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1569 & ~(KMP_BARRIER_SLEEP_STATE))
1570 == KMP_INIT_BARRIER_STATE);
1571 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1572 }
1573#endif
1574
1575 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001576 __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0: set up both task teams if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001577 }
1578
1579 /* The master thread may have changed its blocktime between the join barrier and the
1580 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1581 access it when the team struct is not guaranteed to exist. */
1582 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1583 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1584 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1585 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1586 }
1587 } // master
1588
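    // Release phase of the fork barrier. The final TRUE argument is propagate_icvs: when
    // KMP_BARRIER_ICV_PUSH is in effect the release algorithm carries the master's ICVs to the
    // workers; workers block here until the master sets their b_go flag.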
1589 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1590 case bp_hyper_bar: {
1591 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1592 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1593 USE_ITT_BUILD_ARG(itt_sync_obj) );
1594 break;
1595 }
1596 case bp_hierarchical_bar: {
1597 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1598 USE_ITT_BUILD_ARG(itt_sync_obj) );
1599 break;
1600 }
1601 case bp_tree_bar: {
1602 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1603 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1604 USE_ITT_BUILD_ARG(itt_sync_obj) );
1605 break;
1606 }
1607 default: {
1608 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1609 USE_ITT_BUILD_ARG(itt_sync_obj) );
1610 }
1611 }
1612
1613 // Early exit for reaping threads releasing forkjoin barrier
1614 if (TCR_4(__kmp_global.g.g_done)) {
1615 if (this_thr->th.th_task_team != NULL) {
1616 if (KMP_MASTER_TID(tid)) {
1617 TCW_PTR(this_thr->th.th_task_team, NULL);
1618 }
1619 else {
1620 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1621 }
1622 }
1623
1624#if USE_ITT_BUILD && USE_ITT_NOTIFY
1625 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1626 if (!KMP_MASTER_TID(tid)) {
1627 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1628 if (itt_sync_obj)
1629 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1630 }
1631 }
1632#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1633 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1634 return;
1635 }
1636
1637 /* We can now assume that a valid team structure has been allocated by the master and
1638 propagated to all worker threads. The current thread, however, may not be part of the
1639 team, so we can't blindly assume that the team pointer is non-null. */
1640 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1641 KMP_DEBUG_ASSERT(team != NULL);
1642 tid = __kmp_tid_from_gtid(gtid);
1643
1644
1645#if KMP_BARRIER_ICV_PULL
1646 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1647 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1648 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1649 the fixed ICVs in the master's thread struct, because it is not always the case that the
1650 threads arrays have been allocated when __kmp_fork_call() is executed. */
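    // The fixed copy read below lives in the master's forkjoin-barrier bstate (th_fixed_icvs)
    // and was stashed there by __kmp_setup_icv_copy() before the workers were released.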
1651 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1652 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1653 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1654 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1655 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1656 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1657 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1658 }
1659 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1660#endif // KMP_BARRIER_ICV_PULL
1661
1662 if (__kmp_tasking_mode != tskm_immediate_exec) {
1663 __kmp_task_team_sync(this_thr, team);
1664 }
1665
1666#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1667 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1668 if (proc_bind == proc_bind_intel) {
1669#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001670#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001671 // Call dynamic affinity settings
1672 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1673 __kmp_balanced_affinity(tid, team->t.t_nproc);
1674 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001675#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001676#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1677 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001678 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001679 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1680 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1681 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1682 }
1683 else {
1684 __kmp_affinity_set_place(gtid);
1685 }
1686 }
1687#endif
1688
1689#if USE_ITT_BUILD && USE_ITT_NOTIFY
1690 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1691 if (!KMP_MASTER_TID(tid)) {
1692 // Get correct barrier object
1693 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1694 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1695 } // (prepare called inside barrier_release)
1696 }
1697#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1698 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1699}
1700
1701
1702void
1703__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1704{
1705 KMP_TIME_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001706
1707 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1708 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1709
1710 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1711 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1712 this data before this function is called. */
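    /* Three propagation strategies are compiled in, selected at build time:
       - KMP_BARRIER_ICV_PULL: stash the ICVs once in the master's th_fixed_icvs; each worker
         pulls its own copy in __kmp_fork_barrier().
       - KMP_BARRIER_ICV_PUSH: the ICVs travel with the fork-barrier release pattern, so nothing
         is copied here.
       - otherwise: copy the ICVs into every worker's implicit task right here, an O(nthreads)
         loop (using the ngo_* store macros). */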
1713#if KMP_BARRIER_ICV_PULL
 1714 /* Copy the ICVs into th_fixed_icvs in the master's thread structure (a copy that remains
 1715 untouched), where all of the worker threads can access them and make their own copies after the barrier. */
1716 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1717 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1718 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1719 0, team->t.t_threads[0], team));
1720#elif KMP_BARRIER_ICV_PUSH
1721 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1722 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1723 0, team->t.t_threads[0], team));
1724#else
1725 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1726 ngo_load(new_icvs);
1727 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001728 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001729 // TODO: GEH - pass in better source location info since usually NULL here
1730 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1731 f, team->t.t_threads[f], team));
1732 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1733 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1734 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1735 f, team->t.t_threads[f], team));
1736 }
1737 ngo_sync();
1738#endif // KMP_BARRIER_ICV_PULL
1739}