/*
 * kmp_barrier.cpp
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   memcpy((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
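
/* The ngo_* helpers copy a cache line of ICVs (and, for ngo_store_go, the cache line whose
   trailing bytes carry the b_go flag) using Xeon Phi non-globally-ordered 512-bit stores;
   ngo_sync() then publishes them with a locked no-op. On every other target they fall back
   to a plain copy_icvs()/memcpy(), so the barrier code below can use them unconditionally. */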

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
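/* Flat algorithm: each worker bumps its own b_arrived flag, while the master spins on every
   worker's flag in turn (folding reduce_data along the way if a reduction was requested); on
   release the master bumps each worker's b_go flag and workers spin only on their own b_go.
   With nproc == 4, for example, the master waits on threads 1, 2 and 3 one after another, so
   the gather cost grows linearly with the team size. */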
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            KMP_START_EXPLICIT_TIMER(USER_icv_copy);
            if (propagate_icvs) {
                ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                for (i=1; i<nproc; ++i) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                    ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                   &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
                ngo_sync();
            }
            KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier
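/* Tree layout: with a branching factor of 2^branch_bits, the children of thread tid are
   (tid << branch_bits) + 1 .. (tid << branch_bits) + branch_factor and its parent is
   (tid - 1) >> branch_bits. E.g. with branch_bits == 2, thread 1 gathers children 5-8 and
   thread 6 reports to parent (6-1)>>2 == 1. Children are waited on (and reduced into) during
   the gather and released top-down during the release. */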
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            KMP_START_EXPLICIT_TIMER(USER_icv_copy);
            if (propagate_icvs) {
                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                         team, child_tid, FALSE);
                copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                          &team->t.t_implicit_task_taskdata[0].td_icvs);
            }
            KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
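/* How the hypercube is walked: at level L, a thread whose tid has a non-zero digit in that
   level's branch_bits-wide position reports to parent_tid = tid & ~((1 << (L + branch_bits)) - 1)
   and drops out of the loop, while threads with a zero digit gather their children at stride
   1<<L and move up a level. E.g. with branch_bits == 1, odd tids report to tid-1 at level 0,
   then tid 2 reports to 0 and tid 6 reports to 4 at level 1, and so on, giving a gather depth
   of about log2(num_threads) instead of num_threads. */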
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go.  */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

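/* The hierarchical barrier mirrors the machine hierarchy obtained from __kmp_get_hierarchy().
   When blocktime is infinite and the team is not nested (use_oncore_barrier), the leaf
   children of a parent do not spin on their own flags: each leaf checks in by writing its
   "offset" byte inside the parent's 64-bit b_arrived word and is released through the
   matching byte of the parent's b_go word, so a whole leaf group polls a single cache line.
   Non-leaf levels use the usual per-thread b_arrived/b_go flags, and ICVs are pushed down
   the tree through th_fixed_icvs. */
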
// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
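        /* Each child owns one char lane ("offset") of its parent's 64-bit barrier flags,
           counted down from lane 7: the first thread after the parent gets lane 7, the next
           lane 6, and so on. leaf_state (set up below) marks the lanes of all leaf children;
           the parent waits until every one of those lanes has been filled in its b_arrived
           word and then clears them again with KMP_TEST_THEN_AND64(). */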
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}
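
/* Note on how the hierarchy data is consumed by the gather/release code below:
   skip_per_level[d] is the distance between sibling subtree roots at depth d
   (skip_per_level[0] == 1 for the leaves), so a thread's children at level d are
   tid+skip, tid+2*skip, ... up to (but not including) tid+skip_per_level[d+1].
   Illustrative example only (the real shape comes from __kmp_get_hierarchy()): if every
   level branched by 4, skip_per_level would be {1, 4, 16, ...} and thread 0 would own
   leaves 1-3 at level 0 and subtree roots 4, 8 and 12 at level 1. */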

static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state
                                                            : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    int level = team->t.t_level;
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
    nproc = this_thr->th.th_team_nproc;

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
   Returns 0 if master thread, 1 if worker thread. */
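/* Illustrative only: a plain (non-split, non-reducing) barrier for the calling thread would
   look roughly like
       __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
   whereas a split reduction barrier passes is_split=TRUE together with reduce_size,
   reduce_data and a combiner callback. The actual call sites live outside this file. */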
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    KMP_TIME_BLOCK(KMP_barrier);
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;
#endif

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT && OMPT_TRACE
    if (ompt_status & ompt_status_track) {
        if (ompt_status == ompt_status_track_callback) {
            my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
            my_parallel_id = team->t.ompt_team_info.parallel_id;

            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
                if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
                    ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
                        my_parallel_id, my_task_id);
                }
            }
            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
                ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
                    my_parallel_id, my_task_id);
            }
        } else {
            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
        }
    }
#endif

    if (!team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

        if (reduce != NULL) {
            //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
            this_thr->th.th_local.reduce_data = reduce_data;
        }
        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
                __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team
            }


#if USE_ITT_BUILD
            /* TODO: In case of split reduction barrier, master thread may send acquired event early,
               before the final summation into the shared variable is done (final summation can be a
               long operation for array reductions).  */
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end
            if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                // Initialize with master's wait time
                kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                switch (__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if (__itt_metadata_add_ptr) {
                        for (i=1; i<nproc; ++i) {
                            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)(reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
        if (status == 1 || !is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under if-condition above and also include in
           __kmp_end_split_barrier(). This will more accurately represent the actual release time
           of the threads for split barriers.  */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
    if (ompt_status & ompt_status_track) {
#if OMPT_TRACE
        if ((ompt_status == ompt_status_track_callback) &&
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                my_parallel_id, my_task_id);
        }
#endif
        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    }
#endif

    return status;
}

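/* Finish the release phase of a barrier that was started with is_split=TRUE: only the
   master thread of a non-serialized team actually releases the workers here, using the
   same release pattern that __kmp_barrier() would have used. */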
1243void
1244__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1245{
1246 KMP_TIME_BLOCK(KMP_end_split_barrier);
1247 int tid = __kmp_tid_from_gtid(gtid);
1248 kmp_info_t *this_thr = __kmp_threads[gtid];
1249 kmp_team_t *team = this_thr->th.th_team;
1250
1251 if (!team->t.t_serialized) {
1252 if (KMP_MASTER_GTID(gtid)) {
1253 switch (__kmp_barrier_release_pattern[bt]) {
1254 case bp_hyper_bar: {
1255 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1256 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1257 USE_ITT_BUILD_ARG(NULL) );
1258 break;
1259 }
1260 case bp_hierarchical_bar: {
1261 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1262 USE_ITT_BUILD_ARG(NULL));
1263 break;
1264 }
1265 case bp_tree_bar: {
1266 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1267 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1268 USE_ITT_BUILD_ARG(NULL) );
1269 break;
1270 }
1271 default: {
1272 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1273 USE_ITT_BUILD_ARG(NULL) );
1274 }
1275 }
1276 if (__kmp_tasking_mode != tskm_immediate_exec) {
1277 __kmp_task_team_sync(this_thr, team);
1278 } // if
1279 }
1280 }
1281}
1282
1283
1284void
1285__kmp_join_barrier(int gtid)
1286{
1287 KMP_TIME_BLOCK(KMP_join_barrier);
1288 register kmp_info_t *this_thr = __kmp_threads[gtid];
1289 register kmp_team_t *team;
1290 register kmp_uint nproc;
1291 kmp_info_t *master_thread;
1292 int tid;
1293#ifdef KMP_DEBUG
1294 int team_id;
1295#endif /* KMP_DEBUG */
1296#if USE_ITT_BUILD
1297 void *itt_sync_obj = NULL;
1298# if USE_ITT_NOTIFY
1299 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1300 // Get object created at fork_barrier
1301 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1302# endif
1303#endif /* USE_ITT_BUILD */
1304 KMP_MB();
1305
1306 // Get current info
1307 team = this_thr->th.th_team;
1308 nproc = this_thr->th.th_team_nproc;
1309 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1310 tid = __kmp_tid_from_gtid(gtid);
1311#ifdef KMP_DEBUG
1312 team_id = team->t.t_id;
1313#endif /* KMP_DEBUG */
1314 master_thread = this_thr->th.th_team_master;
1315#ifdef KMP_DEBUG
1316 if (master_thread != team->t.t_threads[0]) {
1317 __kmp_print_structure();
1318 }
1319#endif /* KMP_DEBUG */
1320 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1321 KMP_MB();
1322
1323 // Verify state
1324 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1325 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1326 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1327 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1328 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1329
1330#if OMPT_SUPPORT && OMPT_TRACE
1331 if ((ompt_status == ompt_status_track_callback) &&
1332 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1333 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1334 team->t.ompt_team_info.parallel_id,
1335 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1336 }
1337 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1338#endif
1339
1340    if (__kmp_tasking_mode == tskm_extra_barrier) {
1341 __kmp_tasking_barrier(team, this_thr, gtid);
1342        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1343 }
1344# ifdef KMP_DEBUG
1345 if (__kmp_tasking_mode != tskm_immediate_exec) {
1346 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1347                  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1348                  this_thr->th.th_task_team));
1349        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1350    }
1351# endif /* KMP_DEBUG */
1352
1353 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1354       team struct is not guaranteed to exist. Doing these loads causes a cache miss, which slows
1355       down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1356 since the values are not used by __kmp_wait_template() in that case. */
1357 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1358 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1359 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1360 }
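// A minimal sketch, with hypothetical example_* names, of the idea in the comment
// above: copy the few values the wait loop will need out of the team structure and
// into the thread's own descriptor, and skip even that copy when blocktime is
// infinite because the wait loop never reads the values in that case.
#if 0
struct example_team   { int bt_intervals; int bt_set; };
struct example_thread { int bt_intervals; int bt_set; };

static void example_cache_blocktime(example_thread *thr, const example_team *team,
                                    int blocktime_is_infinite) {
    if (blocktime_is_infinite)
        return;                              // values unused; avoid the cache miss
    // After the barrier the team may already be freed, so the wait loop reads
    // these thread-local copies instead of the team fields.
    thr->bt_intervals = team->bt_intervals;
    thr->bt_set       = team->bt_set;
}
#endif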
1361
1362#if USE_ITT_BUILD
1363 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1364 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1365#endif /* USE_ITT_BUILD */
1366
1367 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1368 case bp_hyper_bar: {
1369 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1370 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1371 USE_ITT_BUILD_ARG(itt_sync_obj) );
1372 break;
1373 }
1374 case bp_hierarchical_bar: {
1375 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1376 USE_ITT_BUILD_ARG(itt_sync_obj) );
1377 break;
1378 }
1379 case bp_tree_bar: {
1380 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1381 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1382 USE_ITT_BUILD_ARG(itt_sync_obj) );
1383 break;
1384 }
1385 default: {
1386 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1387 USE_ITT_BUILD_ARG(itt_sync_obj) );
1388 }
1389 }
1390
1391 /* From this point on, the team data structure may be deallocated at any time by the
1392 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1393 data items that need to be referenced before the end of the barrier should be moved to
1394 the kmp_task_team_t structs. */
1395 if (KMP_MASTER_TID(tid)) {
1396 if (__kmp_tasking_mode != tskm_immediate_exec) {
1397 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1398 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1399 __kmp_task_team_wait(this_thr, team
1400 USE_ITT_BUILD_ARG(itt_sync_obj) );
1401 }
1402#if USE_ITT_BUILD
1403 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1404 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1405#endif /* USE_ITT_BUILD */
1406
1407# if USE_ITT_BUILD && USE_ITT_NOTIFY
1408 // Join barrier - report frame end
1409 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1410 kmp_uint64 cur_time = __itt_get_timestamp();
1411 ident_t * loc = team->t.t_ident;
1412 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1413 int nproc = this_thr->th.th_team_nproc;
1414 int i;
1415 // Initialize with master's wait time
1416 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1417 switch(__kmp_forkjoin_frames_mode) {
1418 case 1:
1419 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1420 break;
1421 case 2:
1422 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1423 break;
1424 case 3:
1425 if( __itt_metadata_add_ptr ) {
1426 for (i=1; i<nproc; ++i) {
1427 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1428 }
1429 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1430 }
1431 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1432 this_thr->th.th_frame_time = cur_time;
1433 break;
1434 }
1435 }
1436# endif /* USE_ITT_BUILD */
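// A minimal sketch, with hypothetical names, of the imbalance value reported in
// frames mode 3 above: the sum over all team members of the time between that
// thread's arrival at the barrier and the master's timestamp at the end of the
// gather, i.e. the total time the team spent waiting here.
#if 0
static unsigned long long example_barrier_imbalance(const unsigned long long *arrive_time,
                                                    int nproc,
                                                    unsigned long long cur_time) {
    unsigned long long delta = 0;
    for (int i = 0; i < nproc; ++i)          // index 0 is the master
        delta += cur_time - arrive_time[i];
    return delta;
}
#endif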
1437 }
1438#if USE_ITT_BUILD
1439 else {
1440 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1441 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1442 }
1443#endif /* USE_ITT_BUILD */
1444
1445#if KMP_DEBUG
1446 if (KMP_MASTER_TID(tid)) {
1447 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1448 gtid, team_id, tid, nproc));
1449 }
1450#endif /* KMP_DEBUG */
1451
1452 // TODO now, mark worker threads as done so they may be disbanded
1453 KMP_MB(); // Flush all pending memory write invalidates.
1454 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1455
1456#if OMPT_SUPPORT
1457    if (ompt_status & ompt_status_track) {
1458#if OMPT_TRACE
1459 if ((ompt_status == ompt_status_track_callback) &&
1460 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1461 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1462 team->t.ompt_team_info.parallel_id,
1463 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1464 }
1465#endif
1466
1467 // return to default state
1468 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1469 }
1470#endif
1471}
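// A minimal sketch, with hypothetical example_* names, of the state bookkeeping
// that brackets the barrier above: the thread's tool-visible state is switched to
// "waiting at barrier" before the gather and restored afterwards, so that a
// sampling tool attributes the wait time correctly. This is only a schematic, not
// the OMPT interface itself.
#if 0
enum example_state { example_state_work, example_state_wait_barrier, example_state_overhead };

static void example_barrier_with_state(volatile enum example_state *state,
                                       void (*do_barrier)(void),
                                       enum example_state state_after) {
    *state = example_state_wait_barrier;     // visible to an asynchronous sampler
    do_barrier();
    *state = state_after;                    // e.g. "overhead" after a join barrier
}
#endif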
1472
1473
1474// TODO release worker threads' fork barriers as we are ready instead of all at once
1475void
1476__kmp_fork_barrier(int gtid, int tid)
1477{
1478 KMP_TIME_BLOCK(KMP_fork_barrier);
1479 kmp_info_t *this_thr = __kmp_threads[gtid];
1480 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1481#if USE_ITT_BUILD
1482 void * itt_sync_obj = NULL;
1483#endif /* USE_ITT_BUILD */
1484
1485 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1486 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1487
1488 // th_team pointer only valid for master thread here
1489 if (KMP_MASTER_TID(tid)) {
1490#if USE_ITT_BUILD && USE_ITT_NOTIFY
1491 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1492 // Create itt barrier object
1493 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1494 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1495 }
1496#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1497
1498#ifdef KMP_DEBUG
1499 register kmp_info_t **other_threads = team->t.t_threads;
1500 register int i;
1501
1502 // Verify state
1503 KMP_MB();
1504
1505 for(i=1; i<team->t.t_nproc; ++i) {
1506 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1507 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1508 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1509 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1510 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1511 & ~(KMP_BARRIER_SLEEP_STATE))
1512 == KMP_INIT_BARRIER_STATE);
1513 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1514 }
1515#endif
1516
1517 if (__kmp_tasking_mode != tskm_immediate_exec) {
1518            __kmp_task_team_setup(this_thr, team, 1); // 1 indicates setup both task teams
1519        }
1520
1521 /* The master thread may have changed its blocktime between the join barrier and the
1522 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1523 access it when the team struct is not guaranteed to exist. */
1524 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1525 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1526 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1527 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1528 }
1529 } // master
1530
1531 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1532 case bp_hyper_bar: {
1533 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1534 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1535 USE_ITT_BUILD_ARG(itt_sync_obj) );
1536 break;
1537 }
1538 case bp_hierarchical_bar: {
1539 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1540 USE_ITT_BUILD_ARG(itt_sync_obj) );
1541 break;
1542 }
1543 case bp_tree_bar: {
1544 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1545 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1546 USE_ITT_BUILD_ARG(itt_sync_obj) );
1547 break;
1548 }
1549 default: {
1550 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1551 USE_ITT_BUILD_ARG(itt_sync_obj) );
1552 }
1553 }
1554
1555 // Early exit for reaping threads releasing forkjoin barrier
1556 if (TCR_4(__kmp_global.g.g_done)) {
1557 if (this_thr->th.th_task_team != NULL) {
1558 if (KMP_MASTER_TID(tid)) {
1559 TCW_PTR(this_thr->th.th_task_team, NULL);
1560 }
1561 else {
1562 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1563 }
1564 }
1565
1566#if USE_ITT_BUILD && USE_ITT_NOTIFY
1567 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1568 if (!KMP_MASTER_TID(tid)) {
1569 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1570 if (itt_sync_obj)
1571 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1572 }
1573 }
1574#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1575 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1576 return;
1577 }
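// A minimal sketch, with hypothetical example_* names, of the early-exit test
// above: a worker woken by the fork-barrier release first checks a global shutdown
// flag, and if it is set it drops its task-team reference and returns so the
// thread can be reaped instead of joining a new team.
#if 0
static int example_should_exit_after_release(int global_done,
                                             void **task_team_ref,
                                             void (*drop_ref)(void *)) {
    if (!global_done)
        return 0;                            // normal path: go pick up the new team
    if (*task_team_ref != NULL) {
        drop_ref(*task_team_ref);            // release resources before reaping
        *task_team_ref = NULL;
    }
    return 1;
}
#endif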
1578
1579 /* We can now assume that a valid team structure has been allocated by the master and
1580 propagated to all worker threads. The current thread, however, may not be part of the
1581 team, so we can't blindly assume that the team pointer is non-null. */
1582 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1583 KMP_DEBUG_ASSERT(team != NULL);
1584 tid = __kmp_tid_from_gtid(gtid);
1585
1586
1587#if KMP_BARRIER_ICV_PULL
1588 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1589 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1590 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1591 the fixed ICVs in the master's thread struct, because it is not always the case that the
1592 threads arrays have been allocated when __kmp_fork_call() is executed. */
1593 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1594 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1595 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1596 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1597 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1598 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1599 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1600 }
1601 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1602#endif // KMP_BARRIER_ICV_PULL
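// A minimal sketch, with hypothetical example_* names, of the PULL model used
// above: the master publishes a single copy of the ICVs in a fixed, stable slot of
// its own thread structure, and every released worker copies ("pulls") it into its
// own implicit task, keeping the master's work O(1).
#if 0
struct example_icvs        { int nproc; int dynamic; int blocktime; };
struct example_master_slot { struct example_icvs fixed_icvs; };  // written once by the master

static void example_pull_icvs(struct example_icvs *my_task_icvs,
                              const struct example_master_slot *master_slot) {
    *my_task_icvs = master_slot->fixed_icvs; // each worker copies after being released
}
#endif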
1603
1604 if (__kmp_tasking_mode != tskm_immediate_exec) {
1605 __kmp_task_team_sync(this_thr, team);
1606 }
1607
1608#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1609 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1610 if (proc_bind == proc_bind_intel) {
1611#endif
1612#if KMP_AFFINITY_SUPPORTED
1613    // Call dynamic affinity settings
1614 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1615 __kmp_balanced_affinity(tid, team->t.t_nproc);
1616 }
1617#endif // KMP_AFFINITY_SUPPORTED
1618#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1619 }
1620    else if (proc_bind != proc_bind_false) {
1621        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1622 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1623 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1624 }
1625 else {
1626 __kmp_affinity_set_place(gtid);
1627 }
1628 }
1629#endif
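// A minimal sketch, with hypothetical example_* names, of the re-binding logic
// above: a thread only changes its binding when the place it should occupy differs
// from the one it currently occupies. The placement policy shown (tid modulo the
// number of places) is a stand-in, not the runtime's actual balanced-affinity
// algorithm.
#if 0
static void example_update_place(int tid, int nplaces, int *current_place,
                                 void (*bind_to_place)(int place)) {
    int new_place = tid % nplaces;           // hypothetical placement policy
    if (new_place != *current_place) {       // already in the right place: do nothing
        bind_to_place(new_place);
        *current_place = new_place;
    }
}
#endif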
1630
1631#if USE_ITT_BUILD && USE_ITT_NOTIFY
1632 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1633 if (!KMP_MASTER_TID(tid)) {
1634 // Get correct barrier object
1635 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1636 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1637 } // (prepare called inside barrier_release)
1638 }
1639#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1640 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1641}
1642
1643
1644void
1645__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1646{
1647 KMP_TIME_BLOCK(KMP_setup_icv_copy);
1648 int f;
1649
1650 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1651 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1652
1653 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1654 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1655 this data before this function is called. */
1656#if KMP_BARRIER_ICV_PULL
1657    /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which otherwise remains
1658       untouched), where all of the worker threads can access them and make their own copies after the barrier. */
1659 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1660 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1661 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1662 0, team->t.t_threads[0], team));
1663#elif KMP_BARRIER_ICV_PUSH
1664 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1665 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1666 0, team->t.t_threads[0], team));
1667#else
1668 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1669 ngo_load(new_icvs);
1670 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1671 for (f=1; f<new_nproc; ++f) { // Skip the master thread
1672 // TODO: GEH - pass in better source location info since usually NULL here
1673 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1674 f, team->t.t_threads[f], team));
1675 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1676 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1677 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1678 f, team->t.t_threads[f], team));
1679 }
1680 ngo_sync();
1681#endif // KMP_BARRIER_ICV_PULL
1682}
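// A minimal sketch, with hypothetical example_* names, of the fallback path above:
// a plain O(nthreads) loop on the master that copies the new ICVs into every
// worker's implicit task. The PULL and PUSH variants exist to move this loop off
// the master's critical path.
#if 0
struct example_icvs { int nproc; int dynamic; int blocktime; };

static void example_linear_icv_copy(struct example_icvs *task_icvs /* one per thread */,
                                    const struct example_icvs *new_icvs, int nproc) {
    for (int f = 1; f < nproc; ++f)          // skip the master; it already has the ICVs
        task_icvs[f] = *new_icvs;
}
#endif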