1/*
2 * kmp_barrier.cpp
3 * $Revision: 43473 $
4 * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18#include "kmp.h"
19#include "kmp_wait_release.h"
20#include "kmp_stats.h"
21#include "kmp_itt.h"
22
23#if KMP_MIC
24#include <immintrin.h>
25#define USE_NGO_STORES 1
26#endif // KMP_MIC
27
28#if KMP_MIC && USE_NGO_STORES
29// ICV copying
30#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
31#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34#else
35#define ngo_load(src) ((void)0)
36#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37#define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE)
38#define ngo_sync() ((void)0)
39#endif /* KMP_MIC && USE_NGO_STORES */
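// Illustrative sketch (not compiled into the runtime): how the ngo_* macros above are
// meant to be used -- load one cache-line-sized ICV block once, stream it into every
// destination slot, then fence -- mirroring the ICV-push loops later in this file.
// icv_line_t is a hypothetical stand-in for the real kmp_internal_control_t block (td_icvs).
#if 0
typedef struct icv_line_t { char data[CACHE_LINE]; } icv_line_t;

static void
__kmp_broadcast_icvs_sketch(icv_line_t *slots, int nproc)
{
    ngo_load(&slots[0]);                      // load the source cache line once
    for (int i=1; i<nproc; ++i)
        ngo_store_icvs(&slots[i], &slots[0]); // streaming store per destination on MIC,
                                              // plain copy_icvs()/memcpy otherwise
    ngo_sync();                               // fence so the stores are visible before reuse
}
#endif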
40
41void __kmp_print_structure(void); // Forward declaration
42
43// ---------------------------- Barrier Algorithms ----------------------------
44
45// Linear Barrier
46static void
47__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48 void (*reduce)(void *, void *)
49 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50{
51 KMP_TIME_BLOCK(KMP_linear_gather);
52 register kmp_team_t *team = this_thr->th.th_team;
53 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54 register kmp_info_t **other_threads = team->t.t_threads;
55
56 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57 gtid, team->t.t_id, tid, bt));
58 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60#if USE_ITT_BUILD && USE_ITT_NOTIFY
61 // Barrier imbalance - save arrive time to the thread
62 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
63 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64 }
65#endif
66 // We now perform a linear reduction to signal that all of the threads have arrived.
67 if (!KMP_MASTER_TID(tid)) {
68 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
 69                   " arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
70 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72 // Mark arrival to master thread
73 /* After performing this write, a worker thread may not assume that the team is valid
74 any more - it could be deallocated by the master thread at any time. */
75 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76 flag.release();
77 } else {
78 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79 register int nproc = this_thr->th.th_team_nproc;
80 register int i;
81 // Don't have to worry about sleep bit here or atomic since team setting
82 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83
84 // Collect all the worker team member threads.
85 for (i=1; i<nproc; ++i) {
86#if KMP_CACHE_MANAGE
87 // Prefetch next thread's arrived count
88 if (i+1 < nproc)
89 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90#endif /* KMP_CACHE_MANAGE */
91 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
93 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95
96 // Wait for worker thread to arrive
97 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98 flag.wait(this_thr, FALSE
99 USE_ITT_BUILD_ARG(itt_sync_obj) );
100#if USE_ITT_BUILD && USE_ITT_NOTIFY
101 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
103 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104 other_threads[i]->th.th_bar_min_time);
105 }
106#endif
107 if (reduce) {
108 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110 (*reduce)(this_thr->th.th_local.reduce_data,
111 other_threads[i]->th.th_local.reduce_data);
112 }
113 }
114 // Don't have to worry about sleep bit here or atomic since team setting
115 team_bar->b_arrived = new_state;
116 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
117 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118 }
119 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120 gtid, team->t.t_id, tid, bt));
121}
122
123static void
124__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
125 int propagate_icvs
126 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
127{
128 KMP_TIME_BLOCK(KMP_linear_release);
129 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
130 register kmp_team_t *team;
131
132 if (KMP_MASTER_TID(tid)) {
133 register unsigned int i;
134 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
135 register kmp_info_t **other_threads;
136
137 team = __kmp_threads[gtid]->th.th_team;
138 KMP_DEBUG_ASSERT(team != NULL);
139 other_threads = team->t.t_threads;
140
141 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
142 gtid, team->t.t_id, tid, bt));
143
144 if (nproc > 1) {
145#if KMP_BARRIER_ICV_PUSH
146 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
147 if (propagate_icvs) {
148 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
149 for (i=1; i<nproc; ++i) {
150 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
151 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
152 &team->t.t_implicit_task_taskdata[0].td_icvs);
153 }
154 ngo_sync();
155 }
156 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
157#endif // KMP_BARRIER_ICV_PUSH
158
159 // Now, release all of the worker threads
160 for (i=1; i<nproc; ++i) {
161#if KMP_CACHE_MANAGE
162 // Prefetch next thread's go flag
163 if (i+1 < nproc)
164 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
165#endif /* KMP_CACHE_MANAGE */
166 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
167 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
168 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
169 &other_threads[i]->th.th_bar[bt].bb.b_go,
170 other_threads[i]->th.th_bar[bt].bb.b_go,
171 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
172 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
173 flag.release();
174 }
175 }
176 } else { // Wait for the MASTER thread to release us
177 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
178 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
179 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
180 flag.wait(this_thr, TRUE
181 USE_ITT_BUILD_ARG(itt_sync_obj) );
182#if USE_ITT_BUILD && USE_ITT_NOTIFY
183 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
184 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
185 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
186 // Cancel wait on previous parallel region...
187 __kmp_itt_task_starting(itt_sync_obj);
188
189 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
190 return;
191
192 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
193 if (itt_sync_obj != NULL)
194 // Call prepare as early as possible for "new" barrier
195 __kmp_itt_task_finished(itt_sync_obj);
196 } else
197#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
198 // Early exit for reaping threads releasing forkjoin barrier
199 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
200 return;
201 // The worker thread may now assume that the team is valid.
202#ifdef KMP_DEBUG
203 tid = __kmp_tid_from_gtid(gtid);
204 team = __kmp_threads[gtid]->th.th_team;
205#endif
206 KMP_DEBUG_ASSERT(team != NULL);
207 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
208 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
209 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
210 KMP_MB(); // Flush all pending memory write invalidates.
211 }
212 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
213 gtid, team->t.t_id, tid, bt));
214}
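// Illustrative sketch (not compiled into the runtime): the gather/release structure of the
// linear barrier above, reduced to std::atomic so it can be read in isolation. The real code
// adds reductions, ICV propagation, sleep/wake support via kmp_flag_64, ITT notification, and
// the team-deallocation hazard noted in the comments.
#if 0
#include <atomic>

struct simple_bar_t { std::atomic<unsigned> arrived{0}, go{0}; };

static void
__kmp_linear_barrier_sketch(simple_bar_t *bars, int nproc, int tid, unsigned epoch)
{
    if (tid != 0) {   // worker: signal arrival on my own flag, then wait to be released
        bars[tid].arrived.store(epoch, std::memory_order_release);
        while (bars[tid].go.load(std::memory_order_acquire) != epoch) {}
    } else {          // master: gather every worker, then release them one by one
        for (int i=1; i<nproc; ++i)
            while (bars[i].arrived.load(std::memory_order_acquire) != epoch) {}
        for (int i=1; i<nproc; ++i)
            bars[i].go.store(epoch, std::memory_order_release);
    }
}
#endif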
215
216// Tree barrier
217static void
218__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
219 void (*reduce)(void *, void *)
220 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
221{
222 KMP_TIME_BLOCK(KMP_tree_gather);
223 register kmp_team_t *team = this_thr->th.th_team;
224 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
225 register kmp_info_t **other_threads = team->t.t_threads;
226 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
227 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
228 register kmp_uint32 branch_factor = 1 << branch_bits;
229 register kmp_uint32 child;
230 register kmp_uint32 child_tid;
231 register kmp_uint new_state;
232
233 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
234 gtid, team->t.t_id, tid, bt));
235 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
236
237#if USE_ITT_BUILD && USE_ITT_NOTIFY
238 // Barrier imbalance - save arrive time to the thread
239 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
240 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
241 }
242#endif
243 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
244 child_tid = (tid << branch_bits) + 1;
245 if (child_tid < nproc) {
246 // Parent threads wait for all their children to arrive
247 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
248 child = 1;
249 do {
250 register kmp_info_t *child_thr = other_threads[child_tid];
251 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
252#if KMP_CACHE_MANAGE
253 // Prefetch next thread's arrived count
254 if (child+1 <= branch_factor && child_tid+1 < nproc)
255 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
256#endif /* KMP_CACHE_MANAGE */
257 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
258 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
259 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
260 &child_bar->b_arrived, new_state));
261 // Wait for child to arrive
262 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
263 flag.wait(this_thr, FALSE
264 USE_ITT_BUILD_ARG(itt_sync_obj) );
265#if USE_ITT_BUILD && USE_ITT_NOTIFY
266 // Barrier imbalance - write min of the thread time and a child time to the thread.
267 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
268 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
269 child_thr->th.th_bar_min_time);
270 }
271#endif
272 if (reduce) {
273 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
274 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
275 team->t.t_id, child_tid));
276 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
277 }
278 child++;
279 child_tid++;
280 }
281 while (child <= branch_factor && child_tid < nproc);
282 }
283
284 if (!KMP_MASTER_TID(tid)) { // Worker threads
285 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
286
287 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
288 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
289 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
290 &thr_bar->b_arrived, thr_bar->b_arrived,
291 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
292
293 // Mark arrival to parent thread
294 /* After performing this write, a worker thread may not assume that the team is valid
295 any more - it could be deallocated by the master thread at any time. */
296 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
297 flag.release();
298 } else {
299 // Need to update the team arrived pointer if we are the master thread
300 if (nproc > 1) // New value was already computed above
301 team->t.t_bar[bt].b_arrived = new_state;
302 else
303 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
304 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
305 gtid, team->t.t_id, tid, team->t.t_id,
306 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
307 }
308 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
309 gtid, team->t.t_id, tid, bt));
310}
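// Illustrative sketch (not compiled into the runtime): the index arithmetic behind the tree
// gather above. With branch_bits b, thread tid waits for children (tid<<b)+1 up to (but not
// including) min((tid<<b)+1+2^b, nproc) and reports to parent (tid-1)>>b, exactly as coded above.
#if 0
#include <cstdio>

static void
__kmp_tree_shape_sketch(int nproc, int branch_bits)
{
    int branch_factor = 1 << branch_bits;
    for (int tid=0; tid<nproc; ++tid) {
        int parent = (tid == 0) ? -1 : ((tid - 1) >> branch_bits);
        int first_child = (tid << branch_bits) + 1;
        int last_child = first_child + branch_factor;      // exclusive bound
        if (last_child > nproc) last_child = nproc;
        if (first_child < nproc)
            printf("tid %d: parent %d, waits for children [%d, %d)\n", tid, parent, first_child, last_child);
        else
            printf("tid %d: parent %d, leaf\n", tid, parent);
    }
}
// e.g. __kmp_tree_shape_sketch(10, 2): tid 0 waits for 1..4, tid 1 for 5..8, tid 2 for 9.
#endif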
311
312static void
313__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
314 int propagate_icvs
315 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
316{
317 KMP_TIME_BLOCK(KMP_tree_release);
318 register kmp_team_t *team;
319 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
320 register kmp_uint32 nproc;
321 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
322 register kmp_uint32 branch_factor = 1 << branch_bits;
323 register kmp_uint32 child;
324 register kmp_uint32 child_tid;
325
326 // Perform a tree release for all of the threads that have been gathered
327 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
328 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
329 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
330 // Wait for parent thread to release us
331 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
332 flag.wait(this_thr, TRUE
333 USE_ITT_BUILD_ARG(itt_sync_obj) );
334#if USE_ITT_BUILD && USE_ITT_NOTIFY
335 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
336 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
337 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
338 // Cancel wait on previous parallel region...
339 __kmp_itt_task_starting(itt_sync_obj);
340
341 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
342 return;
343
344 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
345 if (itt_sync_obj != NULL)
346 // Call prepare as early as possible for "new" barrier
347 __kmp_itt_task_finished(itt_sync_obj);
348 } else
349#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
350 // Early exit for reaping threads releasing forkjoin barrier
351 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
352 return;
353
354 // The worker thread may now assume that the team is valid.
355 team = __kmp_threads[gtid]->th.th_team;
356 KMP_DEBUG_ASSERT(team != NULL);
357 tid = __kmp_tid_from_gtid(gtid);
358
359 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
360 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
361 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
362 KMP_MB(); // Flush all pending memory write invalidates.
363 } else {
364 team = __kmp_threads[gtid]->th.th_team;
365 KMP_DEBUG_ASSERT(team != NULL);
366 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
367 gtid, team->t.t_id, tid, bt));
368 }
369 nproc = this_thr->th.th_team_nproc;
370 child_tid = (tid << branch_bits) + 1;
371
372 if (child_tid < nproc) {
373 register kmp_info_t **other_threads = team->t.t_threads;
374 child = 1;
375 // Parent threads release all their children
376 do {
377 register kmp_info_t *child_thr = other_threads[child_tid];
378 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
379#if KMP_CACHE_MANAGE
380 // Prefetch next thread's go count
381 if (child+1 <= branch_factor && child_tid+1 < nproc)
382 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
383#endif /* KMP_CACHE_MANAGE */
384
385#if KMP_BARRIER_ICV_PUSH
386 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
387 if (propagate_icvs) {
388 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
389 team, child_tid, FALSE);
390 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
391 &team->t.t_implicit_task_taskdata[0].td_icvs);
392 }
393 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
394#endif // KMP_BARRIER_ICV_PUSH
395 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
396                   " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
397 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
398 child_tid, &child_bar->b_go, child_bar->b_go,
399 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
400 // Release child from barrier
401 kmp_flag_64 flag(&child_bar->b_go, child_thr);
402 flag.release();
403 child++;
404 child_tid++;
405 }
406 while (child <= branch_factor && child_tid < nproc);
407 }
408 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
409 gtid, team->t.t_id, tid, bt));
410}
411
412
413// Hyper Barrier
414static void
415__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
416 void (*reduce)(void *, void *)
417 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
418{
419 KMP_TIME_BLOCK(KMP_hyper_gather);
420 register kmp_team_t *team = this_thr->th.th_team;
421 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
422 register kmp_info_t **other_threads = team->t.t_threads;
423 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
424 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
425 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
426 register kmp_uint32 branch_factor = 1 << branch_bits;
427 register kmp_uint32 offset;
428 register kmp_uint32 level;
429
430 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
431 gtid, team->t.t_id, tid, bt));
432
433 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
434
435#if USE_ITT_BUILD && USE_ITT_NOTIFY
436 // Barrier imbalance - save arrive time to the thread
437 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
438 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
439 }
440#endif
441 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
442 arrived, and reduce any required data as we go. */
443 kmp_flag_64 p_flag(&thr_bar->b_arrived);
444 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
445 {
446 register kmp_uint32 child;
447 register kmp_uint32 child_tid;
448
449 if (((tid >> level) & (branch_factor - 1)) != 0) {
450 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
451
452 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
453 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
454 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
455 &thr_bar->b_arrived, thr_bar->b_arrived,
456 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
457 // Mark arrival to parent thread
458 /* After performing this write (in the last iteration of the enclosing for loop),
459 a worker thread may not assume that the team is valid any more - it could be
460 deallocated by the master thread at any time. */
461 p_flag.set_waiter(other_threads[parent_tid]);
462 p_flag.release();
463 break;
464 }
465
466 // Parent threads wait for children to arrive
467 if (new_state == KMP_BARRIER_UNUSED_STATE)
468 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
469 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
470 child++, child_tid+=(1 << level))
471 {
472 register kmp_info_t *child_thr = other_threads[child_tid];
473 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474#if KMP_CACHE_MANAGE
475 register kmp_uint32 next_child_tid = child_tid + (1 << level);
476 // Prefetch next thread's arrived count
477 if (child+1 < branch_factor && next_child_tid < num_threads)
478 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
479#endif /* KMP_CACHE_MANAGE */
480 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
481 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
482 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
483 &child_bar->b_arrived, new_state));
484 // Wait for child to arrive
485 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
486 c_flag.wait(this_thr, FALSE
487 USE_ITT_BUILD_ARG(itt_sync_obj) );
488#if USE_ITT_BUILD && USE_ITT_NOTIFY
489 // Barrier imbalance - write min of the thread time and a child time to the thread.
490 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
491 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
492 child_thr->th.th_bar_min_time);
493 }
494#endif
495 if (reduce) {
496 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
497 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
498 team->t.t_id, child_tid));
499 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
500 }
501 }
502 }
503
504 if (KMP_MASTER_TID(tid)) {
505 // Need to update the team arrived pointer if we are the master thread
506 if (new_state == KMP_BARRIER_UNUSED_STATE)
507 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
508 else
509 team->t.t_bar[bt].b_arrived = new_state;
510 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
511 gtid, team->t.t_id, tid, team->t.t_id,
512 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
513 }
514 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
515 gtid, team->t.t_id, tid, bt));
516}
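// Illustrative sketch (not compiled into the runtime): the hypercube pairing used by the
// gather above. At each level, a thread whose low (branch_factor-1) bits are non-zero signals
// parent tid & ~((1<<(level+branch_bits))-1) and is done; otherwise it waits for children at
// tid + k*(1<<level). branch_bits == 1 gives the classic pairwise hypercube.
#if 0
#include <cstdio>

static void
__kmp_hyper_shape_sketch(int num_threads, int branch_bits)
{
    int branch_factor = 1 << branch_bits;
    for (int tid=0; tid<num_threads; ++tid) {
        for (int level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) {
            if (((tid >> level) & (branch_factor - 1)) != 0) {
                printf("level %d: tid %d signals parent %d\n", level, tid,
                       tid & ~((1 << (level + branch_bits)) - 1));
                break;  // after signalling its parent, a thread's gather phase is over
            }
        }
    }
}
#endif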
517
518// The reverse versions seem to beat the forward versions overall
519#define KMP_REVERSE_HYPER_BAR
520static void
521__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
522 int propagate_icvs
523 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
524{
525 KMP_TIME_BLOCK(KMP_hyper_release);
526 register kmp_team_t *team;
527 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
528 register kmp_info_t **other_threads;
529 register kmp_uint32 num_threads;
530 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
531 register kmp_uint32 branch_factor = 1 << branch_bits;
532 register kmp_uint32 child;
533 register kmp_uint32 child_tid;
534 register kmp_uint32 offset;
535 register kmp_uint32 level;
536
537 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
538 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
539 order of the corresponding gather, otherwise threads are released in the same order. */
540 if (KMP_MASTER_TID(tid)) { // master
541 team = __kmp_threads[gtid]->th.th_team;
542 KMP_DEBUG_ASSERT(team != NULL);
543 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
544 gtid, team->t.t_id, tid, bt));
545#if KMP_BARRIER_ICV_PUSH
546 if (propagate_icvs) { // master already has ICVs in final destination; copy
547 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
548 }
549#endif
550 }
551 else { // Handle fork barrier workers who aren't part of a team yet
552 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
553 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
554 // Wait for parent thread to release us
555 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
556 flag.wait(this_thr, TRUE
557 USE_ITT_BUILD_ARG(itt_sync_obj) );
558#if USE_ITT_BUILD && USE_ITT_NOTIFY
559 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
560 // In fork barrier where we could not get the object reliably
561 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
562 // Cancel wait on previous parallel region...
563 __kmp_itt_task_starting(itt_sync_obj);
564
565 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
566 return;
567
568 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
569 if (itt_sync_obj != NULL)
570 // Call prepare as early as possible for "new" barrier
571 __kmp_itt_task_finished(itt_sync_obj);
572 } else
573#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
574 // Early exit for reaping threads releasing forkjoin barrier
575 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
576 return;
577
578 // The worker thread may now assume that the team is valid.
579 team = __kmp_threads[gtid]->th.th_team;
580 KMP_DEBUG_ASSERT(team != NULL);
581 tid = __kmp_tid_from_gtid(gtid);
582
583 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
584 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
585 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
586 KMP_MB(); // Flush all pending memory write invalidates.
587 }
588 num_threads = this_thr->th.th_team_nproc;
589 other_threads = team->t.t_threads;
590
591#ifdef KMP_REVERSE_HYPER_BAR
592 // Count up to correct level for parent
593 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
594 level+=branch_bits, offset<<=branch_bits);
595
596 // Now go down from there
597 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
598 level-=branch_bits, offset>>=branch_bits)
599#else
600 // Go down the tree, level by level
601 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
602#endif // KMP_REVERSE_HYPER_BAR
603 {
604#ifdef KMP_REVERSE_HYPER_BAR
605 /* Now go in reverse order through the children, highest to lowest.
606 Initial setting of child is conservative here. */
607 child = num_threads >> ((level==0)?level:level-1);
608 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
609 child>=1; child--, child_tid-=(1<<level))
610#else
611 if (((tid >> level) & (branch_factor - 1)) != 0)
612 // No need to go lower than this, since this is the level parent would be notified
613 break;
614 // Iterate through children on this level of the tree
615 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
616 child++, child_tid+=(1<<level))
617#endif // KMP_REVERSE_HYPER_BAR
618 {
619 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
620 else {
621 register kmp_info_t *child_thr = other_threads[child_tid];
622 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
623#if KMP_CACHE_MANAGE
624 register kmp_uint32 next_child_tid = child_tid - (1 << level);
625 // Prefetch next thread's go count
626# ifdef KMP_REVERSE_HYPER_BAR
627 if (child-1 >= 1 && next_child_tid < num_threads)
628# else
629 if (child+1 < branch_factor && next_child_tid < num_threads)
630# endif // KMP_REVERSE_HYPER_BAR
631 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
632#endif /* KMP_CACHE_MANAGE */
633
634#if KMP_BARRIER_ICV_PUSH
635 if (propagate_icvs) // push my fixed ICVs to my child
636 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
637#endif // KMP_BARRIER_ICV_PUSH
638
639 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
640                   " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
641 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
642 child_tid, &child_bar->b_go, child_bar->b_go,
643 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
644 // Release child from barrier
645 kmp_flag_64 flag(&child_bar->b_go, child_thr);
646 flag.release();
647 }
648 }
649 }
650#if KMP_BARRIER_ICV_PUSH
651 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
652 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
653 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
654 }
655#endif
656 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
657 gtid, team->t.t_id, tid, bt));
658}
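// Illustrative sketch (not compiled into the runtime): which children a thread releases, and
// in what order, under KMP_REVERSE_HYPER_BAR. It mirrors the level/offset arithmetic of the
// release loop above: climb to the level this thread was released at, then walk back down,
// visiting children from the highest tid to the lowest (the reverse of the gather order).
#if 0
#include <cstdio>

static void
__kmp_hyper_release_order_sketch(int num_threads, int branch_bits, int tid)
{
    int branch_factor = 1 << branch_bits;
    int level, offset;
    for (level=0, offset=1;
         offset<num_threads && ((tid >> level) & (branch_factor - 1)) == 0;
         level+=branch_bits, offset<<=branch_bits);
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits) {
        int child = num_threads >> ((level == 0) ? level : level - 1);
        if (child > branch_factor - 1) child = branch_factor - 1;
        for (int child_tid=tid+(child<<level); child>=1; child--, child_tid-=(1<<level)) {
            if (child_tid < num_threads)
                printf("tid %d releases tid %d (level %d)\n", tid, child_tid, level);
        }
    }
}
#endif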
659
660// Hierarchical Barrier
661
662// Initialize thread barrier data
663/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
664 minimum amount of initialization required based on how the team has changed. Returns true if
665 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
666 team size increases, threads already in the team will respond to on-core wakeup on their parent
667    thread, but threads newly added to the team will only be listening on their local b_go. */
668static bool
669__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
670 int gtid, int tid, kmp_team_t *team)
671{
672 // Checks to determine if (re-)initialization is needed
673 bool uninitialized = thr_bar->team == NULL;
674 bool team_changed = team != thr_bar->team;
675 bool team_sz_changed = nproc != thr_bar->nproc;
676 bool tid_changed = tid != thr_bar->old_tid;
677 bool retval = false;
678
679 if (uninitialized || team_sz_changed) {
680 __kmp_get_hierarchy(nproc, thr_bar);
681 }
682
683 if (uninitialized || team_sz_changed || tid_changed) {
684 thr_bar->my_level = thr_bar->depth-1; // default for master
685 thr_bar->parent_tid = -1; // default for master
686 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
687 kmp_uint32 d=0;
688 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
689 kmp_uint32 rem;
690 if (d == thr_bar->depth-2) { // reached level right below the master
691 thr_bar->parent_tid = 0;
692 thr_bar->my_level = d;
693 break;
694 }
695 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
696 // thread is not a subtree root at next level, so this is max
697 thr_bar->parent_tid = tid - rem;
698 thr_bar->my_level = d;
699 break;
700 }
701 ++d;
702 }
703 }
704 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
705 thr_bar->old_tid = tid;
706 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
707 }
708 if (uninitialized || team_changed || tid_changed) {
709 thr_bar->team = team;
710 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711 retval = true;
712 }
713 if (uninitialized || team_sz_changed || tid_changed) {
714 thr_bar->nproc = nproc;
715 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
716 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
717 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
718 thr_bar->leaf_kids = nproc - tid - 1;
719 thr_bar->leaf_state = 0;
720 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
721 }
722 return retval;
723}
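// Illustrative sketch (not compiled into the runtime): how the hierarchical barrier packs leaf
// check-ins into a single 64-bit flag word. Leaf k under a parent (k = tid-parent_tid-1) owns
// byte 7-k, and leaf_state is the word with all owned bytes preset to 1, which is what the
// parent waits for. The raw byte write below is a simplified stand-in for what the
// kmp_flag_oncore release path does; it is shown only to make the index math above concrete.
#if 0
#include <cstdint>

static uint64_t
__kmp_leaf_state_sketch(int leaf_kids)
{   // mirrors the loop at the end of __kmp_init_hierarchical_barrier_thread()
    uint64_t mask = 0;
    for (int i=0; i<leaf_kids; ++i)
        ((char *)&mask)[7-i] = 1;
    return mask;
}

static void
__kmp_leaf_checkin_sketch(volatile uint64_t *parent_b_arrived, int tid, int parent_tid)
{
    int offset = 7 - (tid - parent_tid - 1);            // same formula as thr_bar->offset
    ((volatile char *)parent_b_arrived)[offset] = 1;    // set this leaf's private byte
}
#endif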
724
725static void
726__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
727 int gtid, int tid, void (*reduce) (void *, void *)
728 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
729{
730 KMP_TIME_BLOCK(KMP_hier_gather);
731 register kmp_team_t *team = this_thr->th.th_team;
732 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
733 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
734 register kmp_info_t **other_threads = team->t.t_threads;
735 register kmp_uint64 new_state;
736
737 int level = team->t.t_level;
738 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
739 if (this_thr->th.th_teams_size.nteams > 1)
740 ++level; // level was not increased in teams construct for team_of_masters
741 if (level == 1) thr_bar->use_oncore_barrier = 1;
742 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
743
744 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
745 gtid, team->t.t_id, tid, bt));
746 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
747
748 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
749
750 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
751 register kmp_int32 child_tid;
752 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
753 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
754 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
755 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
756 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
757 flag.wait(this_thr, FALSE
758 USE_ITT_BUILD_ARG(itt_sync_obj) );
759 if (reduce) {
760 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
761 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
762 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
763 team->t.t_id, child_tid));
764 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
765 }
766 }
767 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
768 }
769 // Next, wait for higher level children on each child's b_arrived flag
770 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
771 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
772 if (last > nproc) last = nproc;
773 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
774 register kmp_info_t *child_thr = other_threads[child_tid];
775 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
776 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
777 "arrived(%p) == %u\n",
778 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
779 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
780 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
781 flag.wait(this_thr, FALSE
782 USE_ITT_BUILD_ARG(itt_sync_obj) );
783 if (reduce) {
784 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
785 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
786 team->t.t_id, child_tid));
787 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
788 }
789 }
790 }
791 }
792 else { // Blocktime is not infinite
793 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
794 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
795 if (last > nproc) last = nproc;
796 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
797 register kmp_info_t *child_thr = other_threads[child_tid];
798 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
799 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
800 "arrived(%p) == %u\n",
801 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
802 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
803 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
804 flag.wait(this_thr, FALSE
805 USE_ITT_BUILD_ARG(itt_sync_obj) );
806 if (reduce) {
807 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
808 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
809 team->t.t_id, child_tid));
810 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
811 }
812 }
813 }
814 }
815 }
816 // All subordinates are gathered; now release parent if not master thread
817
818 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
819 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
820 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
821 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
822 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
823 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
824 the team is valid any more - it could be deallocated by the master thread at any time. */
825 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
826 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
827 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
828 flag.release();
829 }
830 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
831 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
832 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
833 flag.set_waiter(other_threads[thr_bar->parent_tid]);
834 flag.release();
835 }
836 } else { // Master thread needs to update the team's b_arrived value
837 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
838 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
839 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
840 }
841 // Is the team access below unsafe or just technically invalid?
842 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
843 gtid, team->t.t_id, tid, bt));
844}
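// Illustrative sketch (not compiled into the runtime): which tids a non-leaf thread waits for
// at each level, mirroring the skip_per_level[] loops in the gather above (the finite-blocktime
// path, which starts at level 0). The skip values come from __kmp_get_hierarchy(); the example
// values in the comment below are hypothetical.
#if 0
#include <cstdio>

static void
__kmp_hier_children_sketch(int tid, int my_level, int nproc, const unsigned *skip_per_level)
{
    for (int d=0; d<my_level; ++d) {
        unsigned skip = skip_per_level[d];
        unsigned last = tid + skip_per_level[d+1];
        if (last > (unsigned)nproc) last = nproc;
        for (unsigned child_tid=tid+skip; child_tid<last; child_tid+=skip)
            printf("tid %d waits for tid %u at level %d\n", tid, child_tid, d);
    }
}
// e.g. with a hypothetical hierarchy skip_per_level[] = {1, 2, 8}:
// __kmp_hier_children_sketch(0, 2, 16, skip_per_level) -> tid 0 waits for 1 (level 0) and 2, 4, 6 (level 1).
#endif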
845
846static void
847__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
848 int propagate_icvs
849 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
850{
851 KMP_TIME_BLOCK(KMP_hier_release);
852 register kmp_team_t *team;
853 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
854 register kmp_uint32 nproc;
855 bool team_change = false; // indicates on-core barrier shouldn't be used
856
857 if (KMP_MASTER_TID(tid)) {
858 team = __kmp_threads[gtid]->th.th_team;
859 KMP_DEBUG_ASSERT(team != NULL);
860 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
861 gtid, team->t.t_id, tid, bt));
862 }
863 else { // Worker threads
864 // Wait for parent thread to release me
865 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
866 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
867 // Use traditional method of waiting on my own b_go flag
868 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
869 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
870 flag.wait(this_thr, TRUE
871 USE_ITT_BUILD_ARG(itt_sync_obj) );
872 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
873 }
874 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
875 // Wait on my "offset" bits on parent's b_go flag
876 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
877 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
878 bt, this_thr
879 USE_ITT_BUILD_ARG(itt_sync_obj) );
880 flag.wait(this_thr, TRUE
881 USE_ITT_BUILD_ARG(itt_sync_obj) );
882 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
883 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
884 }
885 else { // Reset my bits on parent's b_go flag
886 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
887 }
888 }
889 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
890 // Early exit for reaping threads releasing forkjoin barrier
891 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
892 return;
893 // The worker thread may now assume that the team is valid.
894 team = __kmp_threads[gtid]->th.th_team;
895 KMP_DEBUG_ASSERT(team != NULL);
896 tid = __kmp_tid_from_gtid(gtid);
897
898 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
899 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
900 KMP_MB(); // Flush all pending memory write invalidates.
901 }
902
903 int level = team->t.t_level;
904 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
905 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
906 ++level; // level was not increased in teams construct for team_of_workers
907 if( this_thr->th.th_teams_size.nteams > 1 )
908 ++level; // level was not increased in teams construct for team_of_masters
909 }
910 if (level == 1) thr_bar->use_oncore_barrier = 1;
911 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
912 nproc = this_thr->th.th_team_nproc;
913
914 // If the team size has increased, we still communicate with old leaves via oncore barrier.
915 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
916 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
917 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
918 // But if the entire team changes, we won't use oncore barrier at all
919 if (team_change) old_leaf_kids = 0;
920
921#if KMP_BARRIER_ICV_PUSH
922 if (propagate_icvs) {
923 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
924 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
925 }
926 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
927 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
928 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
929 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
930 &thr_bar->parent_bar->th_fixed_icvs);
931 // non-leaves will get ICVs piggybacked with b_go via NGO store
932 }
933 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
934 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
935 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
936 else // leaves copy parent's fixed ICVs directly to local ICV store
937 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
938 &thr_bar->parent_bar->th_fixed_icvs);
939 }
940 }
941#endif // KMP_BARRIER_ICV_PUSH
942
943 // Now, release my children
944 if (thr_bar->my_level) { // not a leaf
945 register kmp_int32 child_tid;
946 kmp_uint32 last;
947 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
948 if (KMP_MASTER_TID(tid)) { // do a flat release
949             // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
950 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
951 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
952 ngo_load(&thr_bar->th_fixed_icvs);
953 // This loops over all the threads skipping only the leaf nodes in the hierarchy
954 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
955 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
956 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
957 " go(%p): %u => %u\n",
958 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
959 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
960 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
961 // Use ngo store (if available) to both store ICVs and release child via child's b_go
962 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
963 }
964 ngo_sync();
965 }
966 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
967 // Now, release leaf children
968 if (thr_bar->leaf_kids) { // if there are any
969 // We test team_change on the off-chance that the level 1 team changed.
970 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
971 if (old_leaf_kids) { // release old leaf kids
972 thr_bar->b_go |= old_leaf_state;
973 }
974 // Release new leaf kids
975 last = tid+thr_bar->skip_per_level[1];
976 if (last > nproc) last = nproc;
977 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
978 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
979 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
980 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
981 " T#%d(%d:%d) go(%p): %u => %u\n",
982 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
983 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
984 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
985 // Release child using child's b_go flag
986 kmp_flag_64 flag(&child_bar->b_go, child_thr);
987 flag.release();
988 }
989 }
990 else { // Release all children at once with leaf_state bits on my own b_go flag
991 thr_bar->b_go |= thr_bar->leaf_state;
992 }
993 }
994 }
995 else { // Blocktime is not infinite; do a simple hierarchical release
996 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
997 last = tid+thr_bar->skip_per_level[d+1];
998 kmp_uint32 skip = thr_bar->skip_per_level[d];
999 if (last > nproc) last = nproc;
1000 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1001 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1002 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1003 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1004 " go(%p): %u => %u\n",
1005 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1006 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1007 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1008 // Release child using child's b_go flag
1009 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1010 flag.release();
1011 }
1012 }
1013 }
1014#if KMP_BARRIER_ICV_PUSH
1015 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1016 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1017#endif // KMP_BARRIER_ICV_PUSH
1018 }
1019 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1020 gtid, team->t.t_id, tid, bt));
1021}
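// Illustrative sketch (not compiled into the runtime): the wait-path decision a worker makes at
// the top of the release above, condensed into one predicate. When it returns true the worker
// waits on its private byte of the parent's b_go word (kmp_flag_oncore); otherwise it uses the
// traditional wait on its own b_go flag (kmp_flag_64).
#if 0
static bool
__kmp_uses_oncore_wait_sketch(bool use_oncore_barrier, bool blocktime_is_infinite,
                              int my_level, bool barrier_data_initialized)
{
    // oncore wait only for an initialized leaf with infinite blocktime that is not nested
    return use_oncore_barrier && blocktime_is_infinite && my_level == 0 && barrier_data_initialized;
}
#endif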
1022
1023// ---------------------------- End of Barrier Algorithms ----------------------------
1024
1025// Internal function to do a barrier.
1026 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1027    If reduce is non-NULL, perform the reduction as part of the barrier; otherwise, do a barrier with no reduction.
1028 Returns 0 if master thread, 1 if worker thread. */
1029int
1030__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1031 void *reduce_data, void (*reduce)(void *, void *))
1032{
1033 KMP_TIME_BLOCK(KMP_barrier);
1034 register int tid = __kmp_tid_from_gtid(gtid);
1035 register kmp_info_t *this_thr = __kmp_threads[gtid];
1036 register kmp_team_t *team = this_thr->th.th_team;
1037 register int status = 0;
1038 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1039
1040 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1041 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1042
1043 if (! team->t.t_serialized) {
1044#if USE_ITT_BUILD
1045 // This value will be used in itt notify events below.
1046 void *itt_sync_obj = NULL;
1047# if USE_ITT_NOTIFY
1048 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1049 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1050# endif
1051#endif /* USE_ITT_BUILD */
1052 if (__kmp_tasking_mode == tskm_extra_barrier) {
1053 __kmp_tasking_barrier(team, this_thr, gtid);
1054 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1055 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1056 }
1057
1058 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1059 the team struct is not guaranteed to exist. */
1060 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1061 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1062 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1063 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1064 }
1065
1066#if USE_ITT_BUILD
1067 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1068 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1069#endif /* USE_ITT_BUILD */
1070
1071 if (reduce != NULL) {
1072 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1073 this_thr->th.th_local.reduce_data = reduce_data;
1074 }
1075 switch (__kmp_barrier_gather_pattern[bt]) {
1076 case bp_hyper_bar: {
1077 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1078 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1079 USE_ITT_BUILD_ARG(itt_sync_obj) );
1080 break;
1081 }
1082 case bp_hierarchical_bar: {
1083 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1084 USE_ITT_BUILD_ARG(itt_sync_obj));
1085 break;
1086 }
1087 case bp_tree_bar: {
1088 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1089 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1090 USE_ITT_BUILD_ARG(itt_sync_obj) );
1091 break;
1092 }
1093 default: {
1094 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1095 USE_ITT_BUILD_ARG(itt_sync_obj) );
1096 }
1097 }
1098
1099 KMP_MB();
1100
1101 if (KMP_MASTER_TID(tid)) {
1102 status = 0;
1103 if (__kmp_tasking_mode != tskm_immediate_exec) {
1104 __kmp_task_team_wait(this_thr, team
1105 USE_ITT_BUILD_ARG(itt_sync_obj) );
1106 __kmp_task_team_setup(this_thr, team);
1107 }
1108
1109
1110#if USE_ITT_BUILD
1111 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1112 before the final summation into the shared variable is done (final summation can be a
1113 long operation for array reductions). */
1114 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1115 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1116#endif /* USE_ITT_BUILD */
1117#if USE_ITT_BUILD && USE_ITT_NOTIFY
1118 // Barrier - report frame end
1119 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1120 kmp_uint64 cur_time = __itt_get_timestamp();
1121 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1122 int nproc = this_thr->th.th_team_nproc;
1123 int i;
1124 // Initialize with master's wait time
1125 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1126 switch(__kmp_forkjoin_frames_mode) {
1127 case 1:
1128 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1129 this_thr->th.th_frame_time = cur_time;
1130 break;
1131 case 2:
1132 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1133 break;
1134 case 3:
1135 if( __itt_metadata_add_ptr ) {
1136 for (i=1; i<nproc; ++i) {
1137 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1138 }
1139 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1140 }
1141 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1142 this_thr->th.th_frame_time = cur_time;
1143 break;
1144 }
1145 }
1146#endif /* USE_ITT_BUILD */
1147 } else {
1148 status = 1;
1149#if USE_ITT_BUILD
1150 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1151 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1152#endif /* USE_ITT_BUILD */
1153 }
1154 if (status == 1 || ! is_split) {
1155 switch (__kmp_barrier_release_pattern[bt]) {
1156 case bp_hyper_bar: {
1157 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1158 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1159 USE_ITT_BUILD_ARG(itt_sync_obj) );
1160 break;
1161 }
1162 case bp_hierarchical_bar: {
1163 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1164 USE_ITT_BUILD_ARG(itt_sync_obj) );
1165 break;
1166 }
1167 case bp_tree_bar: {
1168 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1169 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1170 USE_ITT_BUILD_ARG(itt_sync_obj) );
1171 break;
1172 }
1173 default: {
1174 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1175 USE_ITT_BUILD_ARG(itt_sync_obj) );
1176 }
1177 }
1178 if (__kmp_tasking_mode != tskm_immediate_exec) {
1179 __kmp_task_team_sync(this_thr, team);
1180 }
1181 }
1182
1183#if USE_ITT_BUILD
1184 /* GEH: TODO: Move this under if-condition above and also include in
1185 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1186 of the threads for split barriers. */
1187 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1188 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1189#endif /* USE_ITT_BUILD */
1190 } else { // Team is serialized.
1191 status = 0;
1192 if (__kmp_tasking_mode != tskm_immediate_exec) {
1193 // The task team should be NULL for serialized code (tasks will be executed immediately)
1194 KMP_DEBUG_ASSERT(team->t.t_task_team == NULL);
1195 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1196 }
1197 }
1198 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1199 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1200 return status;
1201}
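// Illustrative sketch (not compiled into the runtime): how a caller might drive a split barrier
// with the function above. The master (return value 0) comes back right after the gather phase,
// can run code such as finishing a reduction, and completes the barrier with
// __kmp_end_split_barrier(); workers (return value 1) return only once that release happens.
// The reduction entry points are the intended users; exact call sites are omitted here.
#if 0
static void
__kmp_split_barrier_usage_sketch(int gtid, void *reduce_data, void (*reduce_fn)(void *, void *))
{
    int status = __kmp_barrier(bs_plain_barrier, gtid, TRUE /* is_split */,
                               /* reduce_size */ 0, reduce_data, reduce_fn);
    if (status == 0) {
        // Master: every thread has arrived and reduce_fn has been applied during the gather;
        // do any remaining master-only work, then release the team.
        __kmp_end_split_barrier(bs_plain_barrier, gtid);
    }
    // Workers resume here only after the master's release.
}
#endif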
1202
1203
1204void
1205__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1206{
1207 KMP_TIME_BLOCK(KMP_end_split_barrier);
1208 int tid = __kmp_tid_from_gtid(gtid);
1209 kmp_info_t *this_thr = __kmp_threads[gtid];
1210 kmp_team_t *team = this_thr->th.th_team;
1211
1212 if (!team->t.t_serialized) {
1213 if (KMP_MASTER_GTID(gtid)) {
1214 switch (__kmp_barrier_release_pattern[bt]) {
1215 case bp_hyper_bar: {
1216 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1217 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1218 USE_ITT_BUILD_ARG(NULL) );
1219 break;
1220 }
1221 case bp_hierarchical_bar: {
1222 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1223 USE_ITT_BUILD_ARG(NULL));
1224 break;
1225 }
1226 case bp_tree_bar: {
1227 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1228 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1229 USE_ITT_BUILD_ARG(NULL) );
1230 break;
1231 }
1232 default: {
1233 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1234 USE_ITT_BUILD_ARG(NULL) );
1235 }
1236 }
1237 if (__kmp_tasking_mode != tskm_immediate_exec) {
1238 __kmp_task_team_sync(this_thr, team);
1239 } // if
1240 }
1241 }
1242}
1243
1244
1245void
1246__kmp_join_barrier(int gtid)
1247{
1248 KMP_TIME_BLOCK(KMP_join_barrier);
1249 register kmp_info_t *this_thr = __kmp_threads[gtid];
1250 register kmp_team_t *team;
1251 register kmp_uint nproc;
1252 kmp_info_t *master_thread;
1253 int tid;
1254#ifdef KMP_DEBUG
1255 int team_id;
1256#endif /* KMP_DEBUG */
1257#if USE_ITT_BUILD
1258 void *itt_sync_obj = NULL;
1259# if USE_ITT_NOTIFY
1260 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1261 // Get object created at fork_barrier
1262 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1263# endif
1264#endif /* USE_ITT_BUILD */
1265 KMP_MB();
1266
1267 // Get current info
1268 team = this_thr->th.th_team;
1269 nproc = this_thr->th.th_team_nproc;
1270 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1271 tid = __kmp_tid_from_gtid(gtid);
1272#ifdef KMP_DEBUG
1273 team_id = team->t.t_id;
1274#endif /* KMP_DEBUG */
1275 master_thread = this_thr->th.th_team_master;
1276#ifdef KMP_DEBUG
1277 if (master_thread != team->t.t_threads[0]) {
1278 __kmp_print_structure();
1279 }
1280#endif /* KMP_DEBUG */
1281 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1282 KMP_MB();
1283
1284 // Verify state
1285 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1286 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1287 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1288 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1289 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1290
1291 if (__kmp_tasking_mode == tskm_extra_barrier) {
1292 __kmp_tasking_barrier(team, this_thr, gtid);
1293        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1294 }
1295# ifdef KMP_DEBUG
1296 if (__kmp_tasking_mode != tskm_immediate_exec) {
1297 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1298 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team,
1299 this_thr->th.th_task_team));
1300 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team);
1301 }
1302# endif /* KMP_DEBUG */
1303
1304 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1305 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1306 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1307 since the values are not used by __kmp_wait_template() in that case. */
1308 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1309 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1310 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1311 }
1312
1313#if USE_ITT_BUILD
1314 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1315 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1316#endif /* USE_ITT_BUILD */
1317
1318 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1319 case bp_hyper_bar: {
1320 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1321 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1322 USE_ITT_BUILD_ARG(itt_sync_obj) );
1323 break;
1324 }
1325 case bp_hierarchical_bar: {
1326 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1327 USE_ITT_BUILD_ARG(itt_sync_obj) );
1328 break;
1329 }
1330 case bp_tree_bar: {
1331 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1332 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1333 USE_ITT_BUILD_ARG(itt_sync_obj) );
1334 break;
1335 }
1336 default: {
1337 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1338 USE_ITT_BUILD_ARG(itt_sync_obj) );
1339 }
1340 }
1341
1342 /* From this point on, the team data structure may be deallocated at any time by the
1343 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1344 data items that need to be referenced before the end of the barrier should be moved to
1345 the kmp_task_team_t structs. */
1346 if (KMP_MASTER_TID(tid)) {
1347 if (__kmp_tasking_mode != tskm_immediate_exec) {
1348 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1349 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1350 __kmp_task_team_wait(this_thr, team
1351 USE_ITT_BUILD_ARG(itt_sync_obj) );
1352 }
1353#if USE_ITT_BUILD
1354 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1355 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1356#endif /* USE_ITT_BUILD */
1357
1358# if USE_ITT_BUILD && USE_ITT_NOTIFY
1359 // Join barrier - report frame end
1360 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1361 kmp_uint64 cur_time = __itt_get_timestamp();
1362 ident_t * loc = team->t.t_ident;
1363 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1364 int nproc = this_thr->th.th_team_nproc;
1365 int i;
1366 // Initialize with master's wait time
1367 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1368 switch(__kmp_forkjoin_frames_mode) {
1369 case 1:
1370 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1371 break;
1372 case 2:
1373 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1374 break;
1375 case 3:
1376 if( __itt_metadata_add_ptr ) {
1377 for (i=1; i<nproc; ++i) {
1378 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1379 }
1380 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1381 }
1382 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1383 this_thr->th.th_frame_time = cur_time;
1384 break;
1385 }
1386 }
1387# endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1388 }
1389#if USE_ITT_BUILD
1390 else {
1391 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1392 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1393 }
1394#endif /* USE_ITT_BUILD */
1395
1396#if KMP_DEBUG
1397 if (KMP_MASTER_TID(tid)) {
1398 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1399 gtid, team_id, tid, nproc));
1400 }
1401#endif /* KMP_DEBUG */
1402
1403 // TODO now, mark worker threads as done so they may be disbanded
1404 KMP_MB(); // Flush all pending memory write invalidates.
1405 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1406}
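
#if USE_ITT_BUILD && USE_ITT_NOTIFY
/* Illustrative sketch (hypothetical helper, not a libomp routine): the imbalance metric
   submitted in forkjoin frames mode 3 above is just the sum, over every team member, of the
   gap between that thread's barrier arrival time and the moment the master finished the
   gather. The name __example_join_imbalance is an assumption; th_bar_arrive_time and
   kmp_uint64 are the field and type used in __kmp_join_barrier(). */
static kmp_uint64
__example_join_imbalance(kmp_uint64 cur_time, kmp_info_t **other_threads, int nproc)
{
    kmp_uint64 delta = 0;
    for (int i = 0; i < nproc; ++i) {
        // Each term is how long thread i had already been waiting when the gather completed
        delta += cur_time - other_threads[i]->th.th_bar_arrive_time;
    }
    return delta;
}
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */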
1407
1408
1409// TODO release worker threads' fork barriers as we are ready instead of all at once
1410void
1411__kmp_fork_barrier(int gtid, int tid)
1412{
1413 KMP_TIME_BLOCK(KMP_fork_barrier);
1414 kmp_info_t *this_thr = __kmp_threads[gtid];
1415 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1416#if USE_ITT_BUILD
1417 void * itt_sync_obj = NULL;
1418#endif /* USE_ITT_BUILD */
1419
1420 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1421 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1422
1423 // th_team pointer only valid for master thread here
1424 if (KMP_MASTER_TID(tid)) {
1425#if USE_ITT_BUILD && USE_ITT_NOTIFY
1426 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1427 // Create itt barrier object
1428 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1429 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1430 }
1431#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1432
1433#ifdef KMP_DEBUG
1434 register kmp_info_t **other_threads = team->t.t_threads;
1435 register int i;
1436
1437 // Verify state
1438 KMP_MB();
1439
1440 for(i=1; i<team->t.t_nproc; ++i) {
1441 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1442 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1443 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1444 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1445 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1446 & ~(KMP_BARRIER_SLEEP_STATE))
1447 == KMP_INIT_BARRIER_STATE);
1448 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1449 }
1450#endif
1451
1452 if (__kmp_tasking_mode != tskm_immediate_exec) {
1453 __kmp_task_team_setup(this_thr, team);
1454 }
1455
1456 /* The master thread may have changed its blocktime between the join barrier and the
1457 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1458 access it when the team struct is not guaranteed to exist. */
1459 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1460 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1461 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1462 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1463 }
1464 } // master
1465
1466 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1467 case bp_hyper_bar: {
1468 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1469 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1470 USE_ITT_BUILD_ARG(itt_sync_obj) );
1471 break;
1472 }
1473 case bp_hierarchical_bar: {
1474 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1475 USE_ITT_BUILD_ARG(itt_sync_obj) );
1476 break;
1477 }
1478 case bp_tree_bar: {
1479 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1480 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1481 USE_ITT_BUILD_ARG(itt_sync_obj) );
1482 break;
1483 }
1484 default: {
1485 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1486 USE_ITT_BUILD_ARG(itt_sync_obj) );
1487 }
1488 }
1489
1490 // Early exit for threads being reaped, now that the fork/join barrier has been released
1491 if (TCR_4(__kmp_global.g.g_done)) {
1492 if (this_thr->th.th_task_team != NULL) {
1493 if (KMP_MASTER_TID(tid)) {
1494 TCW_PTR(this_thr->th.th_task_team, NULL);
1495 }
1496 else {
1497 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1498 }
1499 }
1500
1501#if USE_ITT_BUILD && USE_ITT_NOTIFY
1502 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1503 if (!KMP_MASTER_TID(tid)) {
1504 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1505 if (itt_sync_obj)
1506 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1507 }
1508 }
1509#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1510 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1511 return;
1512 }
1513
1514 /* We can now assume that a valid team structure has been allocated by the master and
1515 propagated to all worker threads. The current thread, however, may not be part of the
1516 team, so we can't blindly assume that the team pointer is non-null. */
1517 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1518 KMP_DEBUG_ASSERT(team != NULL);
1519 tid = __kmp_tid_from_gtid(gtid);
1520
1521
1522#if KMP_BARRIER_ICV_PULL
1523 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1524 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1525 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1526 the fixed ICVs in the master's thread struct, because it is not always the case that the
1527 threads arrays have been allocated when __kmp_fork_call() is executed. */
1528 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1529 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1530 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1531 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1532 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1533 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1534 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1535 }
1536 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1537#endif // KMP_BARRIER_ICV_PULL
1538
1539 if (__kmp_tasking_mode != tskm_immediate_exec) {
1540 __kmp_task_team_sync(this_thr, team);
1541 }
1542
1543#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1544 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1545 if (proc_bind == proc_bind_intel) {
1546#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001547#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001548 // Call dynamic affinity settings
1549 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1550 __kmp_balanced_affinity(tid, team->t.t_nproc);
1551 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001552#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001553#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1554 }
1555 else if ((proc_bind != proc_bind_false)
1556 && (proc_bind != proc_bind_disabled)) {
1557 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1558 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1559 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1560 }
1561 else {
1562 __kmp_affinity_set_place(gtid);
1563 }
1564 }
1565#endif
1566
1567#if USE_ITT_BUILD && USE_ITT_NOTIFY
1568 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1569 if (!KMP_MASTER_TID(tid)) {
1570 // Get correct barrier object
1571 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1572 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1573 } // (prepare called inside barrier_release)
1574 }
1575#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1576 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1577}
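
/* Illustrative sketch (hypothetical helper, not a libomp routine): both __kmp_join_barrier()
   and __kmp_fork_barrier() cache the blocktime ICVs into the thread struct so that
   __kmp_wait_template() never has to dereference the team struct, which may no longer exist,
   and both skip the copy when blocktime is "infinite" because __kmp_wait_template() does not
   read the cached values in that case. The name __example_cache_blocktime is an assumption;
   the fields it copies are the ones used at those two call sites. */
static inline void
__example_cache_blocktime(kmp_info_t *thr, kmp_team_t *team, int tid)
{
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
        return; // infinite blocktime: the cached values would never be consulted
    thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    thr->th.th_team_bt_set       = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
}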
1578
1579
1580void
1581__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1582{
1583 KMP_TIME_BLOCK(KMP_setup_icv_copy);
1584 int f;
1585
1586 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1587 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1588
1589 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1590 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1591 this data before this function is called. */
1592#if KMP_BARRIER_ICV_PULL
1593 /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which remains untouched),
1594 where all of the worker threads can access them and make their own copies after the barrier. */
1595 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1596 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1597 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1598 0, team->t.t_threads[0], team));
1599#elif KMP_BARRIER_ICV_PUSH
1600 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1601 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1602 0, team->t.t_threads[0], team));
1603#else
1604 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1605 ngo_load(new_icvs);
1606 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1607 for (f=1; f<new_nproc; ++f) { // Skip the master thread
1608 // TODO: GEH - pass in better source location info since usually NULL here
1609 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1610 f, team->t.t_threads[f], team));
1611 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1612 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1613 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1614 f, team->t.t_threads[f], team));
1615 }
1616 ngo_sync();
1617#endif // KMP_BARRIER_ICV_PULL
1618}
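
/* Illustrative sketch (hypothetical, not a libomp routine): stripped of the MIC streaming-store
   macros, the fallback branch of __kmp_setup_icv_copy() above reduces to a plain O(nthreads)
   copy_icvs() loop over the non-master implicit tasks, all performed by the forking thread;
   the pull and push variants instead propagate the ICVs through the fork barrier so that work
   is spread across the team. The name __example_linear_icv_copy is an assumption; the calls
   inside it mirror the branch above. */
static void
__example_linear_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs,
                          ident_t *loc)
{
    for (int f = 1; f < new_nproc; ++f) { // the master keeps its own copy of the ICVs
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    }
}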