1/*
2 * kmp_barrier.cpp
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_wait_release.h"
18#include "kmp_stats.h"
19#include "kmp_itt.h"
20
21#if KMP_MIC
22#include <immintrin.h>
23#define USE_NGO_STORES 1
24#endif // KMP_MIC
25
26#if KMP_MIC && USE_NGO_STORES
27// ICV copying
28#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
29#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
32#else
33#define ngo_load(src) ((void)0)
34#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
35#define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE)
36#define ngo_sync() ((void)0)
37#endif /* KMP_MIC && USE_NGO_STORES */
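// On KMP_MIC these macros move a whole 64-byte cache line of ICVs with 512-bit no-read,
// non-globally-ordered stores (ngo_sync() fences them); b_go can piggyback in the last 8 bytes
// of that line (see the ngo_store_go uses below). On other targets they reduce to
// copy_icvs()/memcpy(), and ngo_load()/ngo_sync() compile away to no-ops.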
38
39void __kmp_print_structure(void); // Forward declaration
40
41// ---------------------------- Barrier Algorithms ----------------------------
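// Each algorithm below is a gather/release pair operating on two per-thread flags: workers
// bump b_arrived by KMP_BARRIER_STATE_BUMP to signal arrival, and the master (or a parent in
// the tree-shaped variants) bumps each worker's b_go to release it. An optional reduce()
// callback folds each child's th_local.reduce_data into the gathering thread's as it arrives.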
42
43// Linear Barrier
44static void
45__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46 void (*reduce)(void *, void *)
47 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48{
49 KMP_TIME_BLOCK(KMP_linear_gather);
50 register kmp_team_t *team = this_thr->th.th_team;
51 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52 register kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55 gtid, team->t.t_id, tid, bt));
56 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57
58#if USE_ITT_BUILD && USE_ITT_NOTIFY
59 // Barrier imbalance - save arrive time to the thread
60 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
61 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62 }
63#endif
64 // We now perform a linear reduction to signal that all of the threads have arrived.
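    // Workers simply flag the master (tid 0); the master spins on each worker's b_arrived in
    // tid order 1..nproc-1, folding reduce_data in as each worker shows up.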
65 if (!KMP_MASTER_TID(tid)) {
 66 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
67 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
68 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70 // Mark arrival to master thread
71 /* After performing this write, a worker thread may not assume that the team is valid
72 any more - it could be deallocated by the master thread at any time. */
73 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74 flag.release();
75 } else {
76 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77 register int nproc = this_thr->th.th_team_nproc;
78 register int i;
79 // Don't have to worry about sleep bit here or atomic since team setting
80 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81
82 // Collect all the worker team member threads.
83 for (i=1; i<nproc; ++i) {
84#if KMP_CACHE_MANAGE
85 // Prefetch next thread's arrived count
86 if (i+1 < nproc)
87 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88#endif /* KMP_CACHE_MANAGE */
89 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
91 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93
94 // Wait for worker thread to arrive
95 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96 flag.wait(this_thr, FALSE
97 USE_ITT_BUILD_ARG(itt_sync_obj) );
98#if USE_ITT_BUILD && USE_ITT_NOTIFY
99 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
100 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
101 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102 other_threads[i]->th.th_bar_min_time);
103 }
104#endif
105 if (reduce) {
106 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108 (*reduce)(this_thr->th.th_local.reduce_data,
109 other_threads[i]->th.th_local.reduce_data);
110 }
111 }
112 // Don't have to worry about sleep bit here or atomic since team setting
113 team_bar->b_arrived = new_state;
114 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
115 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116 }
117 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118 gtid, team->t.t_id, tid, bt));
119}
120
121static void
122__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123 int propagate_icvs
124 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125{
126 KMP_TIME_BLOCK(KMP_linear_release);
127 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128 register kmp_team_t *team;
129
130 if (KMP_MASTER_TID(tid)) {
131 register unsigned int i;
132 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133 register kmp_info_t **other_threads;
134
135 team = __kmp_threads[gtid]->th.th_team;
136 KMP_DEBUG_ASSERT(team != NULL);
137 other_threads = team->t.t_threads;
138
139 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140 gtid, team->t.t_id, tid, bt));
141
142 if (nproc > 1) {
143#if KMP_BARRIER_ICV_PUSH
144 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
145 if (propagate_icvs) {
146 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
147 for (i=1; i<nproc; ++i) {
148 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
149 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
150 &team->t.t_implicit_task_taskdata[0].td_icvs);
151 }
152 ngo_sync();
153 }
154 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
155#endif // KMP_BARRIER_ICV_PUSH
156
157 // Now, release all of the worker threads
158 for (i=1; i<nproc; ++i) {
159#if KMP_CACHE_MANAGE
160 // Prefetch next thread's go flag
161 if (i+1 < nproc)
162 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
163#endif /* KMP_CACHE_MANAGE */
164 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
165 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
166 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
167 &other_threads[i]->th.th_bar[bt].bb.b_go,
168 other_threads[i]->th.th_bar[bt].bb.b_go,
169 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
170 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
171 flag.release();
172 }
173 }
174 } else { // Wait for the MASTER thread to release us
175 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
176 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
177 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
178 flag.wait(this_thr, TRUE
179 USE_ITT_BUILD_ARG(itt_sync_obj) );
180#if USE_ITT_BUILD && USE_ITT_NOTIFY
181 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
182 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
183 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
184 // Cancel wait on previous parallel region...
185 __kmp_itt_task_starting(itt_sync_obj);
186
187 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
188 return;
189
190 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
191 if (itt_sync_obj != NULL)
192 // Call prepare as early as possible for "new" barrier
193 __kmp_itt_task_finished(itt_sync_obj);
194 } else
195#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
196 // Early exit for reaping threads releasing forkjoin barrier
197 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
198 return;
199 // The worker thread may now assume that the team is valid.
200#ifdef KMP_DEBUG
201 tid = __kmp_tid_from_gtid(gtid);
202 team = __kmp_threads[gtid]->th.th_team;
203#endif
204 KMP_DEBUG_ASSERT(team != NULL);
205 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
206 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
207 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
208 KMP_MB(); // Flush all pending memory write invalidates.
209 }
210 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
211 gtid, team->t.t_id, tid, bt));
212}
213
214// Tree barrier
215static void
216__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
217 void (*reduce)(void *, void *)
218 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
219{
220 KMP_TIME_BLOCK(KMP_tree_gather);
221 register kmp_team_t *team = this_thr->th.th_team;
222 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
223 register kmp_info_t **other_threads = team->t.t_threads;
224 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
225 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
226 register kmp_uint32 branch_factor = 1 << branch_bits;
227 register kmp_uint32 child;
228 register kmp_uint32 child_tid;
229 register kmp_uint new_state;
230
231 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
232 gtid, team->t.t_id, tid, bt));
233 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
234
235#if USE_ITT_BUILD && USE_ITT_NOTIFY
236 // Barrier imbalance - save arrive time to the thread
237 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
238 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
239 }
240#endif
241 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
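    // With branch_factor = 1 << branch_bits, thread tid's children are tid*branch_factor+1 ..
    // tid*branch_factor+branch_factor (capped at nproc-1); e.g. branch_bits=2 gives tid 0 the
    // children 1-4 and tid 1 the children 5-8, and a child's parent is (tid-1) >> branch_bits.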
242 child_tid = (tid << branch_bits) + 1;
243 if (child_tid < nproc) {
244 // Parent threads wait for all their children to arrive
245 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
246 child = 1;
247 do {
248 register kmp_info_t *child_thr = other_threads[child_tid];
249 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
250#if KMP_CACHE_MANAGE
251 // Prefetch next thread's arrived count
252 if (child+1 <= branch_factor && child_tid+1 < nproc)
253 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
254#endif /* KMP_CACHE_MANAGE */
255 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
256 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
257 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
258 &child_bar->b_arrived, new_state));
259 // Wait for child to arrive
260 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
261 flag.wait(this_thr, FALSE
262 USE_ITT_BUILD_ARG(itt_sync_obj) );
263#if USE_ITT_BUILD && USE_ITT_NOTIFY
264 // Barrier imbalance - write min of the thread time and a child time to the thread.
265 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
266 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
267 child_thr->th.th_bar_min_time);
268 }
269#endif
270 if (reduce) {
271 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
272 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
273 team->t.t_id, child_tid));
274 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
275 }
276 child++;
277 child_tid++;
278 }
279 while (child <= branch_factor && child_tid < nproc);
280 }
281
282 if (!KMP_MASTER_TID(tid)) { // Worker threads
283 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
284
285 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
286 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
287 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
288 &thr_bar->b_arrived, thr_bar->b_arrived,
289 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
290
291 // Mark arrival to parent thread
292 /* After performing this write, a worker thread may not assume that the team is valid
293 any more - it could be deallocated by the master thread at any time. */
294 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
295 flag.release();
296 } else {
297 // Need to update the team arrived pointer if we are the master thread
298 if (nproc > 1) // New value was already computed above
299 team->t.t_bar[bt].b_arrived = new_state;
300 else
301 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
302 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
303 gtid, team->t.t_id, tid, team->t.t_id,
304 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
305 }
306 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
307 gtid, team->t.t_id, tid, bt));
308}
309
310static void
311__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
312 int propagate_icvs
313 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
314{
315 KMP_TIME_BLOCK(KMP_tree_release);
316 register kmp_team_t *team;
317 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
318 register kmp_uint32 nproc;
319 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
320 register kmp_uint32 branch_factor = 1 << branch_bits;
321 register kmp_uint32 child;
322 register kmp_uint32 child_tid;
323
324 // Perform a tree release for all of the threads that have been gathered
325 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
326 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
327 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
328 // Wait for parent thread to release us
329 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
330 flag.wait(this_thr, TRUE
331 USE_ITT_BUILD_ARG(itt_sync_obj) );
332#if USE_ITT_BUILD && USE_ITT_NOTIFY
333 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
334 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
335 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
336 // Cancel wait on previous parallel region...
337 __kmp_itt_task_starting(itt_sync_obj);
338
339 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
340 return;
341
342 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
343 if (itt_sync_obj != NULL)
344 // Call prepare as early as possible for "new" barrier
345 __kmp_itt_task_finished(itt_sync_obj);
346 } else
347#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
348 // Early exit for reaping threads releasing forkjoin barrier
349 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
350 return;
351
352 // The worker thread may now assume that the team is valid.
353 team = __kmp_threads[gtid]->th.th_team;
354 KMP_DEBUG_ASSERT(team != NULL);
355 tid = __kmp_tid_from_gtid(gtid);
356
357 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
358 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
359 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
360 KMP_MB(); // Flush all pending memory write invalidates.
361 } else {
362 team = __kmp_threads[gtid]->th.th_team;
363 KMP_DEBUG_ASSERT(team != NULL);
364 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
365 gtid, team->t.t_id, tid, bt));
366 }
367 nproc = this_thr->th.th_team_nproc;
368 child_tid = (tid << branch_bits) + 1;
369
370 if (child_tid < nproc) {
371 register kmp_info_t **other_threads = team->t.t_threads;
372 child = 1;
373 // Parent threads release all their children
374 do {
375 register kmp_info_t *child_thr = other_threads[child_tid];
376 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
377#if KMP_CACHE_MANAGE
378 // Prefetch next thread's go count
379 if (child+1 <= branch_factor && child_tid+1 < nproc)
380 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
381#endif /* KMP_CACHE_MANAGE */
382
383#if KMP_BARRIER_ICV_PUSH
384 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
385 if (propagate_icvs) {
386 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
387 team, child_tid, FALSE);
388 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
389 &team->t.t_implicit_task_taskdata[0].td_icvs);
390 }
391 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
392#endif // KMP_BARRIER_ICV_PUSH
 393 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
394 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
395 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
396 child_tid, &child_bar->b_go, child_bar->b_go,
397 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
398 // Release child from barrier
399 kmp_flag_64 flag(&child_bar->b_go, child_thr);
400 flag.release();
401 child++;
402 child_tid++;
403 }
404 while (child <= branch_factor && child_tid < nproc);
405 }
406 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
407 gtid, team->t.t_id, tid, bt));
408}
409
410
411// Hyper Barrier
412static void
413__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
414 void (*reduce)(void *, void *)
415 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
416{
417 KMP_TIME_BLOCK(KMP_hyper_gather);
418 register kmp_team_t *team = this_thr->th.th_team;
419 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
420 register kmp_info_t **other_threads = team->t.t_threads;
421 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
422 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
423 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
424 register kmp_uint32 branch_factor = 1 << branch_bits;
425 register kmp_uint32 offset;
426 register kmp_uint32 level;
427
428 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
429 gtid, team->t.t_id, tid, bt));
430
431 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
432
433#if USE_ITT_BUILD && USE_ITT_NOTIFY
434 // Barrier imbalance - save arrive time to the thread
435 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
436 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
437 }
438#endif
439 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
440 arrived, and reduce any required data as we go. */
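    // At each level a thread whose bits ((tid >> level) & (branch_factor-1)) are nonzero signals
    // the subcube root obtained by clearing those bits and drops out; otherwise it waits for
    // children at tid + k*(1<<level), k = 1..branch_factor-1. E.g. with branch_bits=1 and 8
    // threads the pairings are (0,1) (2,3) (4,5) (6,7), then (0,2) (4,6), then (0,4).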
441 kmp_flag_64 p_flag(&thr_bar->b_arrived);
442 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
443 {
444 register kmp_uint32 child;
445 register kmp_uint32 child_tid;
446
447 if (((tid >> level) & (branch_factor - 1)) != 0) {
448 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
449
450 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
451 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
452 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
453 &thr_bar->b_arrived, thr_bar->b_arrived,
454 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
455 // Mark arrival to parent thread
456 /* After performing this write (in the last iteration of the enclosing for loop),
457 a worker thread may not assume that the team is valid any more - it could be
458 deallocated by the master thread at any time. */
459 p_flag.set_waiter(other_threads[parent_tid]);
460 p_flag.release();
461 break;
462 }
463
464 // Parent threads wait for children to arrive
465 if (new_state == KMP_BARRIER_UNUSED_STATE)
466 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
467 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
468 child++, child_tid+=(1 << level))
469 {
470 register kmp_info_t *child_thr = other_threads[child_tid];
471 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472#if KMP_CACHE_MANAGE
473 register kmp_uint32 next_child_tid = child_tid + (1 << level);
474 // Prefetch next thread's arrived count
475 if (child+1 < branch_factor && next_child_tid < num_threads)
476 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
477#endif /* KMP_CACHE_MANAGE */
478 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
479 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
480 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
481 &child_bar->b_arrived, new_state));
482 // Wait for child to arrive
483 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
484 c_flag.wait(this_thr, FALSE
485 USE_ITT_BUILD_ARG(itt_sync_obj) );
486#if USE_ITT_BUILD && USE_ITT_NOTIFY
487 // Barrier imbalance - write min of the thread time and a child time to the thread.
488 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
489 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
490 child_thr->th.th_bar_min_time);
491 }
492#endif
493 if (reduce) {
494 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
495 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
496 team->t.t_id, child_tid));
497 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
498 }
499 }
500 }
501
502 if (KMP_MASTER_TID(tid)) {
503 // Need to update the team arrived pointer if we are the master thread
504 if (new_state == KMP_BARRIER_UNUSED_STATE)
505 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
506 else
507 team->t.t_bar[bt].b_arrived = new_state;
508 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
509 gtid, team->t.t_id, tid, team->t.t_id,
510 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
511 }
512 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
513 gtid, team->t.t_id, tid, bt));
514}
515
516// The reverse versions seem to beat the forward versions overall
517#define KMP_REVERSE_HYPER_BAR
518static void
519__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
520 int propagate_icvs
521 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
522{
523 KMP_TIME_BLOCK(KMP_hyper_release);
524 register kmp_team_t *team;
525 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
526 register kmp_info_t **other_threads;
527 register kmp_uint32 num_threads;
528 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
529 register kmp_uint32 branch_factor = 1 << branch_bits;
530 register kmp_uint32 child;
531 register kmp_uint32 child_tid;
532 register kmp_uint32 offset;
533 register kmp_uint32 level;
534
535 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
536 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
537 order of the corresponding gather, otherwise threads are released in the same order. */
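    // E.g. with branch_bits=1 and 8 threads under KMP_REVERSE_HYPER_BAR, thread 0 releases 4,
    // then 2, then 1; thread 4 then releases 6 and 5, thread 2 releases 3, and thread 6 releases 7.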
538 if (KMP_MASTER_TID(tid)) { // master
539 team = __kmp_threads[gtid]->th.th_team;
540 KMP_DEBUG_ASSERT(team != NULL);
541 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
542 gtid, team->t.t_id, tid, bt));
543#if KMP_BARRIER_ICV_PUSH
544 if (propagate_icvs) { // master already has ICVs in final destination; copy
545 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
546 }
547#endif
548 }
549 else { // Handle fork barrier workers who aren't part of a team yet
550 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
551 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
552 // Wait for parent thread to release us
553 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
554 flag.wait(this_thr, TRUE
555 USE_ITT_BUILD_ARG(itt_sync_obj) );
556#if USE_ITT_BUILD && USE_ITT_NOTIFY
557 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
558 // In fork barrier where we could not get the object reliably
559 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
560 // Cancel wait on previous parallel region...
561 __kmp_itt_task_starting(itt_sync_obj);
562
563 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
564 return;
565
566 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
567 if (itt_sync_obj != NULL)
568 // Call prepare as early as possible for "new" barrier
569 __kmp_itt_task_finished(itt_sync_obj);
570 } else
571#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
572 // Early exit for reaping threads releasing forkjoin barrier
573 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
574 return;
575
576 // The worker thread may now assume that the team is valid.
577 team = __kmp_threads[gtid]->th.th_team;
578 KMP_DEBUG_ASSERT(team != NULL);
579 tid = __kmp_tid_from_gtid(gtid);
580
581 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
582 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
583 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
584 KMP_MB(); // Flush all pending memory write invalidates.
585 }
586 num_threads = this_thr->th.th_team_nproc;
587 other_threads = team->t.t_threads;
588
589#ifdef KMP_REVERSE_HYPER_BAR
590 // Count up to correct level for parent
591 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
592 level+=branch_bits, offset<<=branch_bits);
593
594 // Now go down from there
595 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
596 level-=branch_bits, offset>>=branch_bits)
597#else
598 // Go down the tree, level by level
599 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
600#endif // KMP_REVERSE_HYPER_BAR
601 {
602#ifdef KMP_REVERSE_HYPER_BAR
603 /* Now go in reverse order through the children, highest to lowest.
604 Initial setting of child is conservative here. */
605 child = num_threads >> ((level==0)?level:level-1);
606 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
607 child>=1; child--, child_tid-=(1<<level))
608#else
609 if (((tid >> level) & (branch_factor - 1)) != 0)
610 // No need to go lower than this, since this is the level parent would be notified
611 break;
612 // Iterate through children on this level of the tree
613 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
614 child++, child_tid+=(1<<level))
615#endif // KMP_REVERSE_HYPER_BAR
616 {
617 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
618 else {
619 register kmp_info_t *child_thr = other_threads[child_tid];
620 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
621#if KMP_CACHE_MANAGE
622 register kmp_uint32 next_child_tid = child_tid - (1 << level);
623 // Prefetch next thread's go count
624# ifdef KMP_REVERSE_HYPER_BAR
625 if (child-1 >= 1 && next_child_tid < num_threads)
626# else
627 if (child+1 < branch_factor && next_child_tid < num_threads)
628# endif // KMP_REVERSE_HYPER_BAR
629 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
630#endif /* KMP_CACHE_MANAGE */
631
632#if KMP_BARRIER_ICV_PUSH
633 if (propagate_icvs) // push my fixed ICVs to my child
634 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
635#endif // KMP_BARRIER_ICV_PUSH
636
 637 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
638 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
639 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
640 child_tid, &child_bar->b_go, child_bar->b_go,
641 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
642 // Release child from barrier
643 kmp_flag_64 flag(&child_bar->b_go, child_thr);
644 flag.release();
645 }
646 }
647 }
648#if KMP_BARRIER_ICV_PUSH
649 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
650 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
651 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
652 }
653#endif
654 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
655 gtid, team->t.t_id, tid, bt));
656}
657
658// Hierarchical Barrier
659
660// Initialize thread barrier data
661/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
662 minimum amount of initialization required based on how the team has changed. Returns true if
663 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
664 team size increases, threads already in the team will respond to on-core wakeup on their parent
 665 thread, but threads newly added to the team will only be listening on their local b_go. */
666static bool
667__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
668 int gtid, int tid, kmp_team_t *team)
669{
670 // Checks to determine if (re-)initialization is needed
671 bool uninitialized = thr_bar->team == NULL;
672 bool team_changed = team != thr_bar->team;
673 bool team_sz_changed = nproc != thr_bar->nproc;
674 bool tid_changed = tid != thr_bar->old_tid;
675 bool retval = false;
676
677 if (uninitialized || team_sz_changed) {
678 __kmp_get_hierarchy(nproc, thr_bar);
679 }
680
681 if (uninitialized || team_sz_changed || tid_changed) {
682 thr_bar->my_level = thr_bar->depth-1; // default for master
683 thr_bar->parent_tid = -1; // default for master
684 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
685 kmp_uint32 d=0;
686 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
687 kmp_uint32 rem;
688 if (d == thr_bar->depth-2) { // reached level right below the master
689 thr_bar->parent_tid = 0;
690 thr_bar->my_level = d;
691 break;
692 }
693 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
694 // thread is not a subtree root at next level, so this is max
695 thr_bar->parent_tid = tid - rem;
696 thr_bar->my_level = d;
697 break;
698 }
699 ++d;
700 }
701 }
702 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
703 thr_bar->old_tid = tid;
704 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
705 }
706 if (uninitialized || team_changed || tid_changed) {
707 thr_bar->team = team;
708 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
709 retval = true;
710 }
711 if (uninitialized || team_sz_changed || tid_changed) {
712 thr_bar->nproc = nproc;
713 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
714 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
715 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
716 thr_bar->leaf_kids = nproc - tid - 1;
717 thr_bar->leaf_state = 0;
718 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
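        // Leaf children check in on individual bytes of the parent's 64-bit flag: the k-th leaf
        // child (tid == parent_tid + k, k >= 1) owns byte 7-(k-1) (its "offset" computed above),
        // and leaf_state is the mask with exactly those bytes set.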
719 }
720 return retval;
721}
722
723static void
724__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
725 int gtid, int tid, void (*reduce) (void *, void *)
726 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
727{
728 KMP_TIME_BLOCK(KMP_hier_gather);
729 register kmp_team_t *team = this_thr->th.th_team;
730 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
731 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
732 register kmp_info_t **other_threads = team->t.t_threads;
733 register kmp_uint64 new_state;
734
735 int level = team->t.t_level;
736 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
737 if (this_thr->th.th_teams_size.nteams > 1)
738 ++level; // level was not increased in teams construct for team_of_masters
739 if (level == 1) thr_bar->use_oncore_barrier = 1;
740 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
741
742 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
743 gtid, team->t.t_id, tid, bt));
744 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
745
746 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
747
748 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
749 register kmp_int32 child_tid;
750 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
751 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
752 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
753 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
754 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
755 flag.wait(this_thr, FALSE
756 USE_ITT_BUILD_ARG(itt_sync_obj) );
757 if (reduce) {
758 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
759 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
760 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
761 team->t.t_id, child_tid));
762 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
763 }
764 }
765 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
766 }
767 // Next, wait for higher level children on each child's b_arrived flag
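            // skip_per_level[d] is the stride between subtree roots at level d, so my level-d
            // children sit at tid+skip, tid+2*skip, ... up to (but not including)
            // tid+skip_per_level[d+1], capped at nproc.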
768 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
769 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
770 if (last > nproc) last = nproc;
771 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
772 register kmp_info_t *child_thr = other_threads[child_tid];
773 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
774 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
775 "arrived(%p) == %u\n",
776 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
777 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
778 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
779 flag.wait(this_thr, FALSE
780 USE_ITT_BUILD_ARG(itt_sync_obj) );
781 if (reduce) {
782 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
783 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
784 team->t.t_id, child_tid));
785 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
786 }
787 }
788 }
789 }
790 else { // Blocktime is not infinite
791 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
792 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
793 if (last > nproc) last = nproc;
794 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
795 register kmp_info_t *child_thr = other_threads[child_tid];
796 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
797 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
798 "arrived(%p) == %u\n",
799 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
800 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
801 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
802 flag.wait(this_thr, FALSE
803 USE_ITT_BUILD_ARG(itt_sync_obj) );
804 if (reduce) {
805 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
806 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
807 team->t.t_id, child_tid));
808 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
809 }
810 }
811 }
812 }
813 }
814 // All subordinates are gathered; now release parent if not master thread
815
816 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
817 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
818 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
819 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
820 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
821 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
822 the team is valid any more - it could be deallocated by the master thread at any time. */
823 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
824 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
825 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
826 flag.release();
827 }
828 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
829 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
830 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
831 flag.set_waiter(other_threads[thr_bar->parent_tid]);
832 flag.release();
833 }
834 } else { // Master thread needs to update the team's b_arrived value
835 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
836 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
837 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
838 }
839 // Is the team access below unsafe or just technically invalid?
840 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
841 gtid, team->t.t_id, tid, bt));
842}
843
844static void
845__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
846 int propagate_icvs
847 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
848{
849 KMP_TIME_BLOCK(KMP_hier_release);
850 register kmp_team_t *team;
851 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
852 register kmp_uint32 nproc;
853 bool team_change = false; // indicates on-core barrier shouldn't be used
854
855 if (KMP_MASTER_TID(tid)) {
856 team = __kmp_threads[gtid]->th.th_team;
857 KMP_DEBUG_ASSERT(team != NULL);
858 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
859 gtid, team->t.t_id, tid, bt));
860 }
861 else { // Worker threads
862 // Wait for parent thread to release me
863 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
864 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
865 // Use traditional method of waiting on my own b_go flag
866 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
867 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
868 flag.wait(this_thr, TRUE
869 USE_ITT_BUILD_ARG(itt_sync_obj) );
870 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
871 }
872 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
873 // Wait on my "offset" bits on parent's b_go flag
874 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
875 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
876 bt, this_thr
877 USE_ITT_BUILD_ARG(itt_sync_obj) );
878 flag.wait(this_thr, TRUE
879 USE_ITT_BUILD_ARG(itt_sync_obj) );
880 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
881 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
882 }
883 else { // Reset my bits on parent's b_go flag
884 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
885 }
886 }
887 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
888 // Early exit for reaping threads releasing forkjoin barrier
889 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
890 return;
891 // The worker thread may now assume that the team is valid.
892 team = __kmp_threads[gtid]->th.th_team;
893 KMP_DEBUG_ASSERT(team != NULL);
894 tid = __kmp_tid_from_gtid(gtid);
895
896 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
897 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
898 KMP_MB(); // Flush all pending memory write invalidates.
899 }
900
901 int level = team->t.t_level;
902 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
903 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
904 ++level; // level was not increased in teams construct for team_of_workers
905 if( this_thr->th.th_teams_size.nteams > 1 )
906 ++level; // level was not increased in teams construct for team_of_masters
907 }
908 if (level == 1) thr_bar->use_oncore_barrier = 1;
909 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
910 nproc = this_thr->th.th_team_nproc;
911
912 // If the team size has increased, we still communicate with old leaves via oncore barrier.
913 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
914 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
915 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
916 // But if the entire team changes, we won't use oncore barrier at all
917 if (team_change) old_leaf_kids = 0;
918
919#if KMP_BARRIER_ICV_PUSH
920 if (propagate_icvs) {
921 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
922 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
923 }
924 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
925 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
926 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
927 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
928 &thr_bar->parent_bar->th_fixed_icvs);
929 // non-leaves will get ICVs piggybacked with b_go via NGO store
930 }
931 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
932 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
933 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
934 else // leaves copy parent's fixed ICVs directly to local ICV store
935 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
936 &thr_bar->parent_bar->th_fixed_icvs);
937 }
938 }
939#endif // KMP_BARRIER_ICV_PUSH
940
941 // Now, release my children
942 if (thr_bar->my_level) { // not a leaf
943 register kmp_int32 child_tid;
944 kmp_uint32 last;
945 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
946 if (KMP_MASTER_TID(tid)) { // do a flat release
947 // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
948 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
949 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
950 ngo_load(&thr_bar->th_fixed_icvs);
951 // This loops over all the threads skipping only the leaf nodes in the hierarchy
952 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
953 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
954 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
955 " go(%p): %u => %u\n",
956 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
957 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
958 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
959 // Use ngo store (if available) to both store ICVs and release child via child's b_go
960 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
961 }
962 ngo_sync();
963 }
964 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
965 // Now, release leaf children
966 if (thr_bar->leaf_kids) { // if there are any
967 // We test team_change on the off-chance that the level 1 team changed.
968 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
969 if (old_leaf_kids) { // release old leaf kids
970 thr_bar->b_go |= old_leaf_state;
971 }
972 // Release new leaf kids
973 last = tid+thr_bar->skip_per_level[1];
974 if (last > nproc) last = nproc;
975 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
976 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
977 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
978 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
979 " T#%d(%d:%d) go(%p): %u => %u\n",
980 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
981 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
982 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
983 // Release child using child's b_go flag
984 kmp_flag_64 flag(&child_bar->b_go, child_thr);
985 flag.release();
986 }
987 }
988 else { // Release all children at once with leaf_state bits on my own b_go flag
989 thr_bar->b_go |= thr_bar->leaf_state;
990 }
991 }
992 }
993 else { // Blocktime is not infinite; do a simple hierarchical release
994 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
995 last = tid+thr_bar->skip_per_level[d+1];
996 kmp_uint32 skip = thr_bar->skip_per_level[d];
997 if (last > nproc) last = nproc;
998 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
999 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1000 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1001 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1002 " go(%p): %u => %u\n",
1003 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1004 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1005 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1006 // Release child using child's b_go flag
1007 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1008 flag.release();
1009 }
1010 }
1011 }
1012#if KMP_BARRIER_ICV_PUSH
1013 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1014 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1015#endif // KMP_BARRIER_ICV_PUSH
1016 }
1017 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1018 gtid, team->t.t_id, tid, bt));
1019}
1020
1021// ---------------------------- End of Barrier Algorithms ----------------------------
1022
1023// Internal function to do a barrier.
1024/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1025 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1026 Returns 0 if master thread, 1 if worker thread. */
1027int
1028__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1029 void *reduce_data, void (*reduce)(void *, void *))
1030{
1031 KMP_TIME_BLOCK(KMP_barrier);
1032 register int tid = __kmp_tid_from_gtid(gtid);
1033 register kmp_info_t *this_thr = __kmp_threads[gtid];
1034 register kmp_team_t *team = this_thr->th.th_team;
1035 register int status = 0;
1036 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1037
1038 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1039 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1040
1041 if (! team->t.t_serialized) {
1042#if USE_ITT_BUILD
1043 // This value will be used in itt notify events below.
1044 void *itt_sync_obj = NULL;
1045# if USE_ITT_NOTIFY
1046 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1047 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1048# endif
1049#endif /* USE_ITT_BUILD */
1050 if (__kmp_tasking_mode == tskm_extra_barrier) {
1051 __kmp_tasking_barrier(team, this_thr, gtid);
1052 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1053 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1054 }
1055
1056 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1057 the team struct is not guaranteed to exist. */
1058 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1059 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1060 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1061 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1062 }
1063
1064#if USE_ITT_BUILD
1065 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1066 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1067#endif /* USE_ITT_BUILD */
1068
1069 if (reduce != NULL) {
1070 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1071 this_thr->th.th_local.reduce_data = reduce_data;
1072 }
1073 switch (__kmp_barrier_gather_pattern[bt]) {
1074 case bp_hyper_bar: {
1075 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1076 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1077 USE_ITT_BUILD_ARG(itt_sync_obj) );
1078 break;
1079 }
1080 case bp_hierarchical_bar: {
1081 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1082 USE_ITT_BUILD_ARG(itt_sync_obj));
1083 break;
1084 }
1085 case bp_tree_bar: {
1086 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1087 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1088 USE_ITT_BUILD_ARG(itt_sync_obj) );
1089 break;
1090 }
1091 default: {
1092 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1093 USE_ITT_BUILD_ARG(itt_sync_obj) );
1094 }
1095 }
1096
1097 KMP_MB();
1098
1099 if (KMP_MASTER_TID(tid)) {
1100 status = 0;
1101 if (__kmp_tasking_mode != tskm_immediate_exec) {
1102 __kmp_task_team_wait(this_thr, team
1103 USE_ITT_BUILD_ARG(itt_sync_obj) );
1104 __kmp_task_team_setup(this_thr, team);
1105 }
1106
1107
1108#if USE_ITT_BUILD
1109 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1110 before the final summation into the shared variable is done (final summation can be a
1111 long operation for array reductions). */
1112 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1113 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1114#endif /* USE_ITT_BUILD */
1115#if USE_ITT_BUILD && USE_ITT_NOTIFY
1116 // Barrier - report frame end
1117 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1118 kmp_uint64 cur_time = __itt_get_timestamp();
1119 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1120 int nproc = this_thr->th.th_team_nproc;
1121 int i;
1122 // Initialize with master's wait time
1123 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1124 switch(__kmp_forkjoin_frames_mode) {
1125 case 1:
1126 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1127 this_thr->th.th_frame_time = cur_time;
1128 break;
1129 case 2:
1130 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1131 break;
1132 case 3:
1133 if( __itt_metadata_add_ptr ) {
1134 for (i=1; i<nproc; ++i) {
1135 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1136 }
1137 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1138 }
1139 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1140 this_thr->th.th_frame_time = cur_time;
1141 break;
1142 }
1143 }
1144#endif /* USE_ITT_BUILD */
1145 } else {
1146 status = 1;
1147#if USE_ITT_BUILD
1148 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1149 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1150#endif /* USE_ITT_BUILD */
1151 }
1152 if (status == 1 || ! is_split) {
1153 switch (__kmp_barrier_release_pattern[bt]) {
1154 case bp_hyper_bar: {
1155 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1156 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1157 USE_ITT_BUILD_ARG(itt_sync_obj) );
1158 break;
1159 }
1160 case bp_hierarchical_bar: {
1161 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1162 USE_ITT_BUILD_ARG(itt_sync_obj) );
1163 break;
1164 }
1165 case bp_tree_bar: {
1166 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1167 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1168 USE_ITT_BUILD_ARG(itt_sync_obj) );
1169 break;
1170 }
1171 default: {
1172 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1173 USE_ITT_BUILD_ARG(itt_sync_obj) );
1174 }
1175 }
1176 if (__kmp_tasking_mode != tskm_immediate_exec) {
1177 __kmp_task_team_sync(this_thr, team);
1178 }
1179 }
1180
1181#if USE_ITT_BUILD
1182 /* GEH: TODO: Move this under if-condition above and also include in
1183 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1184 of the threads for split barriers. */
1185 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1186 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1187#endif /* USE_ITT_BUILD */
1188 } else { // Team is serialized.
1189 status = 0;
1190 if (__kmp_tasking_mode != tskm_immediate_exec) {
1191 // The task team should be NULL for serialized code (tasks will be executed immediately)
1192 KMP_DEBUG_ASSERT(team->t.t_task_team == NULL);
1193 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1194 }
1195 }
1196 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1197 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1198 return status;
1199}
1200
1201
1202void
1203__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1204{
1205 KMP_TIME_BLOCK(KMP_end_split_barrier);
1206 int tid = __kmp_tid_from_gtid(gtid);
1207 kmp_info_t *this_thr = __kmp_threads[gtid];
1208 kmp_team_t *team = this_thr->th.th_team;
1209
1210 if (!team->t.t_serialized) {
1211 if (KMP_MASTER_GTID(gtid)) {
1212 switch (__kmp_barrier_release_pattern[bt]) {
1213 case bp_hyper_bar: {
1214 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1215 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1216 USE_ITT_BUILD_ARG(NULL) );
1217 break;
1218 }
1219 case bp_hierarchical_bar: {
1220 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1221 USE_ITT_BUILD_ARG(NULL));
1222 break;
1223 }
1224 case bp_tree_bar: {
1225 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1226 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1227 USE_ITT_BUILD_ARG(NULL) );
1228 break;
1229 }
1230 default: {
1231 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1232 USE_ITT_BUILD_ARG(NULL) );
1233 }
1234 }
1235 if (__kmp_tasking_mode != tskm_immediate_exec) {
1236 __kmp_task_team_sync(this_thr, team);
1237 } // if
1238 }
1239 }
1240}
1241
1242
1243void
1244__kmp_join_barrier(int gtid)
1245{
1246 KMP_TIME_BLOCK(KMP_join_barrier);
1247 register kmp_info_t *this_thr = __kmp_threads[gtid];
1248 register kmp_team_t *team;
1249 register kmp_uint nproc;
1250 kmp_info_t *master_thread;
1251 int tid;
1252#ifdef KMP_DEBUG
1253 int team_id;
1254#endif /* KMP_DEBUG */
1255#if USE_ITT_BUILD
1256 void *itt_sync_obj = NULL;
1257# if USE_ITT_NOTIFY
1258 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1259 // Get object created at fork_barrier
1260 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1261# endif
1262#endif /* USE_ITT_BUILD */
1263 KMP_MB();
1264
1265 // Get current info
1266 team = this_thr->th.th_team;
1267 nproc = this_thr->th.th_team_nproc;
1268 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1269 tid = __kmp_tid_from_gtid(gtid);
1270#ifdef KMP_DEBUG
1271 team_id = team->t.t_id;
1272#endif /* KMP_DEBUG */
1273 master_thread = this_thr->th.th_team_master;
1274#ifdef KMP_DEBUG
1275 if (master_thread != team->t.t_threads[0]) {
1276 __kmp_print_structure();
1277 }
1278#endif /* KMP_DEBUG */
1279 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1280 KMP_MB();
1281
1282 // Verify state
1283 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1284 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1285 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1286 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1287 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1288
1289 if (__kmp_tasking_mode == tskm_extra_barrier) {
1290 __kmp_tasking_barrier(team, this_thr, gtid);
 1291 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1292 }
1293# ifdef KMP_DEBUG
1294 if (__kmp_tasking_mode != tskm_immediate_exec) {
1295 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1296 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team,
1297 this_thr->th.th_task_team));
1298 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team);
1299 }
1300# endif /* KMP_DEBUG */
1301
1302 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1303 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1304 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1305 since the values are not used by __kmp_wait_template() in that case. */
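    /* Illustrative note (added; values are assumptions, not taken from this file): with the
       default KMP_BLOCKTIME of 200 ms, bt_intervals caches the blocktime expressed as wait
       intervals so a spinning waiter never has to dereference the team struct; with
       KMP_BLOCKTIME=infinite this copy is skipped entirely. */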
1306 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1307 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1308 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1309 }
1310
1311#if USE_ITT_BUILD
1312 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1313 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1314#endif /* USE_ITT_BUILD */
1315
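    /* Dispatch on the gather pattern configured for the fork/join barrier; the hyper and tree
       patterns additionally require nonzero branch bits (asserted below). The pattern is normally
       selected at startup, e.g. via KMP_FORKJOIN_BARRIER_PATTERN="hyper,hyper" (environment
       variable name assumed from the runtime's documented settings). */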
1316 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1317 case bp_hyper_bar: {
1318 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1319 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1320 USE_ITT_BUILD_ARG(itt_sync_obj) );
1321 break;
1322 }
1323 case bp_hierarchical_bar: {
1324 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1325 USE_ITT_BUILD_ARG(itt_sync_obj) );
1326 break;
1327 }
1328 case bp_tree_bar: {
1329 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1330 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1331 USE_ITT_BUILD_ARG(itt_sync_obj) );
1332 break;
1333 }
1334 default: {
1335 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1336 USE_ITT_BUILD_ARG(itt_sync_obj) );
1337 }
1338 }
1339
1340 /* From this point on, the team data structure may be deallocated at any time by the
1341 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1342 data items that need to be referenced before the end of the barrier should be moved to
1343 the kmp_task_team_t structs. */
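    // From this point only the master still dereferences team (to wait on the task team and to
    // submit ITT frame data); workers merely report the barrier-middle phase to ITT below.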
1344 if (KMP_MASTER_TID(tid)) {
1345 if (__kmp_tasking_mode != tskm_immediate_exec) {
1346 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1347 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1348 __kmp_task_team_wait(this_thr, team
1349 USE_ITT_BUILD_ARG(itt_sync_obj) );
1350 }
1351#if USE_ITT_BUILD
1352 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1353 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1354#endif /* USE_ITT_BUILD */
1355
1356# if USE_ITT_BUILD && USE_ITT_NOTIFY
1357 // Join barrier - report frame end
1358 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1359 kmp_uint64 cur_time = __itt_get_timestamp();
1360 ident_t * loc = team->t.t_ident;
1361 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1362 int nproc = this_thr->th.th_team_nproc;
1363 int i;
1364 // Initialize with master's wait time
1365 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1366 switch(__kmp_forkjoin_frames_mode) {
1367 case 1:
1368 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1369 break;
1370 case 2:
1371 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1372 break;
1373 case 3:
1374 if( __itt_metadata_add_ptr ) {
1375 for (i=1; i<nproc; ++i) {
1376 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1377 }
1378 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1379 }
1380 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1381 this_thr->th.th_frame_time = cur_time;
1382 break;
1383 }
1384 }
1385# endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1386 }
1387#if USE_ITT_BUILD
1388 else {
1389 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1390 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1391 }
1392#endif /* USE_ITT_BUILD */
1393
1394#if KMP_DEBUG
1395 if (KMP_MASTER_TID(tid)) {
1396 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1397 gtid, team_id, tid, nproc));
1398 }
1399#endif /* KMP_DEBUG */
1400
1401 // TODO: now mark worker threads as done so they may be disbanded
1402 KMP_MB(); // Flush all pending memory write invalidates.
1403 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1404}
1405
1406
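// Note (added for clarity): this is the release half of the fork/join barrier. The master first
// sets up the task team and, depending on the ICV propagation scheme, the ICV source; then every
// thread runs the configured release pattern. Afterwards workers re-read the team pointer, sync
// with the task team, copy ICVs when pulling, and apply any affinity changes.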
1407// TODO release worker threads' fork barriers as we are ready instead of all at once
1408void
1409__kmp_fork_barrier(int gtid, int tid)
1410{
1411 KMP_TIME_BLOCK(KMP_fork_barrier);
1412 kmp_info_t *this_thr = __kmp_threads[gtid];
1413 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1414#if USE_ITT_BUILD
1415 void * itt_sync_obj = NULL;
1416#endif /* USE_ITT_BUILD */
1417
1418 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1419 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1420
1421 // th_team pointer only valid for master thread here
1422 if (KMP_MASTER_TID(tid)) {
1423#if USE_ITT_BUILD && USE_ITT_NOTIFY
1424 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1425 // Create itt barrier object
1426 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1427 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Report the acquired/releasing ITT phases
1428 }
1429#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1430
1431#ifdef KMP_DEBUG
1432 register kmp_info_t **other_threads = team->t.t_threads;
1433 register int i;
1434
1435 // Verify state
1436 KMP_MB();
1437
1438 for(i=1; i<team->t.t_nproc; ++i) {
1439 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1440 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1441 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1442 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1443 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1444 & ~(KMP_BARRIER_SLEEP_STATE))
1445 == KMP_INIT_BARRIER_STATE);
1446 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1447 }
1448#endif
1449
1450 if (__kmp_tasking_mode != tskm_immediate_exec) {
1451 __kmp_task_team_setup(this_thr, team);
1452 }
1453
1454 /* The master thread may have changed its blocktime between the join barrier and the
1455 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1456 access it when the team struct is not guaranteed to exist. */
1457 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1458 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1459 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1460 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1461 }
1462 } // master
1463
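    /* The TRUE argument below is the propagate_icvs flag of the release routines: during the fork
       barrier the release path can also carry the ICVs out to the workers (only meaningful when
       the runtime is configured to push ICVs through the barrier, i.e. KMP_BARRIER_ICV_PUSH). */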
1464 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1465 case bp_hyper_bar: {
1466 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1467 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1468 USE_ITT_BUILD_ARG(itt_sync_obj) );
1469 break;
1470 }
1471 case bp_hierarchical_bar: {
1472 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1473 USE_ITT_BUILD_ARG(itt_sync_obj) );
1474 break;
1475 }
1476 case bp_tree_bar: {
1477 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1478 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1479 USE_ITT_BUILD_ARG(itt_sync_obj) );
1480 break;
1481 }
1482 default: {
1483 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1484 USE_ITT_BUILD_ARG(itt_sync_obj) );
1485 }
1486 }
1487
1488 // Early exit for reaping threads releasing forkjoin barrier
1489 if (TCR_4(__kmp_global.g.g_done)) {
1490 if (this_thr->th.th_task_team != NULL) {
1491 if (KMP_MASTER_TID(tid)) {
1492 TCW_PTR(this_thr->th.th_task_team, NULL);
1493 }
1494 else {
1495 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1496 }
1497 }
1498
1499#if USE_ITT_BUILD && USE_ITT_NOTIFY
1500 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1501 if (!KMP_MASTER_TID(tid)) {
1502 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1503 if (itt_sync_obj)
1504 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1505 }
1506 }
1507#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1508 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1509 return;
1510 }
1511
1512 /* We can now assume that a valid team structure has been allocated by the master and
1513 propagated to all worker threads. The current thread, however, may not be part of the
1514 team, so we can't blindly assume that the team pointer is non-null. */
1515 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1516 KMP_DEBUG_ASSERT(team != NULL);
1517 tid = __kmp_tid_from_gtid(gtid);
1518
1519
1520#if KMP_BARRIER_ICV_PULL
1521 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1522 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1523 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1524 the fixed ICVs in the master's thread struct, because it is not always the case that the
1525 threads arrays have been allocated when __kmp_fork_call() is executed. */
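    // The master's th_fixed_icvs read here is filled in earlier by __kmp_setup_icv_copy()
    // (defined below in this file).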
1526 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1527 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1528 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1529 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1530 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1531 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1532 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1533 }
1534 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1535#endif // KMP_BARRIER_ICV_PULL
1536
1537 if (__kmp_tasking_mode != tskm_immediate_exec) {
1538 __kmp_task_team_sync(this_thr, team);
1539 }
1540
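    /* Affinity fix-up after the release: under proc_bind_intel, balanced affinity is recomputed
       only when the team size has changed; under the other active proc_bind policies (anything
       other than false/disabled) the thread migrates to its new place if it differs from the
       current one. */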
1541#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1542 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1543 if (proc_bind == proc_bind_intel) {
1544#endif
1545#if KMP_AFFINITY_SUPPORTED
1546 // Call dynamic affinity settings
1547 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1548 __kmp_balanced_affinity(tid, team->t.t_nproc);
1549 }
1550#endif // KMP_AFFINITY_SUPPORTED
1551#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1552 }
1553 else if ((proc_bind != proc_bind_false)
1554 && (proc_bind != proc_bind_disabled)) {
1555 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1556 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1557 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1558 }
1559 else {
1560 __kmp_affinity_set_place(gtid);
1561 }
1562 }
1563#endif
1564
1565#if USE_ITT_BUILD && USE_ITT_NOTIFY
1566 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1567 if (!KMP_MASTER_TID(tid)) {
1568 // Get correct barrier object
1569 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1570 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1571 } // (prepare called inside barrier_release)
1572 }
1573#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1574 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1575}
1576
1577
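// Note (added for clarity): this routine chooses among three ICV propagation strategies at compile
// time: PULL stages the ICVs in the master's th_fixed_icvs for workers to copy after the fork
// barrier, PUSH defers propagation to the fork-barrier release itself, and the default performs a
// linear O(nthreads) copy into each worker's implicit task right here.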
1578void
1579__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1580{
1581 KMP_TIME_BLOCK(KMP_setup_icv_copy);
1582 int f;
1583
1584 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1585 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1586
1587 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1588 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1589 this data before this function is called. */
1590#if KMP_BARRIER_ICV_PULL
1591 /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which itself remains
1592 untouched), where all of the worker threads can access them and make their own copies after the barrier. */
1593 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1594 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1595 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1596 0, team->t.t_threads[0], team));
1597#elif KMP_BARRIER_ICV_PUSH
1598 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1599 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1600 0, team->t.t_threads[0], team));
1601#else
1602 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1603 ngo_load(new_icvs);
1604 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1605 for (f=1; f<new_nproc; ++f) { // Skip the master thread
1606 // TODO: GEH - pass in better source location info since usually NULL here
1607 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1608 f, team->t.t_threads[f], team));
1609 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1610 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1611 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1612 f, team->t.t_threads[f], team));
1613 }
1614 ngo_sync();
1615#endif // KMP_BARRIER_ICV_PULL
1616}