1/*
2 * kmp_barrier.cpp
3 * $Revision: 43473 $
4 * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18#include "kmp.h"
19#include "kmp_wait_release.h"
20#include "kmp_stats.h"
21#include "kmp_itt.h"
22
23#if KMP_MIC
24#include <immintrin.h>
25#define USE_NGO_STORES 1
26#endif // KMP_MIC
27
28#if KMP_MIC && USE_NGO_STORES
29// ICV copying
30#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
31#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34#else
35#define ngo_load(src) ((void)0)
36#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37#define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE)
38#define ngo_sync() ((void)0)
39#endif /* KMP_MIC && USE_NGO_STORES */
40
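// Illustrative sketch (disabled; __example_broadcast_icvs is a hypothetical helper, not part of
// the runtime): the intended usage pattern for the ngo_* macros above.  Load the master's ICV
// cache line once, stream it into each worker's implicit task, then order the non-globally-ordered
// stores.  The real ICV-push paths below (e.g. __kmp_linear_barrier_release) follow this shape.
#if 0
static void
__example_broadcast_icvs(kmp_team_t *team, int nproc)
{
    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);        // master's ICVs -> Vt on MIC, no-op elsewhere
    for (int i=1; i<nproc; ++i)                                    // fan the ICVs out to the workers
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                       &team->t.t_implicit_task_taskdata[0].td_icvs);
    ngo_sync();                                                    // make the NGO stores globally visible
}
#endif
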
41void __kmp_print_structure(void); // Forward declaration
42
43// ---------------------------- Barrier Algorithms ----------------------------
44
45// Linear Barrier
46static void
47__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48 void (*reduce)(void *, void *)
49 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50{
51 KMP_TIME_BLOCK(KMP_linear_gather);
52 register kmp_team_t *team = this_thr->th.th_team;
53 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54 register kmp_info_t **other_threads = team->t.t_threads;
55
56 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57 gtid, team->t.t_id, tid, bt));
58 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60#if USE_ITT_BUILD && USE_ITT_NOTIFY
61 // Barrier imbalance - save arrive time to the thread
62 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
63 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64 }
65#endif
66 // We now perform a linear reduction to signal that all of the threads have arrived.
67 if (!KMP_MASTER_TID(tid)) {
 68 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
69 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
70 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72 // Mark arrival to master thread
73 /* After performing this write, a worker thread may not assume that the team is valid
74 any more - it could be deallocated by the master thread at any time. */
75 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76 flag.release();
77 } else {
78 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79 register int nproc = this_thr->th.th_team_nproc;
80 register int i;
81 // Don't have to worry about sleep bit here or atomic since team setting
82 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83
84 // Collect all the worker team member threads.
85 for (i=1; i<nproc; ++i) {
86#if KMP_CACHE_MANAGE
87 // Prefetch next thread's arrived count
88 if (i+1 < nproc)
89 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90#endif /* KMP_CACHE_MANAGE */
91 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
93 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95
96 // Wait for worker thread to arrive
97 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98 flag.wait(this_thr, FALSE
99 USE_ITT_BUILD_ARG(itt_sync_obj) );
100#if USE_ITT_BUILD && USE_ITT_NOTIFY
101 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
103 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104 other_threads[i]->th.th_bar_min_time);
105 }
106#endif
107 if (reduce) {
108 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110 (*reduce)(this_thr->th.th_local.reduce_data,
111 other_threads[i]->th.th_local.reduce_data);
112 }
113 }
114 // Don't have to worry about sleep bit here or atomic since team setting
115 team_bar->b_arrived = new_state;
116 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
117 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118 }
119 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120 gtid, team->t.t_id, tid, bt));
121}
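// Minimal sketch of the gather protocol above (disabled; plain arrays stand in for the per-thread
// kmp_bstate_t, and a busy-wait stands in for kmp_flag_64::wait, which also handles sleeping and
// ITT notification).  Workers publish arrival by bumping their own b_arrived counter; the master
// spins until every worker's counter reaches the new epoch value.
#if 0
static void
__example_linear_gather(volatile kmp_uint64 *b_arrived, int nproc, int tid, kmp_uint64 new_state)
{
    if (tid != 0) {
        b_arrived[tid] += KMP_BARRIER_STATE_BUMP;    // worker: signal arrival to the master
    } else {
        for (int i=1; i<nproc; ++i)
            while (TCR_8(b_arrived[i]) != new_state) // master: wait for worker i to check in
                KMP_CPU_PAUSE();
    }
}
#endif
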
122
123static void
124__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
125 int propagate_icvs
126 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
127{
128 KMP_TIME_BLOCK(KMP_linear_release);
129 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
130 register kmp_team_t *team;
131
132 if (KMP_MASTER_TID(tid)) {
133 register unsigned int i;
134 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
135 register kmp_info_t **other_threads;
136
137 team = __kmp_threads[gtid]->th.th_team;
138 KMP_DEBUG_ASSERT(team != NULL);
139 other_threads = team->t.t_threads;
140
141 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
142 gtid, team->t.t_id, tid, bt));
143
144 if (nproc > 1) {
145#if KMP_BARRIER_ICV_PUSH
146 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
147 if (propagate_icvs) {
148 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
149 for (i=1; i<nproc; ++i) {
150 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
151 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
152 &team->t.t_implicit_task_taskdata[0].td_icvs);
153 }
154 ngo_sync();
155 }
156 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
157#endif // KMP_BARRIER_ICV_PUSH
158
159 // Now, release all of the worker threads
160 for (i=1; i<nproc; ++i) {
161#if KMP_CACHE_MANAGE
162 // Prefetch next thread's go flag
163 if (i+1 < nproc)
164 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
165#endif /* KMP_CACHE_MANAGE */
166 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
167 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
168 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
169 &other_threads[i]->th.th_bar[bt].bb.b_go,
170 other_threads[i]->th.th_bar[bt].bb.b_go,
171 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
172 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
173 flag.release();
174 }
175 }
176 } else { // Wait for the MASTER thread to release us
177 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
178 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
179 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
180 flag.wait(this_thr, TRUE
181 USE_ITT_BUILD_ARG(itt_sync_obj) );
182#if USE_ITT_BUILD && USE_ITT_NOTIFY
183 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
184 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
185 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
186 // Cancel wait on previous parallel region...
187 __kmp_itt_task_starting(itt_sync_obj);
188
189 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
190 return;
191
192 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
193 if (itt_sync_obj != NULL)
194 // Call prepare as early as possible for "new" barrier
195 __kmp_itt_task_finished(itt_sync_obj);
196 } else
197#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
198 // Early exit for reaping threads releasing forkjoin barrier
199 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
200 return;
201 // The worker thread may now assume that the team is valid.
202#ifdef KMP_DEBUG
203 tid = __kmp_tid_from_gtid(gtid);
204 team = __kmp_threads[gtid]->th.th_team;
205#endif
206 KMP_DEBUG_ASSERT(team != NULL);
207 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
208 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
209 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
210 KMP_MB(); // Flush all pending memory write invalidates.
211 }
212 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
213 gtid, team->t.t_id, tid, bt));
214}
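// Companion sketch for the release side (disabled; same simplifications as the gather sketch).
// The master bumps each worker's b_go flag; each worker waits for its own b_go to be bumped and
// then resets it to KMP_INIT_BARRIER_STATE so the flag is rearmed for the next barrier.
#if 0
static void
__example_linear_release(volatile kmp_uint64 *b_go, int nproc, int tid)
{
    if (tid == 0) {
        for (int i=1; i<nproc; ++i)
            b_go[i] += KMP_BARRIER_STATE_BUMP;        // master: release worker i
    } else {
        while (TCR_8(b_go[tid]) < KMP_BARRIER_STATE_BUMP)
            KMP_CPU_PAUSE();                          // worker: wait to be released
        TCW_8(b_go[tid], KMP_INIT_BARRIER_STATE);     // worker: rearm for the next barrier
    }
}
#endif
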
215
216// Tree barrier
217static void
218__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
219 void (*reduce)(void *, void *)
220 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
221{
222 KMP_TIME_BLOCK(KMP_tree_gather);
223 register kmp_team_t *team = this_thr->th.th_team;
224 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
225 register kmp_info_t **other_threads = team->t.t_threads;
226 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
227 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
228 register kmp_uint32 branch_factor = 1 << branch_bits;
229 register kmp_uint32 child;
230 register kmp_uint32 child_tid;
231 register kmp_uint new_state;
232
233 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
234 gtid, team->t.t_id, tid, bt));
235 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
236
237#if USE_ITT_BUILD && USE_ITT_NOTIFY
238 // Barrier imbalance - save arrive time to the thread
239 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
240 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
241 }
242#endif
243 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
244 child_tid = (tid << branch_bits) + 1;
245 if (child_tid < nproc) {
246 // Parent threads wait for all their children to arrive
247 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
248 child = 1;
249 do {
250 register kmp_info_t *child_thr = other_threads[child_tid];
251 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
252#if KMP_CACHE_MANAGE
253 // Prefetch next thread's arrived count
254 if (child+1 <= branch_factor && child_tid+1 < nproc)
255 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
256#endif /* KMP_CACHE_MANAGE */
257 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
258 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
259 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
260 &child_bar->b_arrived, new_state));
261 // Wait for child to arrive
262 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
263 flag.wait(this_thr, FALSE
264 USE_ITT_BUILD_ARG(itt_sync_obj) );
265#if USE_ITT_BUILD && USE_ITT_NOTIFY
266 // Barrier imbalance - write min of the thread time and a child time to the thread.
267 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
268 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
269 child_thr->th.th_bar_min_time);
270 }
271#endif
272 if (reduce) {
273 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
274 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
275 team->t.t_id, child_tid));
276 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
277 }
278 child++;
279 child_tid++;
280 }
281 while (child <= branch_factor && child_tid < nproc);
282 }
283
284 if (!KMP_MASTER_TID(tid)) { // Worker threads
285 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
286
287 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
288 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
289 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
290 &thr_bar->b_arrived, thr_bar->b_arrived,
291 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
292
293 // Mark arrival to parent thread
294 /* After performing this write, a worker thread may not assume that the team is valid
295 any more - it could be deallocated by the master thread at any time. */
296 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
297 flag.release();
298 } else {
299 // Need to update the team arrived pointer if we are the master thread
300 if (nproc > 1) // New value was already computed above
301 team->t.t_bar[bt].b_arrived = new_state;
302 else
303 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
304 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
305 gtid, team->t.t_id, tid, team->t.t_id,
306 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
307 }
308 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
309 gtid, team->t.t_id, tid, bt));
310}
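// Worked illustration of the implicit tree used above (disabled).  The shape is fixed by the index
// arithmetic alone: first child = (tid << branch_bits) + 1, parent = (tid-1) >> branch_bits.  For a
// hypothetical team with branch_bits==2 (branch_factor==4) and nproc==9, tid 0 gathers from tids
// 1..4, tid 1 gathers from tids 5..8, and tids 2..8 are leaves with no children.
#if 0
static void
__example_tree_neighbors(int tid, int nproc, kmp_uint32 branch_bits)
{
    int branch_factor = 1 << branch_bits;
    int first_child = (tid << branch_bits) + 1;
    for (int c=0; c<branch_factor && first_child+c < nproc; ++c)
        KA_TRACE(20, ("T#%d gathers from child T#%d\n", tid, first_child+c));
    if (tid > 0)
        KA_TRACE(20, ("T#%d reports arrival to parent T#%d\n", tid, (tid-1) >> branch_bits));
}
#endif
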
311
312static void
313__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
314 int propagate_icvs
315 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
316{
317 KMP_TIME_BLOCK(KMP_tree_release);
318 register kmp_team_t *team;
319 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
320 register kmp_uint32 nproc;
321 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
322 register kmp_uint32 branch_factor = 1 << branch_bits;
323 register kmp_uint32 child;
324 register kmp_uint32 child_tid;
325
326 // Perform a tree release for all of the threads that have been gathered
327 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
328 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
329 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
330 // Wait for parent thread to release us
331 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
332 flag.wait(this_thr, TRUE
333 USE_ITT_BUILD_ARG(itt_sync_obj) );
334#if USE_ITT_BUILD && USE_ITT_NOTIFY
335 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
336 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
337 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
338 // Cancel wait on previous parallel region...
339 __kmp_itt_task_starting(itt_sync_obj);
340
341 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
342 return;
343
344 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
345 if (itt_sync_obj != NULL)
346 // Call prepare as early as possible for "new" barrier
347 __kmp_itt_task_finished(itt_sync_obj);
348 } else
349#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
350 // Early exit for reaping threads releasing forkjoin barrier
351 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
352 return;
353
354 // The worker thread may now assume that the team is valid.
355 team = __kmp_threads[gtid]->th.th_team;
356 KMP_DEBUG_ASSERT(team != NULL);
357 tid = __kmp_tid_from_gtid(gtid);
358
359 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
360 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
361 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
362 KMP_MB(); // Flush all pending memory write invalidates.
363 } else {
364 team = __kmp_threads[gtid]->th.th_team;
365 KMP_DEBUG_ASSERT(team != NULL);
366 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
367 gtid, team->t.t_id, tid, bt));
368 }
369 nproc = this_thr->th.th_team_nproc;
370 child_tid = (tid << branch_bits) + 1;
371
372 if (child_tid < nproc) {
373 register kmp_info_t **other_threads = team->t.t_threads;
374 child = 1;
375 // Parent threads release all their children
376 do {
377 register kmp_info_t *child_thr = other_threads[child_tid];
378 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
379#if KMP_CACHE_MANAGE
380 // Prefetch next thread's go count
381 if (child+1 <= branch_factor && child_tid+1 < nproc)
382 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
383#endif /* KMP_CACHE_MANAGE */
384
385#if KMP_BARRIER_ICV_PUSH
386 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
387 if (propagate_icvs) {
388 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
389 team, child_tid, FALSE);
390 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
391 &team->t.t_implicit_task_taskdata[0].td_icvs);
392 }
393 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
394#endif // KMP_BARRIER_ICV_PUSH
 395 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
396 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
397 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
398 child_tid, &child_bar->b_go, child_bar->b_go,
399 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
400 // Release child from barrier
401 kmp_flag_64 flag(&child_bar->b_go, child_thr);
402 flag.release();
403 child++;
404 child_tid++;
405 }
406 while (child <= branch_factor && child_tid < nproc);
407 }
408 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
409 gtid, team->t.t_id, tid, bt));
410}
411
412
413// Hyper Barrier
414static void
415__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
416 void (*reduce)(void *, void *)
417 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
418{
419 KMP_TIME_BLOCK(KMP_hyper_gather);
420 register kmp_team_t *team = this_thr->th.th_team;
421 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
422 register kmp_info_t **other_threads = team->t.t_threads;
423 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
424 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
425 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
426 register kmp_uint32 branch_factor = 1 << branch_bits;
427 register kmp_uint32 offset;
428 register kmp_uint32 level;
429
430 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
431 gtid, team->t.t_id, tid, bt));
432
433 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
434
435#if USE_ITT_BUILD && USE_ITT_NOTIFY
436 // Barrier imbalance - save arrive time to the thread
437 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
438 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
439 }
440#endif
441 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
442 arrived, and reduce any required data as we go. */
443 kmp_flag_64 p_flag(&thr_bar->b_arrived);
444 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
445 {
446 register kmp_uint32 child;
447 register kmp_uint32 child_tid;
448
449 if (((tid >> level) & (branch_factor - 1)) != 0) {
450 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
451
452 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
453 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
454 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
455 &thr_bar->b_arrived, thr_bar->b_arrived,
456 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
457 // Mark arrival to parent thread
458 /* After performing this write (in the last iteration of the enclosing for loop),
459 a worker thread may not assume that the team is valid any more - it could be
460 deallocated by the master thread at any time. */
461 p_flag.set_waiter(other_threads[parent_tid]);
462 p_flag.release();
463 break;
464 }
465
466 // Parent threads wait for children to arrive
467 if (new_state == KMP_BARRIER_UNUSED_STATE)
468 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
469 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
470 child++, child_tid+=(1 << level))
471 {
472 register kmp_info_t *child_thr = other_threads[child_tid];
473 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474#if KMP_CACHE_MANAGE
475 register kmp_uint32 next_child_tid = child_tid + (1 << level);
476 // Prefetch next thread's arrived count
477 if (child+1 < branch_factor && next_child_tid < num_threads)
478 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
479#endif /* KMP_CACHE_MANAGE */
480 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
481 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
482 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
483 &child_bar->b_arrived, new_state));
484 // Wait for child to arrive
485 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
486 c_flag.wait(this_thr, FALSE
487 USE_ITT_BUILD_ARG(itt_sync_obj) );
488#if USE_ITT_BUILD && USE_ITT_NOTIFY
489 // Barrier imbalance - write min of the thread time and a child time to the thread.
490 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
491 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
492 child_thr->th.th_bar_min_time);
493 }
494#endif
495 if (reduce) {
496 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
497 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
498 team->t.t_id, child_tid));
499 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
500 }
501 }
502 }
503
504 if (KMP_MASTER_TID(tid)) {
505 // Need to update the team arrived pointer if we are the master thread
506 if (new_state == KMP_BARRIER_UNUSED_STATE)
507 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
508 else
509 team->t.t_bar[bt].b_arrived = new_state;
510 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
511 gtid, team->t.t_id, tid, team->t.t_id,
512 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
513 }
514 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
515 gtid, team->t.t_id, tid, bt));
516}
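// Illustration of the hypercube-embedded tree walked above (disabled).  At each level a thread
// either drops out by signalling the parent obtained by clearing its low (level + branch_bits)
// bits, or waits for the children that differ from it only in the bits of the current level.
// For a hypothetical team with branch_bits==1 and num_threads==8 the arrivals pair up as:
//   level 0: 1->0, 3->2, 5->4, 7->6    level 1: 2->0, 6->4    level 2: 4->0
#if 0
static void
__example_hyper_gather_partners(kmp_uint32 tid, kmp_uint32 num_threads, kmp_uint32 branch_bits)
{
    kmp_uint32 branch_factor = 1 << branch_bits;
    for (kmp_uint32 level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) {
        if (((tid >> level) & (branch_factor - 1)) != 0) {
            KA_TRACE(20, ("T#%u signals parent T#%u at level %u\n",
                          tid, tid & ~((1 << (level + branch_bits)) - 1), level));
            break;                                    // done once the parent has been signalled
        }
        for (kmp_uint32 child=1; child<branch_factor && tid+(child<<level)<num_threads; ++child)
            KA_TRACE(20, ("T#%u waits for child T#%u at level %u\n", tid, tid+(child<<level), level));
    }
}
#endif
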
517
518// The reverse versions seem to beat the forward versions overall
519#define KMP_REVERSE_HYPER_BAR
520static void
521__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
522 int propagate_icvs
523 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
524{
525 KMP_TIME_BLOCK(KMP_hyper_release);
526 register kmp_team_t *team;
527 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
528 register kmp_info_t **other_threads;
529 register kmp_uint32 num_threads;
530 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
531 register kmp_uint32 branch_factor = 1 << branch_bits;
532 register kmp_uint32 child;
533 register kmp_uint32 child_tid;
534 register kmp_uint32 offset;
535 register kmp_uint32 level;
536
537 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
538 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
539 order of the corresponding gather, otherwise threads are released in the same order. */
540 if (KMP_MASTER_TID(tid)) { // master
541 team = __kmp_threads[gtid]->th.th_team;
542 KMP_DEBUG_ASSERT(team != NULL);
543 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
544 gtid, team->t.t_id, tid, bt));
545#if KMP_BARRIER_ICV_PUSH
546 if (propagate_icvs) { // master already has ICVs in final destination; copy
547 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
548 }
549#endif
550 }
551 else { // Handle fork barrier workers who aren't part of a team yet
552 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
553 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
554 // Wait for parent thread to release us
555 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
556 flag.wait(this_thr, TRUE
557 USE_ITT_BUILD_ARG(itt_sync_obj) );
558#if USE_ITT_BUILD && USE_ITT_NOTIFY
559 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
560 // In fork barrier where we could not get the object reliably
561 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
562 // Cancel wait on previous parallel region...
563 __kmp_itt_task_starting(itt_sync_obj);
564
565 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
566 return;
567
568 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
569 if (itt_sync_obj != NULL)
570 // Call prepare as early as possible for "new" barrier
571 __kmp_itt_task_finished(itt_sync_obj);
572 } else
573#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
574 // Early exit for reaping threads releasing forkjoin barrier
575 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
576 return;
577
578 // The worker thread may now assume that the team is valid.
579 team = __kmp_threads[gtid]->th.th_team;
580 KMP_DEBUG_ASSERT(team != NULL);
581 tid = __kmp_tid_from_gtid(gtid);
582
583 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
584 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
585 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
586 KMP_MB(); // Flush all pending memory write invalidates.
587 }
588 num_threads = this_thr->th.th_team_nproc;
589 other_threads = team->t.t_threads;
590
591#ifdef KMP_REVERSE_HYPER_BAR
592 // Count up to correct level for parent
593 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
594 level+=branch_bits, offset<<=branch_bits);
595
596 // Now go down from there
597 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
598 level-=branch_bits, offset>>=branch_bits)
599#else
600 // Go down the tree, level by level
601 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
602#endif // KMP_REVERSE_HYPER_BAR
603 {
604#ifdef KMP_REVERSE_HYPER_BAR
605 /* Now go in reverse order through the children, highest to lowest.
606 Initial setting of child is conservative here. */
607 child = num_threads >> ((level==0)?level:level-1);
608 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
609 child>=1; child--, child_tid-=(1<<level))
610#else
611 if (((tid >> level) & (branch_factor - 1)) != 0)
 612 // No need to go lower than this, since this is the level at which the parent would be notified
613 break;
614 // Iterate through children on this level of the tree
615 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
616 child++, child_tid+=(1<<level))
617#endif // KMP_REVERSE_HYPER_BAR
618 {
619 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
620 else {
621 register kmp_info_t *child_thr = other_threads[child_tid];
622 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
623#if KMP_CACHE_MANAGE
624 register kmp_uint32 next_child_tid = child_tid - (1 << level);
625 // Prefetch next thread's go count
626# ifdef KMP_REVERSE_HYPER_BAR
627 if (child-1 >= 1 && next_child_tid < num_threads)
628# else
629 if (child+1 < branch_factor && next_child_tid < num_threads)
630# endif // KMP_REVERSE_HYPER_BAR
631 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
632#endif /* KMP_CACHE_MANAGE */
633
634#if KMP_BARRIER_ICV_PUSH
635 if (propagate_icvs) // push my fixed ICVs to my child
636 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
637#endif // KMP_BARRIER_ICV_PUSH
638
 639 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
640 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
641 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
642 child_tid, &child_bar->b_go, child_bar->b_go,
643 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
644 // Release child from barrier
645 kmp_flag_64 flag(&child_bar->b_go, child_thr);
646 flag.release();
647 }
648 }
649 }
650#if KMP_BARRIER_ICV_PUSH
651 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
652 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
653 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
654 }
655#endif
656 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
657 gtid, team->t.t_id, tid, bt));
658}
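// Worked release order (illustrative; branch_bits==1, num_threads==8, KMP_REVERSE_HYPER_BAR
// defined): T#0 releases T#4 (level 2), then T#2 (level 1), then T#1 (level 0); T#4 releases
// T#6 then T#5; T#2 releases T#3; T#6 releases T#7.  This is the exact reverse of the gather
// pairing shown above, matching the comment that the reverse order tends to perform better.
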
659
660// Hierarchical Barrier
661
662// Initialize thread barrier data
663/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
664 minimum amount of initialization required based on how the team has changed. Returns true if
665 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
666 team size increases, threads already in the team will respond to on-core wakeup on their parent
 667 thread, but threads newly added to the team will only be listening on their local b_go. */
668static bool
669__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
670 int gtid, int tid, kmp_team_t *team)
671{
672 // Checks to determine if (re-)initialization is needed
673 bool uninitialized = thr_bar->team == NULL;
674 bool team_changed = team != thr_bar->team;
675 bool team_sz_changed = nproc != thr_bar->nproc;
676 bool tid_changed = tid != thr_bar->old_tid;
677 bool retval = false;
678
679 if (uninitialized || team_sz_changed) {
680 __kmp_get_hierarchy(nproc, thr_bar);
681 }
682
683 if (uninitialized || team_sz_changed || tid_changed) {
684 thr_bar->my_level = thr_bar->depth-1; // default for master
685 thr_bar->parent_tid = -1; // default for master
686 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
687 kmp_uint32 d=0;
688 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
689 kmp_uint32 rem;
690 if (d == thr_bar->depth-2) { // reached level right below the master
691 thr_bar->parent_tid = 0;
692 thr_bar->my_level = d;
693 break;
694 }
695 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
 696 // thread is not a subtree root at the next level, so this is its max level
697 thr_bar->parent_tid = tid - rem;
698 thr_bar->my_level = d;
699 break;
700 }
701 ++d;
702 }
703 }
704 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
705 thr_bar->old_tid = tid;
706 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
707 }
708 if (uninitialized || team_changed || tid_changed) {
709 thr_bar->team = team;
710 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711 retval = true;
712 }
713 if (uninitialized || team_sz_changed || tid_changed) {
714 thr_bar->nproc = nproc;
715 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
716 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
717 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
718 thr_bar->leaf_kids = nproc - tid - 1;
719 thr_bar->leaf_state = 0;
720 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
721 }
722 return retval;
723}
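// Worked example (hypothetical numbers; the real shape comes from __kmp_get_hierarchy and the
// machine hierarchy): suppose nproc==16, depth==3, skip_per_level=={1, 4, 16} and
// base_leaf_kids==3.  Then tid 5 finds 5 % skip_per_level[1] == 1, so it is a leaf (my_level==0)
// with parent_tid==4; tid 4 reaches d == depth-2, so my_level==1 and parent_tid==0; tid 0 is the
// root with my_level==depth-1==2.  tid 4's leaf kids are tids 5..7, recorded as set bytes in the
// top of its 64-bit leaf_state, and each leaf's offset selects the byte of the parent's flag it
// will use for the on-core protocol (offset = 7-(tid-parent_tid-1): 7 for tid 5, 6 for tid 6,
// 5 for tid 7).
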
724
725static void
726__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
727 int gtid, int tid, void (*reduce) (void *, void *)
728 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
729{
730 KMP_TIME_BLOCK(KMP_hier_gather);
731 register kmp_team_t *team = this_thr->th.th_team;
732 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
733 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
734 register kmp_info_t **other_threads = team->t.t_threads;
735 register kmp_uint64 new_state;
736
737 if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1;
738 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
739
740 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
741 gtid, team->t.t_id, tid, bt));
742 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
743
744 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
745
746 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
747 register kmp_int32 child_tid;
748 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
749 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
750 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
751 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
752 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
753 flag.wait(this_thr, FALSE
754 USE_ITT_BUILD_ARG(itt_sync_obj) );
755 if (reduce) {
756 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
757 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
758 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
759 team->t.t_id, child_tid));
760 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
761 }
762 }
763 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
764 }
765 // Next, wait for higher level children on each child's b_arrived flag
766 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
767 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
768 if (last > nproc) last = nproc;
769 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
770 register kmp_info_t *child_thr = other_threads[child_tid];
771 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
772 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
773 "arrived(%p) == %u\n",
774 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
775 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
776 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
777 flag.wait(this_thr, FALSE
778 USE_ITT_BUILD_ARG(itt_sync_obj) );
779 if (reduce) {
780 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
781 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
782 team->t.t_id, child_tid));
783 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
784 }
785 }
786 }
787 }
788 else { // Blocktime is not infinite
789 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
790 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
791 if (last > nproc) last = nproc;
792 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
793 register kmp_info_t *child_thr = other_threads[child_tid];
794 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
795 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
796 "arrived(%p) == %u\n",
797 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
798 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
799 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
800 flag.wait(this_thr, FALSE
801 USE_ITT_BUILD_ARG(itt_sync_obj) );
802 if (reduce) {
803 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
804 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
805 team->t.t_id, child_tid));
806 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
807 }
808 }
809 }
810 }
811 }
812 // All subordinates are gathered; now release parent if not master thread
813
814 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
815 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
816 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
817 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
818 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
819 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
820 the team is valid any more - it could be deallocated by the master thread at any time. */
821 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
822 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
823 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
824 flag.release();
825 }
826 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
827 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
828 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
829 flag.set_waiter(other_threads[thr_bar->parent_tid]);
830 flag.release();
831 }
832 } else { // Master thread needs to update the team's b_arrived value
833 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
834 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
835 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
836 }
 837 // If nested, but outer level is top-level, resume use of oncore optimization
838 if (this_thr->th.th_team->t.t_level <=2) thr_bar->use_oncore_barrier = 1;
839 else thr_bar->use_oncore_barrier = 0;
 840 // Is the team access below unsafe or just technically invalid?
841 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
842 gtid, team->t.t_id, tid, bt));
843}
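// How the on-core leaf check-in above packs into one 64-bit flag (illustrative): with three leaf
// kids the parent waits for b_arrived == <arrived epoch> | leaf_state, where leaf_state has bytes
// 7, 6 and 5 set.  Each leaf "arrives" by having kmp_flag_oncore write into its own byte of the
// parent's b_arrived (selected by thr_bar->offset), so a single flag.wait() covers every on-core
// child; the parent then clears the leaf bytes with KMP_TEST_THEN_AND64 before the low bytes are
// reused as the ordinary arrival counter.
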
844
845static void
846__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
847 int propagate_icvs
848 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
849{
850 KMP_TIME_BLOCK(KMP_hier_release);
851 register kmp_team_t *team;
852 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
853 register kmp_uint32 nproc;
854 bool team_change = false; // indicates on-core barrier shouldn't be used
855
856 if (KMP_MASTER_TID(tid)) {
857 team = __kmp_threads[gtid]->th.th_team;
858 KMP_DEBUG_ASSERT(team != NULL);
859 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
860 gtid, team->t.t_id, tid, bt));
861 }
862 else { // Worker threads
863 // Wait for parent thread to release me
864 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
865 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
866 // Use traditional method of waiting on my own b_go flag
867 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
868 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
869 flag.wait(this_thr, TRUE
870 USE_ITT_BUILD_ARG(itt_sync_obj) );
871 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
872 }
873 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
874 // Wait on my "offset" bits on parent's b_go flag
875 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
876 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
877 bt, this_thr
878 USE_ITT_BUILD_ARG(itt_sync_obj) );
879 flag.wait(this_thr, TRUE
880 USE_ITT_BUILD_ARG(itt_sync_obj) );
881 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
882 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
883 }
884 else { // Reset my bits on parent's b_go flag
885 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
886 }
887 }
888 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
889 // Early exit for reaping threads releasing forkjoin barrier
890 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
891 return;
892 // The worker thread may now assume that the team is valid.
893 team = __kmp_threads[gtid]->th.th_team;
894 KMP_DEBUG_ASSERT(team != NULL);
895 tid = __kmp_tid_from_gtid(gtid);
896
897 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
898 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
899 KMP_MB(); // Flush all pending memory write invalidates.
900 }
901
 902 if (this_thr->th.th_team->t.t_level <= 1) thr_bar->use_oncore_barrier = 1;
 903 else thr_bar->use_oncore_barrier = 0;
904 nproc = this_thr->th.th_team_nproc;
905
906 // If the team size has increased, we still communicate with old leaves via oncore barrier.
907 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
908 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
909 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
910 // But if the entire team changes, we won't use oncore barrier at all
911 if (team_change) old_leaf_kids = 0;
912
913#if KMP_BARRIER_ICV_PUSH
914 if (propagate_icvs) {
915 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
916 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
917 }
918 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
919 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
920 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
921 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
922 &thr_bar->parent_bar->th_fixed_icvs);
923 // non-leaves will get ICVs piggybacked with b_go via NGO store
924 }
925 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
926 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
927 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
928 else // leaves copy parent's fixed ICVs directly to local ICV store
929 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
930 &thr_bar->parent_bar->th_fixed_icvs);
931 }
932 }
933#endif // KMP_BARRIER_ICV_PUSH
934
935 // Now, release my children
936 if (thr_bar->my_level) { // not a leaf
937 register kmp_int32 child_tid;
938 kmp_uint32 last;
939 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
940 if (KMP_MASTER_TID(tid)) { // do a flat release
 941 // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
942 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
943 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
944 ngo_load(&thr_bar->th_fixed_icvs);
945 // This loops over all the threads skipping only the leaf nodes in the hierarchy
946 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
947 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
948 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
949 " go(%p): %u => %u\n",
950 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
951 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
952 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
953 // Use ngo store (if available) to both store ICVs and release child via child's b_go
954 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
955 }
956 ngo_sync();
957 }
958 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
959 // Now, release leaf children
960 if (thr_bar->leaf_kids) { // if there are any
961 // We test team_change on the off-chance that the level 1 team changed.
962 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
963 if (old_leaf_kids) { // release old leaf kids
964 thr_bar->b_go |= old_leaf_state;
965 }
966 // Release new leaf kids
967 last = tid+thr_bar->skip_per_level[1];
968 if (last > nproc) last = nproc;
969 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
970 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
971 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
972 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
973 " T#%d(%d:%d) go(%p): %u => %u\n",
974 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
975 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
976 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
977 // Release child using child's b_go flag
978 kmp_flag_64 flag(&child_bar->b_go, child_thr);
979 flag.release();
980 }
981 }
982 else { // Release all children at once with leaf_state bits on my own b_go flag
983 thr_bar->b_go |= thr_bar->leaf_state;
984 }
985 }
986 }
987 else { // Blocktime is not infinite; do a simple hierarchical release
988 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
989 last = tid+thr_bar->skip_per_level[d+1];
990 kmp_uint32 skip = thr_bar->skip_per_level[d];
991 if (last > nproc) last = nproc;
992 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
993 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
994 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
995 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
996 " go(%p): %u => %u\n",
997 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
998 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
999 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1000 // Release child using child's b_go flag
1001 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1002 flag.release();
1003 }
1004 }
1005 }
1006#if KMP_BARRIER_ICV_PUSH
1007 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1008 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1009#endif // KMP_BARRIER_ICV_PUSH
1010 }
1011 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1012 gtid, team->t.t_id, tid, bt));
1013}
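// Release-side counterpart of the on-core layout (illustrative): when blocktime is infinite and
// the on-core barrier is usable, the master's single ngo_store_go() of the cache line holding
// th_fixed_icvs and b_go both publishes the ICVs and bumps the child's b_go, so each non-leaf
// child wakes from one store.  Leaves instead watch their offset byte of the parent's b_go via
// kmp_flag_oncore (see the worker path at the top of this routine), or are woken all at once by
// the parent OR-ing leaf_state into its own b_go.
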
1014
1015// ---------------------------- End of Barrier Algorithms ----------------------------
1016
1017// Internal function to do a barrier.
 1018/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
 1019 If reduce is non-NULL, do a split reduction barrier, otherwise, do a regular split barrier.
 1020 Returns 0 if this is the master thread, 1 if this is a worker thread. */
1021int
1022__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1023 void *reduce_data, void (*reduce)(void *, void *))
1024{
1025 KMP_TIME_BLOCK(KMP_barrier);
1026 register int tid = __kmp_tid_from_gtid(gtid);
1027 register kmp_info_t *this_thr = __kmp_threads[gtid];
1028 register kmp_team_t *team = this_thr->th.th_team;
1029 register int status = 0;
1030 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1031
1032 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1033 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1034
1035 if (! team->t.t_serialized) {
1036#if USE_ITT_BUILD
1037 // This value will be used in itt notify events below.
1038 void *itt_sync_obj = NULL;
1039# if USE_ITT_NOTIFY
1040 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1041 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1042# endif
1043#endif /* USE_ITT_BUILD */
1044 if (__kmp_tasking_mode == tskm_extra_barrier) {
1045 __kmp_tasking_barrier(team, this_thr, gtid);
1046 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1047 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1048 }
1049
1050 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1051 the team struct is not guaranteed to exist. */
1052 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1053 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1054 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1055 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1056 }
1057
1058#if USE_ITT_BUILD
1059 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1060 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1061#endif /* USE_ITT_BUILD */
1062
1063 if (reduce != NULL) {
1064 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1065 this_thr->th.th_local.reduce_data = reduce_data;
1066 }
1067 switch (__kmp_barrier_gather_pattern[bt]) {
1068 case bp_hyper_bar: {
1069 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1070 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1071 USE_ITT_BUILD_ARG(itt_sync_obj) );
1072 break;
1073 }
1074 case bp_hierarchical_bar: {
1075 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1076 USE_ITT_BUILD_ARG(itt_sync_obj));
1077 break;
1078 }
1079 case bp_tree_bar: {
1080 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1081 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1082 USE_ITT_BUILD_ARG(itt_sync_obj) );
1083 break;
1084 }
1085 default: {
1086 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1087 USE_ITT_BUILD_ARG(itt_sync_obj) );
1088 }
1089 }
1090
1091 KMP_MB();
1092
1093 if (KMP_MASTER_TID(tid)) {
1094 status = 0;
1095 if (__kmp_tasking_mode != tskm_immediate_exec) {
1096 __kmp_task_team_wait(this_thr, team
1097 USE_ITT_BUILD_ARG(itt_sync_obj) );
1098 __kmp_task_team_setup(this_thr, team);
1099 }
1100
1101
1102#if USE_ITT_BUILD
1103 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1104 before the final summation into the shared variable is done (final summation can be a
1105 long operation for array reductions). */
1106 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1107 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1108#endif /* USE_ITT_BUILD */
1109#if USE_ITT_BUILD && USE_ITT_NOTIFY
1110 // Barrier - report frame end
1111 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1112 kmp_uint64 cur_time = __itt_get_timestamp();
1113 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1114 int nproc = this_thr->th.th_team_nproc;
1115 int i;
1116 // Initialize with master's wait time
1117 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1118 switch(__kmp_forkjoin_frames_mode) {
1119 case 1:
1120 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1121 this_thr->th.th_frame_time = cur_time;
1122 break;
1123 case 2:
1124 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1125 break;
1126 case 3:
1127 if( __itt_metadata_add_ptr ) {
1128 for (i=1; i<nproc; ++i) {
1129 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1130 }
1131 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1132 }
1133 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1134 this_thr->th.th_frame_time = cur_time;
1135 break;
1136 }
1137 }
1138#endif /* USE_ITT_BUILD */
1139 } else {
1140 status = 1;
1141#if USE_ITT_BUILD
1142 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1143 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1144#endif /* USE_ITT_BUILD */
1145 }
1146 if (status == 1 || ! is_split) {
1147 switch (__kmp_barrier_release_pattern[bt]) {
1148 case bp_hyper_bar: {
1149 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1150 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1151 USE_ITT_BUILD_ARG(itt_sync_obj) );
1152 break;
1153 }
1154 case bp_hierarchical_bar: {
1155 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1156 USE_ITT_BUILD_ARG(itt_sync_obj) );
1157 break;
1158 }
1159 case bp_tree_bar: {
1160 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1161 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1162 USE_ITT_BUILD_ARG(itt_sync_obj) );
1163 break;
1164 }
1165 default: {
1166 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1167 USE_ITT_BUILD_ARG(itt_sync_obj) );
1168 }
1169 }
1170 if (__kmp_tasking_mode != tskm_immediate_exec) {
1171 __kmp_task_team_sync(this_thr, team);
1172 }
1173 }
1174
1175#if USE_ITT_BUILD
1176 /* GEH: TODO: Move this under if-condition above and also include in
1177 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1178 of the threads for split barriers. */
1179 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1180 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1181#endif /* USE_ITT_BUILD */
1182 } else { // Team is serialized.
1183 status = 0;
1184 if (__kmp_tasking_mode != tskm_immediate_exec) {
1185 // The task team should be NULL for serialized code (tasks will be executed immediately)
1186 KMP_DEBUG_ASSERT(team->t.t_task_team == NULL);
1187 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1188 }
1189 }
1190 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1191 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1192 return status;
1193}
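// Usage sketch (disabled; illustrative only, my_reduce_func is a hypothetical callback): the
// split-barrier contract of __kmp_barrier().  With is_split==TRUE the master returns 0 right
// after the gather phase -- with every thread's reduce_data already folded into its own via the
// callback -- finishes the reduction, and then releases the team through
// __kmp_end_split_barrier(); workers block in the release phase and return 1 once released.
#if 0
static void
__example_split_reduction(int gtid, void *reduce_data, void (*my_reduce_func)(void *, void *))
{
    if (__kmp_barrier(bs_reduction_barrier, gtid, TRUE /* is_split */, 0,
                      reduce_data, my_reduce_func) == 0) {
        /* master: reduce_data now holds the combined value; finalize it here ... */
        __kmp_end_split_barrier(bs_reduction_barrier, gtid);
    }
    /* workers (return value 1) resume here only after the master's release */
}
#endif
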
1194
1195
1196void
1197__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1198{
1199 KMP_TIME_BLOCK(KMP_end_split_barrier);
1200 int tid = __kmp_tid_from_gtid(gtid);
1201 kmp_info_t *this_thr = __kmp_threads[gtid];
1202 kmp_team_t *team = this_thr->th.th_team;
1203
1204 if (!team->t.t_serialized) {
1205 if (KMP_MASTER_GTID(gtid)) {
1206 switch (__kmp_barrier_release_pattern[bt]) {
1207 case bp_hyper_bar: {
1208 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1209 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1210 USE_ITT_BUILD_ARG(NULL) );
1211 break;
1212 }
1213 case bp_hierarchical_bar: {
1214 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1215 USE_ITT_BUILD_ARG(NULL));
1216 break;
1217 }
1218 case bp_tree_bar: {
1219 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1220 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1221 USE_ITT_BUILD_ARG(NULL) );
1222 break;
1223 }
1224 default: {
1225 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1226 USE_ITT_BUILD_ARG(NULL) );
1227 }
1228 }
1229 if (__kmp_tasking_mode != tskm_immediate_exec) {
1230 __kmp_task_team_sync(this_thr, team);
1231 } // if
1232 }
1233 }
1234}
1235
1236
1237void
1238__kmp_join_barrier(int gtid)
1239{
1240 KMP_TIME_BLOCK(KMP_join_barrier);
1241 register kmp_info_t *this_thr = __kmp_threads[gtid];
1242 register kmp_team_t *team;
1243 register kmp_uint nproc;
1244 kmp_info_t *master_thread;
1245 int tid;
1246#ifdef KMP_DEBUG
1247 int team_id;
1248#endif /* KMP_DEBUG */
1249#if USE_ITT_BUILD
1250 void *itt_sync_obj = NULL;
1251# if USE_ITT_NOTIFY
1252 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1253 // Get object created at fork_barrier
1254 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1255# endif
1256#endif /* USE_ITT_BUILD */
1257 KMP_MB();
1258
1259 // Get current info
1260 team = this_thr->th.th_team;
1261 nproc = this_thr->th.th_team_nproc;
1262 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1263 tid = __kmp_tid_from_gtid(gtid);
1264#ifdef KMP_DEBUG
1265 team_id = team->t.t_id;
1266#endif /* KMP_DEBUG */
1267 master_thread = this_thr->th.th_team_master;
1268#ifdef KMP_DEBUG
1269 if (master_thread != team->t.t_threads[0]) {
1270 __kmp_print_structure();
1271 }
1272#endif /* KMP_DEBUG */
1273 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1274 KMP_MB();
1275
1276 // Verify state
1277 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1278 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1279 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1280 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1281 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1282
1283 if (__kmp_tasking_mode == tskm_extra_barrier) {
1284 __kmp_tasking_barrier(team, this_thr, gtid);
 1285 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1286 }
1287# ifdef KMP_DEBUG
1288 if (__kmp_tasking_mode != tskm_immediate_exec) {
1289 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1290 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team,
1291 this_thr->th.th_task_team));
1292 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team);
1293 }
1294# endif /* KMP_DEBUG */
1295
1296 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
 1297 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
 1298 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1299 since the values are not used by __kmp_wait_template() in that case. */
1300 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1301 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1302 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1303 }
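    /* Illustrative sketch only (never compiled): with the copies above, the wait code can
       consult the thread's own copy of the blocktime settings instead of dereferencing the
       team struct, which may already have been freed.  The helpers named here
       (my_flag_is_set, spins_elapsed, my_suspend) are hypothetical placeholders, not the
       real wait implementation. */
#if 0
    while (!my_flag_is_set()) {
        if (this_thr->th.th_team_bt_set &&                       // values copied above
            spins_elapsed() > this_thr->th.th_team_bt_intervals)
            my_suspend(gtid);                                     // sleep instead of spinning
    }
#endif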
1304
1305#if USE_ITT_BUILD
1306 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1307 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1308#endif /* USE_ITT_BUILD */
1309
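    /* Gather phase of the join barrier: the algorithm (linear, tree, hyper, or
       hierarchical) is chosen at runtime via __kmp_barrier_gather_pattern; the asserts
       guard the patterns that rely on a nonzero branching factor. */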
1310 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1311 case bp_hyper_bar: {
1312 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1313 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1314 USE_ITT_BUILD_ARG(itt_sync_obj) );
1315 break;
1316 }
1317 case bp_hierarchical_bar: {
1318 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1319 USE_ITT_BUILD_ARG(itt_sync_obj) );
1320 break;
1321 }
1322 case bp_tree_bar: {
1323 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1324 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1325 USE_ITT_BUILD_ARG(itt_sync_obj) );
1326 break;
1327 }
1328 default: {
1329 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1330 USE_ITT_BUILD_ARG(itt_sync_obj) );
1331 }
1332 }
1333
1334 /* From this point on, the team data structure may be deallocated at any time by the
1335 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1336 data items that need to be referenced before the end of the barrier should be moved to
1337 the kmp_task_team_t structs. */
1338 if (KMP_MASTER_TID(tid)) {
1339 if (__kmp_tasking_mode != tskm_immediate_exec) {
1340 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1341 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1342 __kmp_task_team_wait(this_thr, team
1343 USE_ITT_BUILD_ARG(itt_sync_obj) );
1344 }
1345#if USE_ITT_BUILD
1346 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1347 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1348#endif /* USE_ITT_BUILD */
1349
1350# if USE_ITT_BUILD && USE_ITT_NOTIFY
1351 // Join barrier - report frame end
1352 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1353 kmp_uint64 cur_time = __itt_get_timestamp();
1354 ident_t * loc = team->t.t_ident;
1355 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1356 int nproc = this_thr->th.th_team_nproc;
1357 int i;
1358 // Initialize with master's wait time
1359 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
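            /* For frames mode 3 the imbalance reported below is the aggregate wait at this
               barrier: delta = sum over all team threads of (cur_time - th_bar_arrive_time),
               seeded here with the master's own contribution. */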
1360 switch(__kmp_forkjoin_frames_mode) {
1361 case 1:
1362 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1363 break;
1364 case 2:
1365 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1366 break;
1367 case 3:
1368 if( __itt_metadata_add_ptr ) {
1369 for (i=1; i<nproc; ++i) {
1370 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1371 }
1372 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1373 }
1374 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1375 this_thr->th.th_frame_time = cur_time;
1376 break;
1377 }
1378 }
1379 # endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1380 }
1381#if USE_ITT_BUILD
1382 else {
1383 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1384 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1385 }
1386#endif /* USE_ITT_BUILD */
1387
1388#if KMP_DEBUG
1389 if (KMP_MASTER_TID(tid)) {
1390 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1391 gtid, team_id, tid, nproc));
1392 }
1393#endif /* KMP_DEBUG */
1394
1395 // TODO now, mark worker threads as done so they may be disbanded
1396 KMP_MB(); // Flush all pending memory write invalidates.
1397 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1398}
1399
1400
1401// TODO release worker threads' fork barriers as we are ready instead of all at once
1402void
1403__kmp_fork_barrier(int gtid, int tid)
1404{
1405 KMP_TIME_BLOCK(KMP_fork_barrier);
1406 kmp_info_t *this_thr = __kmp_threads[gtid];
1407 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1408#if USE_ITT_BUILD
1409 void * itt_sync_obj = NULL;
1410#endif /* USE_ITT_BUILD */
1411
1412 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1413 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1414
1415 // th_team pointer only valid for master thread here
1416 if (KMP_MASTER_TID(tid)) {
1417#if USE_ITT_BUILD && USE_ITT_NOTIFY
1418 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1419 // Create itt barrier object
1420 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1421 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1422 }
1423#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1424
1425#ifdef KMP_DEBUG
1426 register kmp_info_t **other_threads = team->t.t_threads;
1427 register int i;
1428
1429 // Verify state
1430 KMP_MB();
1431
1432 for(i=1; i<team->t.t_nproc; ++i) {
1433 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1434 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1435 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1436 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1437 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1438 & ~(KMP_BARRIER_SLEEP_STATE))
1439 == KMP_INIT_BARRIER_STATE);
1440 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1441 }
1442#endif
1443
1444 if (__kmp_tasking_mode != tskm_immediate_exec) {
1445 __kmp_task_team_setup(this_thr, team);
1446 }
1447
1448 /* The master thread may have changed its blocktime between the join barrier and the
1449 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1450 access it when the team struct is not guaranteed to exist. */
1451 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1452 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1453 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1454 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1455 }
1456 } // master
1457
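    /* Release phase: every thread enters its pattern-specific release routine; workers
       block there until the master signals them.  The final TRUE argument enables ICV
       propagation during the release (contrast the FALSE passed in __kmp_end_split_barrier
       above). */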
1458 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1459 case bp_hyper_bar: {
1460 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1461 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1462 USE_ITT_BUILD_ARG(itt_sync_obj) );
1463 break;
1464 }
1465 case bp_hierarchical_bar: {
1466 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1467 USE_ITT_BUILD_ARG(itt_sync_obj) );
1468 break;
1469 }
1470 case bp_tree_bar: {
1471 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1472 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1473 USE_ITT_BUILD_ARG(itt_sync_obj) );
1474 break;
1475 }
1476 default: {
1477 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1478 USE_ITT_BUILD_ARG(itt_sync_obj) );
1479 }
1480 }
1481
1482 // Early exit for reaping threads releasing forkjoin barrier
1483 if (TCR_4(__kmp_global.g.g_done)) {
1484 if (this_thr->th.th_task_team != NULL) {
1485 if (KMP_MASTER_TID(tid)) {
1486 TCW_PTR(this_thr->th.th_task_team, NULL);
1487 }
1488 else {
1489 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1490 }
1491 }
1492
1493#if USE_ITT_BUILD && USE_ITT_NOTIFY
1494 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1495 if (!KMP_MASTER_TID(tid)) {
1496 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1497 if (itt_sync_obj)
1498 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1499 }
1500 }
1501#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1502 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1503 return;
1504 }
1505
1506 /* We can now assume that a valid team structure has been allocated by the master and
1507 propagated to all worker threads. The current thread, however, may not be part of the
1508 team, so we can't blindly assume that the team pointer is non-null. */
1509 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1510 KMP_DEBUG_ASSERT(team != NULL);
1511 tid = __kmp_tid_from_gtid(gtid);
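    /* Workers entered this routine with team == NULL (see the top of the function), so the
       pointer is read only now that the master has published the new team; tid is refreshed
       as well, since the thread's slot in that team may differ from the value passed in. */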
1512
1513
1514#if KMP_BARRIER_ICV_PULL
1515 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1516 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1517 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1518 the fixed ICVs in the master's thread struct, because it is not always the case that the
1519 threads arrays have been allocated when __kmp_fork_call() is executed. */
1520 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1521 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1522 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1523 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1524 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1525 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1526 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1527 }
1528 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1529#endif // KMP_BARRIER_ICV_PULL
1530
1531 if (__kmp_tasking_mode != tskm_immediate_exec) {
1532 __kmp_task_team_sync(this_thr, team);
1533 }
1534
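    /* Affinity fix-up now that the team is known: balanced affinity recomputes this
       thread's placement when the team size has changed, while the OpenMP 4.0 proc_bind
       policies move the thread to its new place only if it is not already there. */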
1535#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1536 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1537 if (proc_bind == proc_bind_intel) {
1538#endif
1539 #if KMP_AFFINITY_SUPPORTED
1540 // Call dynamic affinity settings
1541 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1542 __kmp_balanced_affinity(tid, team->t.t_nproc);
1543 }
1544 #endif // KMP_AFFINITY_SUPPORTED
1545 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1546 }
1547 else if ((proc_bind != proc_bind_false)
1548 && (proc_bind != proc_bind_disabled)) {
1549 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1550 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1551 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1552 }
1553 else {
1554 __kmp_affinity_set_place(gtid);
1555 }
1556 }
1557#endif
1558
1559#if USE_ITT_BUILD && USE_ITT_NOTIFY
1560 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1561 if (!KMP_MASTER_TID(tid)) {
1562 // Get correct barrier object
1563 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1564 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1565 } // (prepare called inside barrier_release)
1566 }
1567#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1568 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1569}
1570
1571
1572void
1573__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1574{
1575 KMP_TIME_BLOCK(KMP_setup_icv_copy);
1576 int f;
1577
1578 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1579 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1580
1581 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1582 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1583 this data before this function is called. */
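    /* Three compile-time strategies for propagating the new ICVs to the workers:
       - KMP_BARRIER_ICV_PULL: stash one copy in the master's th_fixed_icvs here; each
         worker pulls it into its own implicit task during the fork barrier.
       - KMP_BARRIER_ICV_PUSH: the release phase of the fork barrier pushes the ICVs,
         so nothing is copied here.
       - otherwise: copy the ICVs into every worker's implicit task right here, an
         O(nthreads) loop that goes through the ngo_* helpers. */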
1584#if KMP_BARRIER_ICV_PULL
1585 /* Copy the ICVs into the master thread's th_fixed_icvs field (which remains untouched), where
1586 all of the worker threads can access them and make their own copies after the barrier. */
1587 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1588 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1589 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1590 0, team->t.t_threads[0], team));
1591#elif KMP_BARRIER_ICV_PUSH
1592 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1593 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1594 0, team->t.t_threads[0], team));
1595#else
1596 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1597 ngo_load(new_icvs);
1598 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1599 for (f=1; f<new_nproc; ++f) { // Skip the master thread
1600 // TODO: GEH - pass in better source location info since usually NULL here
1601 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1602 f, team->t.t_threads[f], team));
1603 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1604 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1605 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1606 f, team->t.t_threads[f], team));
1607 }
1608 ngo_sync();
1609#endif // KMP_BARRIER_ICV_PULL
1610}