/*
2 * kmp_barrier.cpp
3 * $Revision: 43473 $
4 * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18#include "kmp.h"
19#include "kmp_wait_release.h"
20#include "kmp_stats.h"
21#include "kmp_itt.h"
22
23#if KMP_MIC
24#include <immintrin.h>
25#define USE_NGO_STORES 1
26#endif // KMP_MIC
27
28#if KMP_MIC && USE_NGO_STORES
29// ICV copying
30#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
31#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34#else
35#define ngo_load(src) ((void)0)
36#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37#define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE)
38#define ngo_sync() ((void)0)
39#endif /* KMP_MIC && USE_NGO_STORES */
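// Note on the ngo_* macros above: on KMP_MIC builds they copy a full cache line with
// 512-bit non-globally-ordered stores (_mm512_storenrngo_pd), which the barrier release
// paths below use to push a line of ICVs to each child (in the hierarchical barrier the
// b_go flag piggybacks in the last 8 bytes of that line); ngo_sync() supplies the fence
// that makes those stores visible. On every other target the macros fall back to plain
// copy_icvs()/memcpy() and a no-op sync, so callers can use them unconditionally.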
40
41void __kmp_print_structure(void); // Forward declaration
42
43// ---------------------------- Barrier Algorithms ----------------------------
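// Each algorithm below is a gather/release pair built on two per-thread flags: in the
// gather phase a worker bumps its b_arrived flag by KMP_BARRIER_STATE_BUMP and its parent
// (ultimately the master) waits for that value; in the release phase the parent bumps the
// worker's b_go flag to let it continue, optionally propagating ICVs on the way down.
// The variants differ mainly in the shape of the parent/child relationship (flat, tree,
// hypercube, or machine hierarchy) and, for the hierarchical barrier, in how the flags
// are shared between a parent and its on-core children.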
44
45// Linear Barrier
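// The linear barrier keeps all the work on the master: it waits on each worker's
// b_arrived flag in turn during the gather and bumps each worker's b_go flag in turn
// during the release, so both phases are O(nproc) on the master thread.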
46static void
47__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48 void (*reduce)(void *, void *)
49 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50{
51 KMP_TIME_BLOCK(KMP_linear_gather);
52 register kmp_team_t *team = this_thr->th.th_team;
53 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54 register kmp_info_t **other_threads = team->t.t_threads;
55
56 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57 gtid, team->t.t_id, tid, bt));
58 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60#if USE_ITT_BUILD && USE_ITT_NOTIFY
61 // Barrier imbalance - save arrive time to the thread
62 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
63 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64 }
65#endif
66 // We now perform a linear reduction to signal that all of the threads have arrived.
67 if (!KMP_MASTER_TID(tid)) {
68 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
 69 " arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
70 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72 // Mark arrival to master thread
73 /* After performing this write, a worker thread may not assume that the team is valid
74 any more - it could be deallocated by the master thread at any time. */
75 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76 flag.release();
77 } else {
78 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79 register int nproc = this_thr->th.th_team_nproc;
80 register int i;
81 // Don't have to worry about sleep bit here or atomic since team setting
82 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83
84 // Collect all the worker team member threads.
85 for (i=1; i<nproc; ++i) {
86#if KMP_CACHE_MANAGE
87 // Prefetch next thread's arrived count
88 if (i+1 < nproc)
89 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90#endif /* KMP_CACHE_MANAGE */
91 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
93 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95
96 // Wait for worker thread to arrive
97 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98 flag.wait(this_thr, FALSE
99 USE_ITT_BUILD_ARG(itt_sync_obj) );
100#if USE_ITT_BUILD && USE_ITT_NOTIFY
101 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
103 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104 other_threads[i]->th.th_bar_min_time);
105 }
106#endif
107 if (reduce) {
108 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110 (*reduce)(this_thr->th.th_local.reduce_data,
111 other_threads[i]->th.th_local.reduce_data);
112 }
113 }
114 // Don't have to worry about sleep bit here or atomic since team setting
115 team_bar->b_arrived = new_state;
116 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
117 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118 }
119 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120 gtid, team->t.t_id, tid, bt));
121}
122
123static void
124__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
125 int propagate_icvs
126 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
127{
128 KMP_TIME_BLOCK(KMP_linear_release);
129 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
130 register kmp_team_t *team;
131
132 if (KMP_MASTER_TID(tid)) {
133 register unsigned int i;
134 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
135 register kmp_info_t **other_threads;
136
137 team = __kmp_threads[gtid]->th.th_team;
138 KMP_DEBUG_ASSERT(team != NULL);
139 other_threads = team->t.t_threads;
140
141 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
142 gtid, team->t.t_id, tid, bt));
143
144 if (nproc > 1) {
145#if KMP_BARRIER_ICV_PUSH
146 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
147 if (propagate_icvs) {
148 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
149 for (i=1; i<nproc; ++i) {
150 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
151 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
152 &team->t.t_implicit_task_taskdata[0].td_icvs);
153 }
154 ngo_sync();
155 }
156 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
157#endif // KMP_BARRIER_ICV_PUSH
158
159 // Now, release all of the worker threads
160 for (i=1; i<nproc; ++i) {
161#if KMP_CACHE_MANAGE
162 // Prefetch next thread's go flag
163 if (i+1 < nproc)
164 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
165#endif /* KMP_CACHE_MANAGE */
166 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
167 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
168 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
169 &other_threads[i]->th.th_bar[bt].bb.b_go,
170 other_threads[i]->th.th_bar[bt].bb.b_go,
171 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
172 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
173 flag.release();
174 }
175 }
176 } else { // Wait for the MASTER thread to release us
177 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
178 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
179 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
180 flag.wait(this_thr, TRUE
181 USE_ITT_BUILD_ARG(itt_sync_obj) );
182#if USE_ITT_BUILD && USE_ITT_NOTIFY
183 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
184 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
185 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
186 // Cancel wait on previous parallel region...
187 __kmp_itt_task_starting(itt_sync_obj);
188
189 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
190 return;
191
192 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
193 if (itt_sync_obj != NULL)
194 // Call prepare as early as possible for "new" barrier
195 __kmp_itt_task_finished(itt_sync_obj);
196 } else
197#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
198 // Early exit for reaping threads releasing forkjoin barrier
199 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
200 return;
201 // The worker thread may now assume that the team is valid.
202#ifdef KMP_DEBUG
203 tid = __kmp_tid_from_gtid(gtid);
204 team = __kmp_threads[gtid]->th.th_team;
205#endif
206 KMP_DEBUG_ASSERT(team != NULL);
207 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
208 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
209 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
210 KMP_MB(); // Flush all pending memory write invalidates.
211 }
212 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
213 gtid, team->t.t_id, tid, bt));
214}
215
216// Tree barrier
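// In the tree barrier a thread's children are the branch_factor consecutive tids starting
// at (tid << branch_bits) + 1 and its parent is (tid - 1) >> branch_bits. For example,
// with branch_bits == 2 (branch_factor == 4), tid 0 gathers from tids 1..4, tid 1 from
// tids 5..8, and tid 5 reports to tid 1. The release below walks the same mapping top-down.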
217static void
218__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
219 void (*reduce)(void *, void *)
220 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
221{
222 KMP_TIME_BLOCK(KMP_tree_gather);
223 register kmp_team_t *team = this_thr->th.th_team;
224 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
225 register kmp_info_t **other_threads = team->t.t_threads;
226 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
227 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
228 register kmp_uint32 branch_factor = 1 << branch_bits;
229 register kmp_uint32 child;
230 register kmp_uint32 child_tid;
231 register kmp_uint new_state;
232
233 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
234 gtid, team->t.t_id, tid, bt));
235 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
236
237#if USE_ITT_BUILD && USE_ITT_NOTIFY
238 // Barrier imbalance - save arrive time to the thread
239 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
240 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
241 }
242#endif
243 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
244 child_tid = (tid << branch_bits) + 1;
245 if (child_tid < nproc) {
246 // Parent threads wait for all their children to arrive
247 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
248 child = 1;
249 do {
250 register kmp_info_t *child_thr = other_threads[child_tid];
251 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
252#if KMP_CACHE_MANAGE
253 // Prefetch next thread's arrived count
254 if (child+1 <= branch_factor && child_tid+1 < nproc)
255 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
256#endif /* KMP_CACHE_MANAGE */
257 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
258 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
259 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
260 &child_bar->b_arrived, new_state));
261 // Wait for child to arrive
262 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
263 flag.wait(this_thr, FALSE
264 USE_ITT_BUILD_ARG(itt_sync_obj) );
265#if USE_ITT_BUILD && USE_ITT_NOTIFY
266 // Barrier imbalance - write min of the thread time and a child time to the thread.
267 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
268 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
269 child_thr->th.th_bar_min_time);
270 }
271#endif
272 if (reduce) {
273 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
274 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
275 team->t.t_id, child_tid));
276 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
277 }
278 child++;
279 child_tid++;
280 }
281 while (child <= branch_factor && child_tid < nproc);
282 }
283
284 if (!KMP_MASTER_TID(tid)) { // Worker threads
285 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
286
287 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
288 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
289 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
290 &thr_bar->b_arrived, thr_bar->b_arrived,
291 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
292
293 // Mark arrival to parent thread
294 /* After performing this write, a worker thread may not assume that the team is valid
295 any more - it could be deallocated by the master thread at any time. */
296 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
297 flag.release();
298 } else {
299 // Need to update the team arrived pointer if we are the master thread
300 if (nproc > 1) // New value was already computed above
301 team->t.t_bar[bt].b_arrived = new_state;
302 else
303 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
304 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
305 gtid, team->t.t_id, tid, team->t.t_id,
306 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
307 }
308 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
309 gtid, team->t.t_id, tid, bt));
310}
311
312static void
313__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
314 int propagate_icvs
315 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
316{
317 KMP_TIME_BLOCK(KMP_tree_release);
318 register kmp_team_t *team;
319 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
320 register kmp_uint32 nproc;
321 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
322 register kmp_uint32 branch_factor = 1 << branch_bits;
323 register kmp_uint32 child;
324 register kmp_uint32 child_tid;
325
326 // Perform a tree release for all of the threads that have been gathered
327 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
328 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
329 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
330 // Wait for parent thread to release us
331 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
332 flag.wait(this_thr, TRUE
333 USE_ITT_BUILD_ARG(itt_sync_obj) );
334#if USE_ITT_BUILD && USE_ITT_NOTIFY
335 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
336 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
337 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
338 // Cancel wait on previous parallel region...
339 __kmp_itt_task_starting(itt_sync_obj);
340
341 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
342 return;
343
344 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
345 if (itt_sync_obj != NULL)
346 // Call prepare as early as possible for "new" barrier
347 __kmp_itt_task_finished(itt_sync_obj);
348 } else
349#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
350 // Early exit for reaping threads releasing forkjoin barrier
351 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
352 return;
353
354 // The worker thread may now assume that the team is valid.
355 team = __kmp_threads[gtid]->th.th_team;
356 KMP_DEBUG_ASSERT(team != NULL);
357 tid = __kmp_tid_from_gtid(gtid);
358
359 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
360 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
361 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
362 KMP_MB(); // Flush all pending memory write invalidates.
363 } else {
364 team = __kmp_threads[gtid]->th.th_team;
365 KMP_DEBUG_ASSERT(team != NULL);
366 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
367 gtid, team->t.t_id, tid, bt));
368 }
369 nproc = this_thr->th.th_team_nproc;
370 child_tid = (tid << branch_bits) + 1;
371
372 if (child_tid < nproc) {
373 register kmp_info_t **other_threads = team->t.t_threads;
374 child = 1;
375 // Parent threads release all their children
376 do {
377 register kmp_info_t *child_thr = other_threads[child_tid];
378 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
379#if KMP_CACHE_MANAGE
380 // Prefetch next thread's go count
381 if (child+1 <= branch_factor && child_tid+1 < nproc)
382 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
383#endif /* KMP_CACHE_MANAGE */
384
385#if KMP_BARRIER_ICV_PUSH
386 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
387 if (propagate_icvs) {
388 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
389 team, child_tid, FALSE);
390 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
391 &team->t.t_implicit_task_taskdata[0].td_icvs);
392 }
393 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
394#endif // KMP_BARRIER_ICV_PUSH
395 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
 396 " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
397 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
398 child_tid, &child_bar->b_go, child_bar->b_go,
399 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
400 // Release child from barrier
401 kmp_flag_64 flag(&child_bar->b_go, child_thr);
402 flag.release();
403 child++;
404 child_tid++;
405 }
406 while (child <= branch_factor && child_tid < nproc);
407 }
408 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
409 gtid, team->t.t_id, tid, bt));
410}
411
412
413// Hyper Barrier
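// The hyper barrier embeds the gather/release tree in a hypercube. At level L
// (offset == 1 << L), a thread whose tid has any of bits [L, L+branch_bits) set signals
// the parent obtained by clearing those bits and drops out of the loop; the surviving
// threads collect children at tid + k*(1 << L) for k = 1..branch_factor-1. With
// branch_bits == 1 this is the classic pairwise exchange: at level 0 odd tids signal
// their even neighbor, at level 1 tids that are 2 mod 4 signal the multiples of 4, and
// so on until only tid 0 remains.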
414static void
415__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
416 void (*reduce)(void *, void *)
417 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
418{
419 KMP_TIME_BLOCK(KMP_hyper_gather);
420 register kmp_team_t *team = this_thr->th.th_team;
421 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
422 register kmp_info_t **other_threads = team->t.t_threads;
423 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
424 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
425 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
426 register kmp_uint32 branch_factor = 1 << branch_bits;
427 register kmp_uint32 offset;
428 register kmp_uint32 level;
429
430 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
431 gtid, team->t.t_id, tid, bt));
432
433 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
434
435#if USE_ITT_BUILD && USE_ITT_NOTIFY
436 // Barrier imbalance - save arrive time to the thread
437 if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
438 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
439 }
440#endif
441 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
442 arrived, and reduce any required data as we go. */
443 kmp_flag_64 p_flag(&thr_bar->b_arrived);
444 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
445 {
446 register kmp_uint32 child;
447 register kmp_uint32 child_tid;
448
449 if (((tid >> level) & (branch_factor - 1)) != 0) {
450 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
451
452 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
453 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
454 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
455 &thr_bar->b_arrived, thr_bar->b_arrived,
456 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
457 // Mark arrival to parent thread
458 /* After performing this write (in the last iteration of the enclosing for loop),
459 a worker thread may not assume that the team is valid any more - it could be
460 deallocated by the master thread at any time. */
461 p_flag.set_waiter(other_threads[parent_tid]);
462 p_flag.release();
463 break;
464 }
465
466 // Parent threads wait for children to arrive
467 if (new_state == KMP_BARRIER_UNUSED_STATE)
468 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
469 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
470 child++, child_tid+=(1 << level))
471 {
472 register kmp_info_t *child_thr = other_threads[child_tid];
473 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474#if KMP_CACHE_MANAGE
475 register kmp_uint32 next_child_tid = child_tid + (1 << level);
476 // Prefetch next thread's arrived count
477 if (child+1 < branch_factor && next_child_tid < num_threads)
478 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
479#endif /* KMP_CACHE_MANAGE */
480 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
481 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
482 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
483 &child_bar->b_arrived, new_state));
484 // Wait for child to arrive
485 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
486 c_flag.wait(this_thr, FALSE
487 USE_ITT_BUILD_ARG(itt_sync_obj) );
488#if USE_ITT_BUILD && USE_ITT_NOTIFY
489 // Barrier imbalance - write min of the thread time and a child time to the thread.
490 if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
491 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
492 child_thr->th.th_bar_min_time);
493 }
494#endif
495 if (reduce) {
496 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
497 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
498 team->t.t_id, child_tid));
499 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
500 }
501 }
502 }
503
504 if (KMP_MASTER_TID(tid)) {
505 // Need to update the team arrived pointer if we are the master thread
506 if (new_state == KMP_BARRIER_UNUSED_STATE)
507 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
508 else
509 team->t.t_bar[bt].b_arrived = new_state;
510 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
511 gtid, team->t.t_id, tid, team->t.t_id,
512 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
513 }
514 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
515 gtid, team->t.t_id, tid, bt));
516}
517
518// The reverse versions seem to beat the forward versions overall
519#define KMP_REVERSE_HYPER_BAR
520static void
521__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
522 int propagate_icvs
523 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
524{
525 KMP_TIME_BLOCK(KMP_hyper_release);
526 register kmp_team_t *team;
527 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
528 register kmp_info_t **other_threads;
529 register kmp_uint32 num_threads;
530 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
531 register kmp_uint32 branch_factor = 1 << branch_bits;
532 register kmp_uint32 child;
533 register kmp_uint32 child_tid;
534 register kmp_uint32 offset;
535 register kmp_uint32 level;
536
537 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
538 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
539 order of the corresponding gather, otherwise threads are released in the same order. */
540 if (KMP_MASTER_TID(tid)) { // master
541 team = __kmp_threads[gtid]->th.th_team;
542 KMP_DEBUG_ASSERT(team != NULL);
543 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
544 gtid, team->t.t_id, tid, bt));
545#if KMP_BARRIER_ICV_PUSH
546 if (propagate_icvs) { // master already has ICVs in final destination; copy
547 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
548 }
549#endif
550 }
551 else { // Handle fork barrier workers who aren't part of a team yet
552 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
553 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
554 // Wait for parent thread to release us
555 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
556 flag.wait(this_thr, TRUE
557 USE_ITT_BUILD_ARG(itt_sync_obj) );
558#if USE_ITT_BUILD && USE_ITT_NOTIFY
559 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
560 // In fork barrier where we could not get the object reliably
561 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
562 // Cancel wait on previous parallel region...
563 __kmp_itt_task_starting(itt_sync_obj);
564
565 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
566 return;
567
568 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
569 if (itt_sync_obj != NULL)
570 // Call prepare as early as possible for "new" barrier
571 __kmp_itt_task_finished(itt_sync_obj);
572 } else
573#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
574 // Early exit for reaping threads releasing forkjoin barrier
575 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
576 return;
577
578 // The worker thread may now assume that the team is valid.
579 team = __kmp_threads[gtid]->th.th_team;
580 KMP_DEBUG_ASSERT(team != NULL);
581 tid = __kmp_tid_from_gtid(gtid);
582
583 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
584 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
585 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
586 KMP_MB(); // Flush all pending memory write invalidates.
587 }
588 num_threads = this_thr->th.th_team_nproc;
589 other_threads = team->t.t_threads;
590
591#ifdef KMP_REVERSE_HYPER_BAR
592 // Count up to correct level for parent
593 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
594 level+=branch_bits, offset<<=branch_bits);
595
596 // Now go down from there
597 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
598 level-=branch_bits, offset>>=branch_bits)
599#else
600 // Go down the tree, level by level
601 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
602#endif // KMP_REVERSE_HYPER_BAR
603 {
604#ifdef KMP_REVERSE_HYPER_BAR
605 /* Now go in reverse order through the children, highest to lowest.
606 Initial setting of child is conservative here. */
607 child = num_threads >> ((level==0)?level:level-1);
608 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
609 child>=1; child--, child_tid-=(1<<level))
610#else
611 if (((tid >> level) & (branch_factor - 1)) != 0)
612 // No need to go lower than this, since this is the level parent would be notified
613 break;
614 // Iterate through children on this level of the tree
615 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
616 child++, child_tid+=(1<<level))
617#endif // KMP_REVERSE_HYPER_BAR
618 {
619 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
620 else {
621 register kmp_info_t *child_thr = other_threads[child_tid];
622 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
623#if KMP_CACHE_MANAGE
624 register kmp_uint32 next_child_tid = child_tid - (1 << level);
625 // Prefetch next thread's go count
626# ifdef KMP_REVERSE_HYPER_BAR
627 if (child-1 >= 1 && next_child_tid < num_threads)
628# else
629 if (child+1 < branch_factor && next_child_tid < num_threads)
630# endif // KMP_REVERSE_HYPER_BAR
631 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
632#endif /* KMP_CACHE_MANAGE */
633
634#if KMP_BARRIER_ICV_PUSH
635 if (propagate_icvs) // push my fixed ICVs to my child
636 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
637#endif // KMP_BARRIER_ICV_PUSH
638
639 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
 640 " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
641 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
642 child_tid, &child_bar->b_go, child_bar->b_go,
643 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
644 // Release child from barrier
645 kmp_flag_64 flag(&child_bar->b_go, child_thr);
646 flag.release();
647 }
648 }
649 }
650#if KMP_BARRIER_ICV_PUSH
651 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
652 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
653 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
654 }
655#endif
656 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
657 gtid, team->t.t_id, tid, bt));
658}
659
660// Hierarchical Barrier
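// The hierarchical barrier follows the machine hierarchy computed by __kmp_get_hierarchy():
// a parent shares its 64-bit b_arrived/b_go flags with its leaf children (typically the
// hardware threads of one core), giving each child one byte of the flag, so a parent can
// wait for all of them with one flag and release all of them with a single store. This
// "on-core" path is only used when blocktime is infinite (KMP_MAX_BLOCKTIME) and the team
// is not nested; otherwise the code falls back to the per-thread b_arrived/b_go protocol
// used by the other algorithms.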
661
662// Initialize thread barrier data
663/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
664 minimum amount of initialization required based on how the team has changed. Returns true if
665 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
666 team size increases, threads already in the team will respond to on-core wakeup on their parent
 667 thread, but threads newly added to the team will only be listening on their local b_go. */
668static bool
669__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
670 int gtid, int tid, kmp_team_t *team)
671{
672 // Checks to determine if (re-)initialization is needed
673 bool uninitialized = thr_bar->team == NULL;
674 bool team_changed = team != thr_bar->team;
675 bool team_sz_changed = nproc != thr_bar->nproc;
676 bool tid_changed = tid != thr_bar->old_tid;
677 bool retval = false;
678
679 if (uninitialized || team_sz_changed) {
680 __kmp_get_hierarchy(nproc, thr_bar);
681 }
682
683 if (uninitialized || team_sz_changed || tid_changed) {
684 thr_bar->my_level = thr_bar->depth-1; // default for master
685 thr_bar->parent_tid = -1; // default for master
686 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
687 kmp_uint32 d=0;
688 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
689 kmp_uint32 rem;
690 if (d == thr_bar->depth-2) { // reached level right below the master
691 thr_bar->parent_tid = 0;
692 thr_bar->my_level = d;
693 break;
694 }
695 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
696 // thread is not a subtree root at next level, so this is max
697 thr_bar->parent_tid = tid - rem;
698 thr_bar->my_level = d;
699 break;
700 }
701 ++d;
702 }
703 }
704 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
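        // e.g. the first child after the parent (tid == parent_tid+1) owns byte 7 of the
        // parent's 64-bit flags, the next child byte 6, and so on. leaf_state, set up below
        // for this thread's own leaf children, is the mask with those same bytes set; the
        // gather waits for them in b_arrived and the release ORs them into b_go.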
705 thr_bar->old_tid = tid;
706 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
707 }
708 if (uninitialized || team_changed || tid_changed) {
709 thr_bar->team = team;
710 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711 retval = true;
712 }
713 if (uninitialized || team_sz_changed || tid_changed) {
714 thr_bar->nproc = nproc;
715 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
716 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
717 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
718 thr_bar->leaf_kids = nproc - tid - 1;
719 thr_bar->leaf_state = 0;
720 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
721 }
722 return retval;
723}
724
725static void
726__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
727 int gtid, int tid, void (*reduce) (void *, void *)
728 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
729{
730 KMP_TIME_BLOCK(KMP_hier_gather);
731 register kmp_team_t *team = this_thr->th.th_team;
732 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
733 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
734 register kmp_info_t **other_threads = team->t.t_threads;
735 register kmp_uint64 new_state;
736
737 if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1;
738 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
739
740 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
741 gtid, team->t.t_id, tid, bt));
742 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
743
744 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
745
746 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
747 register kmp_int32 child_tid;
748 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
749 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
750 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
751 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
752 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
753 flag.wait(this_thr, FALSE
754 USE_ITT_BUILD_ARG(itt_sync_obj) );
755 if (reduce) {
756 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
757 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
758 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
759 team->t.t_id, child_tid));
760 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
761 }
762 }
763 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
764 }
765 // Next, wait for higher level children on each child's b_arrived flag
766 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
767 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
768 if (last > nproc) last = nproc;
769 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
770 register kmp_info_t *child_thr = other_threads[child_tid];
771 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
772 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
773 "arrived(%p) == %u\n",
774 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
775 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
776 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
777 flag.wait(this_thr, FALSE
778 USE_ITT_BUILD_ARG(itt_sync_obj) );
779 if (reduce) {
780 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
781 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
782 team->t.t_id, child_tid));
783 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
784 }
785 }
786 }
787 }
788 else { // Blocktime is not infinite
789 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
790 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
791 if (last > nproc) last = nproc;
792 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
793 register kmp_info_t *child_thr = other_threads[child_tid];
794 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
795 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
796 "arrived(%p) == %u\n",
797 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
798 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
799 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
800 flag.wait(this_thr, FALSE
801 USE_ITT_BUILD_ARG(itt_sync_obj) );
802 if (reduce) {
803 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
804 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
805 team->t.t_id, child_tid));
806 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
807 }
808 }
809 }
810 }
811 }
812 // All subordinates are gathered; now release parent if not master thread
813
814 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
815 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
816 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
817 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
818 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
819 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
820 the team is valid any more - it could be deallocated by the master thread at any time. */
821 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
822 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
823 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
824 flag.release();
825 }
826 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
827 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
828 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
829 flag.set_waiter(other_threads[thr_bar->parent_tid]);
830 flag.release();
831 }
832 } else { // Master thread needs to update the team's b_arrived value
833 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
834 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
835 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
836 }
837 // Is the team access below unsafe or just technically invalid?
838 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
839 gtid, team->t.t_id, tid, bt));
840}
841
842static void
843__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
844 int propagate_icvs
845 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
846{
847 KMP_TIME_BLOCK(KMP_hier_release);
848 register kmp_team_t *team;
849 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
850 register kmp_uint32 nproc;
851 bool team_change = false; // indicates on-core barrier shouldn't be used
852
853 if (KMP_MASTER_TID(tid)) {
854 team = __kmp_threads[gtid]->th.th_team;
855 KMP_DEBUG_ASSERT(team != NULL);
856 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
857 gtid, team->t.t_id, tid, bt));
858 }
859 else { // Worker threads
860 // Wait for parent thread to release me
861 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
862 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
863 // Use traditional method of waiting on my own b_go flag
864 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
865 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
866 flag.wait(this_thr, TRUE
867 USE_ITT_BUILD_ARG(itt_sync_obj) );
868 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
869 }
870 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
871 // Wait on my "offset" bits on parent's b_go flag
872 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
873 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
874 bt, this_thr
875 USE_ITT_BUILD_ARG(itt_sync_obj) );
876 flag.wait(this_thr, TRUE
877 USE_ITT_BUILD_ARG(itt_sync_obj) );
878 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
879 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
880 }
881 else { // Reset my bits on parent's b_go flag
882 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
883 }
884 }
885 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
886 // Early exit for reaping threads releasing forkjoin barrier
887 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
888 return;
889 // The worker thread may now assume that the team is valid.
890 team = __kmp_threads[gtid]->th.th_team;
891 KMP_DEBUG_ASSERT(team != NULL);
892 tid = __kmp_tid_from_gtid(gtid);
893
894 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
895 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
896 KMP_MB(); // Flush all pending memory write invalidates.
897 }
898
899 if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1;
900 else thr_bar->use_oncore_barrier = 0;
901 nproc = this_thr->th.th_team_nproc;
902
903 // If the team size has increased, we still communicate with old leaves via oncore barrier.
904 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
905 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
906 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
907 // But if the entire team changes, we won't use oncore barrier at all
908 if (team_change) old_leaf_kids = 0;
909
910#if KMP_BARRIER_ICV_PUSH
911 if (propagate_icvs) {
912 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
913 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
914 }
915 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
916 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
917 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
918 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
919 &thr_bar->parent_bar->th_fixed_icvs);
920 // non-leaves will get ICVs piggybacked with b_go via NGO store
921 }
922 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
923 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
924 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
925 else // leaves copy parent's fixed ICVs directly to local ICV store
926 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
927 &thr_bar->parent_bar->th_fixed_icvs);
928 }
929 }
930#endif // KMP_BARRIER_ICV_PUSH
931
932 // Now, release my children
933 if (thr_bar->my_level) { // not a leaf
934 register kmp_int32 child_tid;
935 kmp_uint32 last;
936 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
937 if (KMP_MASTER_TID(tid)) { // do a flat release
938 // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
939 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
940 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
941 ngo_load(&thr_bar->th_fixed_icvs);
942 // This loops over all the threads skipping only the leaf nodes in the hierarchy
943 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
944 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
945 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
946 " go(%p): %u => %u\n",
947 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
948 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
949 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
950 // Use ngo store (if available) to both store ICVs and release child via child's b_go
951 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
952 }
953 ngo_sync();
954 }
955 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
956 // Now, release leaf children
957 if (thr_bar->leaf_kids) { // if there are any
958 // We test team_change on the off-chance that the level 1 team changed.
959 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
960 if (old_leaf_kids) { // release old leaf kids
961 thr_bar->b_go |= old_leaf_state;
962 }
963 // Release new leaf kids
964 last = tid+thr_bar->skip_per_level[1];
965 if (last > nproc) last = nproc;
966 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
967 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
968 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
969 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
970 " T#%d(%d:%d) go(%p): %u => %u\n",
971 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
972 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
973 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
974 // Release child using child's b_go flag
975 kmp_flag_64 flag(&child_bar->b_go, child_thr);
976 flag.release();
977 }
978 }
979 else { // Release all children at once with leaf_state bits on my own b_go flag
980 thr_bar->b_go |= thr_bar->leaf_state;
981 }
982 }
983 }
984 else { // Blocktime is not infinite; do a simple hierarchical release
985 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
986 last = tid+thr_bar->skip_per_level[d+1];
987 kmp_uint32 skip = thr_bar->skip_per_level[d];
988 if (last > nproc) last = nproc;
989 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
990 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
991 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
992 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
993 " go(%p): %u => %u\n",
994 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
995 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
996 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
997 // Release child using child's b_go flag
998 kmp_flag_64 flag(&child_bar->b_go, child_thr);
999 flag.release();
1000 }
1001 }
1002 }
1003#if KMP_BARRIER_ICV_PUSH
1004 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1005 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1006#endif // KMP_BARRIER_ICV_PUSH
1007 }
1008 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1009 gtid, team->t.t_id, tid, bt));
1010}
1011
1012// ---------------------------- End of Barrier Algorithms ----------------------------
1013
1014// Internal function to do a barrier.
1015/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1016 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1017 Returns 0 if master thread, 1 if worker thread. */
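/* Illustrative call shapes, for orientation only; the real call sites live in the
   __kmpc_* entry points elsewhere in the runtime. An explicit barrier arrives here as
       __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
   while the reduction entry points pass is_split=TRUE together with reduce_size,
   reduce_data and a reduce callback to request a split reduction barrier. */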
1018int
1019__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1020 void *reduce_data, void (*reduce)(void *, void *))
1021{
1022 KMP_TIME_BLOCK(KMP_barrier);
1023 register int tid = __kmp_tid_from_gtid(gtid);
1024 register kmp_info_t *this_thr = __kmp_threads[gtid];
1025 register kmp_team_t *team = this_thr->th.th_team;
1026 register int status = 0;
1027 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1028
1029 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1030 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1031
1032 if (! team->t.t_serialized) {
1033#if USE_ITT_BUILD
1034 // This value will be used in itt notify events below.
1035 void *itt_sync_obj = NULL;
1036# if USE_ITT_NOTIFY
1037 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1038 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1039# endif
1040#endif /* USE_ITT_BUILD */
1041 if (__kmp_tasking_mode == tskm_extra_barrier) {
1042 __kmp_tasking_barrier(team, this_thr, gtid);
1043 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1044 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1045 }
1046
1047 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1048 the team struct is not guaranteed to exist. */
1049 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1050 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1051 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1052 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1053 }
1054
1055#if USE_ITT_BUILD
1056 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1057 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1058#endif /* USE_ITT_BUILD */
1059
1060 if (reduce != NULL) {
1061 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1062 this_thr->th.th_local.reduce_data = reduce_data;
1063 }
1064 switch (__kmp_barrier_gather_pattern[bt]) {
1065 case bp_hyper_bar: {
1066 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1067 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1068 USE_ITT_BUILD_ARG(itt_sync_obj) );
1069 break;
1070 }
1071 case bp_hierarchical_bar: {
1072 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1073 USE_ITT_BUILD_ARG(itt_sync_obj));
1074 break;
1075 }
1076 case bp_tree_bar: {
1077 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1078 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1079 USE_ITT_BUILD_ARG(itt_sync_obj) );
1080 break;
1081 }
1082 default: {
1083 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1084 USE_ITT_BUILD_ARG(itt_sync_obj) );
1085 }
1086 }
1087
1088 KMP_MB();
1089
1090 if (KMP_MASTER_TID(tid)) {
1091 status = 0;
1092 if (__kmp_tasking_mode != tskm_immediate_exec) {
1093 __kmp_task_team_wait(this_thr, team
1094 USE_ITT_BUILD_ARG(itt_sync_obj) );
1095 __kmp_task_team_setup(this_thr, team);
1096 }
1097
1098
1099#if USE_ITT_BUILD
1100 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1101 before the final summation into the shared variable is done (final summation can be a
1102 long operation for array reductions). */
1103 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1104 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1105#endif /* USE_ITT_BUILD */
1106#if USE_ITT_BUILD && USE_ITT_NOTIFY
1107 // Barrier - report frame end
1108 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1109 kmp_uint64 cur_time = __itt_get_timestamp();
1110 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1111 int nproc = this_thr->th.th_team_nproc;
1112 int i;
1113 // Initialize with master's wait time
1114 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1115 switch(__kmp_forkjoin_frames_mode) {
1116 case 1:
1117 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1118 this_thr->th.th_frame_time = cur_time;
1119 break;
1120 case 2:
1121 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1122 break;
1123 case 3:
1124 if( __itt_metadata_add_ptr ) {
1125 for (i=1; i<nproc; ++i) {
1126 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1127 }
1128 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1129 }
1130 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1131 this_thr->th.th_frame_time = cur_time;
1132 break;
1133 }
1134 }
1135#endif /* USE_ITT_BUILD */
1136 } else {
1137 status = 1;
1138#if USE_ITT_BUILD
1139 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1140 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1141#endif /* USE_ITT_BUILD */
1142 }
1143 if (status == 1 || ! is_split) {
1144 switch (__kmp_barrier_release_pattern[bt]) {
1145 case bp_hyper_bar: {
1146 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1147 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1148 USE_ITT_BUILD_ARG(itt_sync_obj) );
1149 break;
1150 }
1151 case bp_hierarchical_bar: {
1152 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1153 USE_ITT_BUILD_ARG(itt_sync_obj) );
1154 break;
1155 }
1156 case bp_tree_bar: {
1157 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1158 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1159 USE_ITT_BUILD_ARG(itt_sync_obj) );
1160 break;
1161 }
1162 default: {
1163 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1164 USE_ITT_BUILD_ARG(itt_sync_obj) );
1165 }
1166 }
1167 if (__kmp_tasking_mode != tskm_immediate_exec) {
1168 __kmp_task_team_sync(this_thr, team);
1169 }
1170 }
1171
1172#if USE_ITT_BUILD
1173 /* GEH: TODO: Move this under if-condition above and also include in
1174 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1175 of the threads for split barriers. */
1176 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1177 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1178#endif /* USE_ITT_BUILD */
1179 } else { // Team is serialized.
1180 status = 0;
1181 if (__kmp_tasking_mode != tskm_immediate_exec) {
1182 // The task team should be NULL for serialized code (tasks will be executed immediately)
1183 KMP_DEBUG_ASSERT(team->t.t_task_team == NULL);
1184 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1185 }
1186 }
1187 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1188 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1189 return status;
1190}
1191
1192
1193void
1194__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1195{
1196 KMP_TIME_BLOCK(KMP_end_split_barrier);
1197 int tid = __kmp_tid_from_gtid(gtid);
1198 kmp_info_t *this_thr = __kmp_threads[gtid];
1199 kmp_team_t *team = this_thr->th.th_team;
1200
1201 if (!team->t.t_serialized) {
1202 if (KMP_MASTER_GTID(gtid)) {
1203 switch (__kmp_barrier_release_pattern[bt]) {
1204 case bp_hyper_bar: {
1205 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1206 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1207 USE_ITT_BUILD_ARG(NULL) );
1208 break;
1209 }
1210 case bp_hierarchical_bar: {
1211 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1212 USE_ITT_BUILD_ARG(NULL));
1213 break;
1214 }
1215 case bp_tree_bar: {
1216 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1217 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1218 USE_ITT_BUILD_ARG(NULL) );
1219 break;
1220 }
1221 default: {
1222 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1223 USE_ITT_BUILD_ARG(NULL) );
1224 }
1225 }
1226 if (__kmp_tasking_mode != tskm_immediate_exec) {
1227 __kmp_task_team_sync(this_thr, team);
1228 } // if
1229 }
1230 }
1231}
1232
1233
1234void
1235__kmp_join_barrier(int gtid)
1236{
1237 KMP_TIME_BLOCK(KMP_join_barrier);
1238 register kmp_info_t *this_thr = __kmp_threads[gtid];
1239 register kmp_team_t *team;
1240 register kmp_uint nproc;
1241 kmp_info_t *master_thread;
1242 int tid;
1243#ifdef KMP_DEBUG
1244 int team_id;
1245#endif /* KMP_DEBUG */
1246#if USE_ITT_BUILD
1247 void *itt_sync_obj = NULL;
1248# if USE_ITT_NOTIFY
1249 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1250 // Get object created at fork_barrier
1251 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1252# endif
1253#endif /* USE_ITT_BUILD */
1254 KMP_MB();
1255
1256 // Get current info
1257 team = this_thr->th.th_team;
1258 nproc = this_thr->th.th_team_nproc;
1259 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1260 tid = __kmp_tid_from_gtid(gtid);
1261#ifdef KMP_DEBUG
1262 team_id = team->t.t_id;
1263#endif /* KMP_DEBUG */
1264 master_thread = this_thr->th.th_team_master;
1265#ifdef KMP_DEBUG
1266 if (master_thread != team->t.t_threads[0]) {
1267 __kmp_print_structure();
1268 }
1269#endif /* KMP_DEBUG */
1270 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1271 KMP_MB();
1272
1273 // Verify state
1274 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1275 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1276 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1277 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1278 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1279
1280 if (__kmp_tasking_mode == tskm_extra_barrier) {
1281 __kmp_tasking_barrier(team, this_thr, gtid);
 1282 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1283 }
1284# ifdef KMP_DEBUG
1285 if (__kmp_tasking_mode != tskm_immediate_exec) {
1286 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1287 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team,
1288 this_thr->th.th_task_team));
1289 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team);
1290 }
1291# endif /* KMP_DEBUG */
1292
1293 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
 1294 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1295 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1296 since the values are not used by __kmp_wait_template() in that case. */
1297 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1298 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1299 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1300 }
1301
1302#if USE_ITT_BUILD
1303 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1304 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1305#endif /* USE_ITT_BUILD */
1306
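    // Gather phase: use whichever algorithm is configured for the fork/join barrier. The reduce
    // callback is NULL because the join barrier performs no reduction.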
1307 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1308 case bp_hyper_bar: {
1309 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1310 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1311 USE_ITT_BUILD_ARG(itt_sync_obj) );
1312 break;
1313 }
1314 case bp_hierarchical_bar: {
1315 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1316 USE_ITT_BUILD_ARG(itt_sync_obj) );
1317 break;
1318 }
1319 case bp_tree_bar: {
1320 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1321 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1322 USE_ITT_BUILD_ARG(itt_sync_obj) );
1323 break;
1324 }
1325 default: {
1326 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1327 USE_ITT_BUILD_ARG(itt_sync_obj) );
1328 }
1329 }
1330
1331 /* From this point on, the team data structure may be deallocated at any time by the
1332 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1333 data items that need to be referenced before the end of the barrier should be moved to
1334 the kmp_task_team_t structs. */
1335 if (KMP_MASTER_TID(tid)) {
1336 if (__kmp_tasking_mode != tskm_immediate_exec) {
1337 // Master shouldn't call decrease_load(); it should have th_may_decrease_load == 0.
1338 // TODO: enable master threads.
1339 __kmp_task_team_wait(this_thr, team
1340 USE_ITT_BUILD_ARG(itt_sync_obj) );
1341 }
1342#if USE_ITT_BUILD
1343 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1344 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1345#endif /* USE_ITT_BUILD */
1346
1347# if USE_ITT_BUILD && USE_ITT_NOTIFY
1348 // Join barrier - report frame end
1349 if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1350 kmp_uint64 cur_time = __itt_get_timestamp();
1351 ident_t * loc = team->t.t_ident;
1352 kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1353 int nproc = this_thr->th.th_team_nproc;
1354 int i;
1355 // Initialize with master's wait time
1356 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
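            // What is reported depends on __kmp_forkjoin_frames_mode: mode 1 submits the full
            // fork-to-join frame, mode 2 submits a frame based on th_bar_min_time, and mode 3
            // additionally sums the per-thread wait times at this barrier and reports them as
            // an imbalance metadata record before submitting the frame.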
1357 switch(__kmp_forkjoin_frames_mode) {
1358 case 1:
1359 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1360 break;
1361 case 2:
1362 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1363 break;
1364 case 3:
1365 if( __itt_metadata_add_ptr ) {
1366 for (i=1; i<nproc; ++i) {
1367 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1368 }
1369 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1370 }
1371 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1372 this_thr->th.th_frame_time = cur_time;
1373 break;
1374 }
1375 }
1376# endif /* USE_ITT_BUILD */
1377 }
1378#if USE_ITT_BUILD
1379 else {
1380 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1381 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1382 }
1383#endif /* USE_ITT_BUILD */
1384
1385#if KMP_DEBUG
1386 if (KMP_MASTER_TID(tid)) {
1387 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1388 gtid, team_id, tid, nproc));
1389 }
1390#endif /* KMP_DEBUG */
1391
1392 // TODO now, mark worker threads as done so they may be disbanded
1393 KMP_MB(); // Flush all pending memory write invalidates.
1394 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1395}
1396
1397
1398// TODO release worker threads' fork barriers as we are ready instead of all at once
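// __kmp_fork_barrier: release the team into a new parallel region. The master first sets up the
// task team and refreshes the blocktime ICVs if needed; the workers are then released with the
// configured pattern, after which every surviving thread re-reads its team pointer, picks up the
// ICVs and task team, and re-applies affinity before entering the region.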
1399void
1400__kmp_fork_barrier(int gtid, int tid)
1401{
1402 KMP_TIME_BLOCK(KMP_fork_barrier);
1403 kmp_info_t *this_thr = __kmp_threads[gtid];
1404 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1405#if USE_ITT_BUILD
1406 void * itt_sync_obj = NULL;
1407#endif /* USE_ITT_BUILD */
1408
1409 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1410 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1411
1412 // th_team pointer only valid for master thread here
1413 if (KMP_MASTER_TID(tid)) {
1414#if USE_ITT_BUILD && USE_ITT_NOTIFY
1415 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1416 // Create itt barrier object
1417 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1418 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1419 }
1420#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1421
1422#ifdef KMP_DEBUG
1423 register kmp_info_t **other_threads = team->t.t_threads;
1424 register int i;
1425
1426 // Verify state
1427 KMP_MB();
1428
1429 for(i=1; i<team->t.t_nproc; ++i) {
1430 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1431 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1432 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1433 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1434 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1435 & ~(KMP_BARRIER_SLEEP_STATE))
1436 == KMP_INIT_BARRIER_STATE);
1437 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1438 }
1439#endif
1440
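        // Set up the task team for the upcoming region before the workers are released, so that
        // they can attach to it via __kmp_task_team_sync() once they pass the fork barrier.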
1441 if (__kmp_tasking_mode != tskm_immediate_exec) {
1442 __kmp_task_team_setup(this_thr, team);
1443 }
1444
1445 /* The master thread may have changed its blocktime between the join barrier and the
1446 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1447 access it when the team struct is not guaranteed to exist. */
1448 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1449 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1450 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1451 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1452 }
1453 } // master
1454
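    // Release phase: wake the workers with the configured release pattern. The final TRUE is the
    // propagate_icvs flag - on the fork barrier the release path may also push the new ICVs out
    // to the workers (cf. KMP_BARRIER_ICV_PUSH), unlike the releases earlier in this file, which
    // pass FALSE.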
1455 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1456 case bp_hyper_bar: {
1457 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1458 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1459 USE_ITT_BUILD_ARG(itt_sync_obj) );
1460 break;
1461 }
1462 case bp_hierarchical_bar: {
1463 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1464 USE_ITT_BUILD_ARG(itt_sync_obj) );
1465 break;
1466 }
1467 case bp_tree_bar: {
1468 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1469 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1470 USE_ITT_BUILD_ARG(itt_sync_obj) );
1471 break;
1472 }
1473 default: {
1474 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1475 USE_ITT_BUILD_ARG(itt_sync_obj) );
1476 }
1477 }
1478
1479 // Early exit for reaping threads releasing forkjoin barrier
1480 if (TCR_4(__kmp_global.g.g_done)) {
1481 if (this_thr->th.th_task_team != NULL) {
1482 if (KMP_MASTER_TID(tid)) {
1483 TCW_PTR(this_thr->th.th_task_team, NULL);
1484 }
1485 else {
1486 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1487 }
1488 }
1489
1490#if USE_ITT_BUILD && USE_ITT_NOTIFY
1491 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1492 if (!KMP_MASTER_TID(tid)) {
1493 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1494 if (itt_sync_obj)
1495 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1496 }
1497 }
1498#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1499 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1500 return;
1501 }
1502
1503 /* We can now assume that a valid team structure has been allocated by the master and
1504 propagated to all worker threads. The current thread, however, may not be part of the
1505 team, so we can't blindly assume that the team pointer is non-null. */
1506 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1507 KMP_DEBUG_ASSERT(team != NULL);
1508 tid = __kmp_tid_from_gtid(gtid);
1509
1510
1511#if KMP_BARRIER_ICV_PULL
1512 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1513 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1514 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1515 the fixed ICVs in the master's thread struct, because it is not always the case that the
1516 threads arrays have been allocated when __kmp_fork_call() is executed. */
1517 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1518 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1519 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1520 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1521 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1522 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1523 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1524 }
1525 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1526#endif // KMP_BARRIER_ICV_PULL
1527
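    // Pick up the task team that the master set up before the release (see __kmp_task_team_setup
    // in the master-only section above).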
1528 if (__kmp_tasking_mode != tskm_immediate_exec) {
1529 __kmp_task_team_sync(this_thr, team);
1530 }
1531
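    // Re-apply affinity for the new region: when proc_bind is proc_bind_intel, balanced affinity
    // is recomputed on MIC if the team size changed; for the explicit OMP 4.0 proc_bind policies
    // (other than false/disabled), the thread moves to its new place only if it differs from the
    // current one.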
1532#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1533 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1534 if (proc_bind == proc_bind_intel) {
1535#endif
1536#if KMP_MIC
1537 // Re-apply balanced (dynamic) affinity if the team size has changed
1538 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1539 __kmp_balanced_affinity(tid, team->t.t_nproc);
1540 }
1541#endif
1542#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1543 }
1544 else if ((proc_bind != proc_bind_false)
1545 && (proc_bind != proc_bind_disabled)) {
1546 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1547 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1548 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1549 }
1550 else {
1551 __kmp_affinity_set_place(gtid);
1552 }
1553 }
1554#endif
1555
1556#if USE_ITT_BUILD && USE_ITT_NOTIFY
1557 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1558 if (!KMP_MASTER_TID(tid)) {
1559 // Get correct barrier object
1560 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1561 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1562 } // (prepare called inside barrier_release)
1563 }
1564#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1565 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1566}
1567
1568
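// __kmp_setup_icv_copy: stage a new set of ICVs for every implicit task in the team. The copy
// strategy depends on the barrier configuration: with ICV_PULL the values are parked in the
// master's th_fixed_icvs for the workers to copy after the fork barrier, with ICV_PUSH the fork
// barrier release propagates them, and otherwise they are copied linearly here via the ngo_*
// store macros.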
1569void
1570__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1571{
1572 KMP_TIME_BLOCK(KMP_setup_icv_copy);
1573 int f;
1574
1575 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1576 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1577
1578 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1579 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1580 this data before this function is called. */
1581#if KMP_BARRIER_ICV_PULL
1582 /* Copy the ICVs into th_fixed_icvs in the master thread's struct (which otherwise remains untouched),
1583 where all of the worker threads can access them and make their own copies after the barrier. */
1584 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1585 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1586 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1587 0, team->t.t_threads[0], team));
1588#elif KMP_BARRIER_ICV_PUSH
1589 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1590 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1591 0, team->t.t_threads[0], team));
1592#else
1593 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1594 ngo_load(new_icvs);
1595 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1596 for (f=1; f<new_nproc; ++f) { // Skip the master thread
1597 // TODO: GEH - pass in better source location info since usually NULL here
1598 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1599 f, team->t.t_threads[f], team));
1600 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1601 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1602 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1603 f, team->t.t_threads[f], team));
1604 }
1605 ngo_sync();
1606#endif // KMP_BARRIER_ICV_PULL
1607}