1/*
2 * kmp_barrier.cpp
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_wait_release.h"
18#include "kmp_stats.h"
19#include "kmp_itt.h"
20
21#if KMP_MIC
22#include <immintrin.h>
23#define USE_NGO_STORES 1
24#endif // KMP_MIC
25
26#if KMP_MIC && USE_NGO_STORES
27// ICV copying
28#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
29#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
32#else
33#define ngo_load(src) ((void)0)
34#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
35#define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE)
36#define ngo_sync() ((void)0)
37#endif /* KMP_MIC && USE_NGO_STORES */
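// Illustrative sketch only (names such as master_icvs/worker_icvs are hypothetical): the three
// ngo_* macros are meant to be used together, as the release paths below do:
//
//     ngo_load(&master_icvs);                              // load the source cache line once
//     for (i = 1; i < nproc; ++i)
//         ngo_store_icvs(&worker_icvs[i], &master_icvs);   // streaming copy to each worker
//     ngo_sync();                                          // fence so the stores are visible
//                                                          // before any b_go flag is released
//
// On KMP_MIC with USE_NGO_STORES these expand to 512-bit non-globally-ordered stores plus a
// locked-add fence; otherwise they fall back to copy_icvs()/memcpy() and ngo_sync() is a no-op.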
38
39void __kmp_print_structure(void); // Forward declaration
40
41// ---------------------------- Barrier Algorithms ----------------------------
42
43// Linear Barrier
44static void
45__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46 void (*reduce)(void *, void *)
47 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48{
49 KMP_TIME_BLOCK(KMP_linear_gather);
50 register kmp_team_t *team = this_thr->th.th_team;
51 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52 register kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55 gtid, team->t.t_id, tid, bt));
56 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57
58#if USE_ITT_BUILD && USE_ITT_NOTIFY
59 // Barrier imbalance - save arrive time to the thread
60 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
61 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62 }
63#endif
64 // We now perform a linear reduction to signal that all of the threads have arrived.
65 if (!KMP_MASTER_TID(tid)) {
66 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
67 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
68 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70 // Mark arrival to master thread
71 /* After performing this write, a worker thread may not assume that the team is valid
72 any more - it could be deallocated by the master thread at any time. */
73 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74 flag.release();
75 } else {
76 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77 register int nproc = this_thr->th.th_team_nproc;
78 register int i;
79 // No need to worry about the sleep bit or atomicity here, since this is the team-level arrived count (updated only by the master)
80 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81
82 // Collect all the worker team member threads.
83 for (i=1; i<nproc; ++i) {
84#if KMP_CACHE_MANAGE
85 // Prefetch next thread's arrived count
86 if (i+1 < nproc)
87 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88#endif /* KMP_CACHE_MANAGE */
89 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
91 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93
94 // Wait for worker thread to arrive
95 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96 flag.wait(this_thr, FALSE
97 USE_ITT_BUILD_ARG(itt_sync_obj) );
98#if USE_ITT_BUILD && USE_ITT_NOTIFY
99 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
100 if (__kmp_forkjoin_frames_mode == 2) {
101 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102 other_threads[i]->th.th_bar_min_time);
103 }
104#endif
105 if (reduce) {
106 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108 (*reduce)(this_thr->th.th_local.reduce_data,
109 other_threads[i]->th.th_local.reduce_data);
110 }
111 }
112 // No need to worry about the sleep bit or atomicity here, since this is the team-level arrived count (updated only by the master)
113 team_bar->b_arrived = new_state;
114 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
115 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116 }
117 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118 gtid, team->t.t_id, tid, bt));
119}
120
121static void
122__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123 int propagate_icvs
124 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125{
126 KMP_TIME_BLOCK(KMP_linear_release);
127 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128 register kmp_team_t *team;
129
130 if (KMP_MASTER_TID(tid)) {
131 register unsigned int i;
132 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133 register kmp_info_t **other_threads;
134
135 team = __kmp_threads[gtid]->th.th_team;
136 KMP_DEBUG_ASSERT(team != NULL);
137 other_threads = team->t.t_threads;
138
139 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140 gtid, team->t.t_id, tid, bt));
141
142 if (nproc > 1) {
143#if KMP_BARRIER_ICV_PUSH
144 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
145 if (propagate_icvs) {
146 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
147 for (i=1; i<nproc; ++i) {
148 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
149 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
150 &team->t.t_implicit_task_taskdata[0].td_icvs);
151 }
152 ngo_sync();
153 }
154 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
155#endif // KMP_BARRIER_ICV_PUSH
156
157 // Now, release all of the worker threads
158 for (i=1; i<nproc; ++i) {
159#if KMP_CACHE_MANAGE
160 // Prefetch next thread's go flag
161 if (i+1 < nproc)
162 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
163#endif /* KMP_CACHE_MANAGE */
164 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
165 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
166 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
167 &other_threads[i]->th.th_bar[bt].bb.b_go,
168 other_threads[i]->th.th_bar[bt].bb.b_go,
169 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
170 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
171 flag.release();
172 }
173 }
174 } else { // Wait for the MASTER thread to release us
175 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
176 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
177 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
178 flag.wait(this_thr, TRUE
179 USE_ITT_BUILD_ARG(itt_sync_obj) );
180#if USE_ITT_BUILD && USE_ITT_NOTIFY
181 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
182 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
183 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
184 // Cancel wait on previous parallel region...
185 __kmp_itt_task_starting(itt_sync_obj);
186
187 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
188 return;
189
190 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
191 if (itt_sync_obj != NULL)
192 // Call prepare as early as possible for "new" barrier
193 __kmp_itt_task_finished(itt_sync_obj);
194 } else
195#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
196 // Early exit for reaping threads releasing forkjoin barrier
197 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
198 return;
199 // The worker thread may now assume that the team is valid.
200#ifdef KMP_DEBUG
201 tid = __kmp_tid_from_gtid(gtid);
202 team = __kmp_threads[gtid]->th.th_team;
203#endif
204 KMP_DEBUG_ASSERT(team != NULL);
205 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
206 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
207 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
208 KMP_MB(); // Flush all pending memory write invalidates.
209 }
210 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
211 gtid, team->t.t_id, tid, bt));
212}
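// Sketch of the linear algorithm above (for orientation only; ITT notify, ICV push and
// sleep-bit handling are omitted):
//
//   gather:  worker -> bump its own b_arrived by KMP_BARRIER_STATE_BUMP (flag.release)
//            master -> for each worker i, wait until b_arrived[i] reaches the new team value,
//                      optionally calling (*reduce)() per worker, then publish the new value
//                      in team->t.t_bar[bt].b_arrived
//   release: master -> bump each worker's b_go by KMP_BARRIER_STATE_BUMP
//            worker -> wait on its own b_go, then reset it to KMP_INIT_BARRIER_STATE
//
// Both phases are O(nproc) work on the master thread, which is the cost the tree, hyper and
// hierarchical variants below reduce.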
213
214// Tree barrier
215static void
216__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
217 void (*reduce)(void *, void *)
218 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
219{
220 KMP_TIME_BLOCK(KMP_tree_gather);
221 register kmp_team_t *team = this_thr->th.th_team;
222 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
223 register kmp_info_t **other_threads = team->t.t_threads;
224 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
225 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
226 register kmp_uint32 branch_factor = 1 << branch_bits;
227 register kmp_uint32 child;
228 register kmp_uint32 child_tid;
229 register kmp_uint new_state;
230
231 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
232 gtid, team->t.t_id, tid, bt));
233 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
234
235#if USE_ITT_BUILD && USE_ITT_NOTIFY
236 // Barrier imbalance - save arrive time to the thread
237 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
238 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
239 }
240#endif
241 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
242 child_tid = (tid << branch_bits) + 1;
243 if (child_tid < nproc) {
244 // Parent threads wait for all their children to arrive
245 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
246 child = 1;
247 do {
248 register kmp_info_t *child_thr = other_threads[child_tid];
249 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
250#if KMP_CACHE_MANAGE
251 // Prefetch next thread's arrived count
252 if (child+1 <= branch_factor && child_tid+1 < nproc)
253 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
254#endif /* KMP_CACHE_MANAGE */
255 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
256 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
257 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
258 &child_bar->b_arrived, new_state));
259 // Wait for child to arrive
260 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
261 flag.wait(this_thr, FALSE
262 USE_ITT_BUILD_ARG(itt_sync_obj) );
263#if USE_ITT_BUILD && USE_ITT_NOTIFY
264 // Barrier imbalance - write min of the thread time and a child time to the thread.
265 if (__kmp_forkjoin_frames_mode == 2) {
266 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
267 child_thr->th.th_bar_min_time);
268 }
269#endif
270 if (reduce) {
271 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
272 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
273 team->t.t_id, child_tid));
274 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
275 }
276 child++;
277 child_tid++;
278 }
279 while (child <= branch_factor && child_tid < nproc);
280 }
281
282 if (!KMP_MASTER_TID(tid)) { // Worker threads
283 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
284
285 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
286 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
287 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
288 &thr_bar->b_arrived, thr_bar->b_arrived,
289 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
290
291 // Mark arrival to parent thread
292 /* After performing this write, a worker thread may not assume that the team is valid
293 any more - it could be deallocated by the master thread at any time. */
294 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
295 flag.release();
296 } else {
297 // Need to update the team arrived pointer if we are the master thread
298 if (nproc > 1) // New value was already computed above
299 team->t.t_bar[bt].b_arrived = new_state;
300 else
301 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
302 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
303 gtid, team->t.t_id, tid, team->t.t_id,
304 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
305 }
306 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
307 gtid, team->t.t_id, tid, bt));
308}
309
310static void
311__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
312 int propagate_icvs
313 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
314{
315 KMP_TIME_BLOCK(KMP_tree_release);
316 register kmp_team_t *team;
317 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
318 register kmp_uint32 nproc;
319 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
320 register kmp_uint32 branch_factor = 1 << branch_bits;
321 register kmp_uint32 child;
322 register kmp_uint32 child_tid;
323
324 // Perform a tree release for all of the threads that have been gathered
325 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
326 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
327 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
328 // Wait for parent thread to release us
329 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
330 flag.wait(this_thr, TRUE
331 USE_ITT_BUILD_ARG(itt_sync_obj) );
332#if USE_ITT_BUILD && USE_ITT_NOTIFY
333 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
334 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
335 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
336 // Cancel wait on previous parallel region...
337 __kmp_itt_task_starting(itt_sync_obj);
338
339 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
340 return;
341
342 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
343 if (itt_sync_obj != NULL)
344 // Call prepare as early as possible for "new" barrier
345 __kmp_itt_task_finished(itt_sync_obj);
346 } else
347#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
348 // Early exit for reaping threads releasing forkjoin barrier
349 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
350 return;
351
352 // The worker thread may now assume that the team is valid.
353 team = __kmp_threads[gtid]->th.th_team;
354 KMP_DEBUG_ASSERT(team != NULL);
355 tid = __kmp_tid_from_gtid(gtid);
356
357 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
358 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
359 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
360 KMP_MB(); // Flush all pending memory write invalidates.
361 } else {
362 team = __kmp_threads[gtid]->th.th_team;
363 KMP_DEBUG_ASSERT(team != NULL);
364 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
365 gtid, team->t.t_id, tid, bt));
366 }
367 nproc = this_thr->th.th_team_nproc;
368 child_tid = (tid << branch_bits) + 1;
369
370 if (child_tid < nproc) {
371 register kmp_info_t **other_threads = team->t.t_threads;
372 child = 1;
373 // Parent threads release all their children
374 do {
375 register kmp_info_t *child_thr = other_threads[child_tid];
376 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
377#if KMP_CACHE_MANAGE
378 // Prefetch next thread's go count
379 if (child+1 <= branch_factor && child_tid+1 < nproc)
380 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
381#endif /* KMP_CACHE_MANAGE */
382
383#if KMP_BARRIER_ICV_PUSH
384 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
385 if (propagate_icvs) {
386 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
387 team, child_tid, FALSE);
388 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
389 &team->t.t_implicit_task_taskdata[0].td_icvs);
390 }
391 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
392#endif // KMP_BARRIER_ICV_PUSH
393 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
394 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
395 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
396 child_tid, &child_bar->b_go, child_bar->b_go,
397 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
398 // Release child from barrier
399 kmp_flag_64 flag(&child_bar->b_go, child_thr);
400 flag.release();
401 child++;
402 child_tid++;
403 }
404 while (child <= branch_factor && child_tid < nproc);
405 }
406 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
407 gtid, team->t.t_id, tid, bt));
408}
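// Worked example (for illustration) of the tree index arithmetic used above, assuming
// branch_bits == 2 (branch_factor == 4) and nproc == 16:
//
//   children of tid: first child = (tid << 2) + 1, up to 4 consecutive tids
//       tid 0 -> 1,2,3,4    tid 1 -> 5,6,7,8    tid 2 -> 9,10,11,12    tid 3 -> 13,14,15
//   parent of tid:   (tid - 1) >> 2
//       tid 4 -> 0          tid 6 -> 1          tid 13 -> 3
//
// The gather walks this tree bottom-up on the b_arrived flags; the release walks it top-down
// on the b_go flags, copying ICVs to each child when propagate_icvs is set.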
409
410
411// Hyper Barrier
412static void
413__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
414 void (*reduce)(void *, void *)
415 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
416{
417 KMP_TIME_BLOCK(KMP_hyper_gather);
418 register kmp_team_t *team = this_thr->th.th_team;
419 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
420 register kmp_info_t **other_threads = team->t.t_threads;
421 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
422 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
423 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
424 register kmp_uint32 branch_factor = 1 << branch_bits;
425 register kmp_uint32 offset;
426 register kmp_uint32 level;
427
428 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
429 gtid, team->t.t_id, tid, bt));
430
431 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
432
433#if USE_ITT_BUILD && USE_ITT_NOTIFY
434 // Barrier imbalance - save arrive time to the thread
435 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
436 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
437 }
438#endif
439 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
440 arrived, and reduce any required data as we go. */
441 kmp_flag_64 p_flag(&thr_bar->b_arrived);
442 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
443 {
444 register kmp_uint32 child;
445 register kmp_uint32 child_tid;
446
447 if (((tid >> level) & (branch_factor - 1)) != 0) {
448 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
449
450 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
451 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
452 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
453 &thr_bar->b_arrived, thr_bar->b_arrived,
454 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
455 // Mark arrival to parent thread
456 /* After performing this write (in the last iteration of the enclosing for loop),
457 a worker thread may not assume that the team is valid any more - it could be
458 deallocated by the master thread at any time. */
459 p_flag.set_waiter(other_threads[parent_tid]);
460 p_flag.release();
461 break;
462 }
463
464 // Parent threads wait for children to arrive
465 if (new_state == KMP_BARRIER_UNUSED_STATE)
466 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
467 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
468 child++, child_tid+=(1 << level))
469 {
470 register kmp_info_t *child_thr = other_threads[child_tid];
471 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472#if KMP_CACHE_MANAGE
473 register kmp_uint32 next_child_tid = child_tid + (1 << level);
474 // Prefetch next thread's arrived count
475 if (child+1 < branch_factor && next_child_tid < num_threads)
476 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
477#endif /* KMP_CACHE_MANAGE */
478 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
479 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
480 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
481 &child_bar->b_arrived, new_state));
482 // Wait for child to arrive
483 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
484 c_flag.wait(this_thr, FALSE
485 USE_ITT_BUILD_ARG(itt_sync_obj) );
486#if USE_ITT_BUILD && USE_ITT_NOTIFY
487 // Barrier imbalance - write min of the thread time and a child time to the thread.
488 if (__kmp_forkjoin_frames_mode == 2) {
489 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
490 child_thr->th.th_bar_min_time);
491 }
492#endif
493 if (reduce) {
494 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
495 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
496 team->t.t_id, child_tid));
497 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
498 }
499 }
500 }
501
502 if (KMP_MASTER_TID(tid)) {
503 // Need to update the team arrived pointer if we are the master thread
504 if (new_state == KMP_BARRIER_UNUSED_STATE)
505 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
506 else
507 team->t.t_bar[bt].b_arrived = new_state;
508 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
509 gtid, team->t.t_id, tid, team->t.t_id,
510 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
511 }
512 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
513 gtid, team->t.t_id, tid, bt));
514}
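// Worked example (for illustration) of the hypercube pairing above, assuming
// branch_bits == 1 (branch_factor == 2) and num_threads == 8:
//
//   level 0: 1->0, 3->2, 5->4, 7->6   (tids with bit 0 set release parent tid & ~1)
//   level 1: 2->0, 6->4               (parent = tid & ~3)
//   level 2: 4->0                     (parent = tid & ~7)
//
// so the master has gathered everything after log2(8) = 3 rounds.  The release below replays
// the same pairings, in reverse order when KMP_REVERSE_HYPER_BAR is defined.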
515
516// The reverse versions seem to beat the forward versions overall
517#define KMP_REVERSE_HYPER_BAR
518static void
519__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
520 int propagate_icvs
521 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
522{
523 KMP_TIME_BLOCK(KMP_hyper_release);
524 register kmp_team_t *team;
525 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
526 register kmp_info_t **other_threads;
527 register kmp_uint32 num_threads;
528 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
529 register kmp_uint32 branch_factor = 1 << branch_bits;
530 register kmp_uint32 child;
531 register kmp_uint32 child_tid;
532 register kmp_uint32 offset;
533 register kmp_uint32 level;
534
535 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
536 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
537 order of the corresponding gather, otherwise threads are released in the same order. */
538 if (KMP_MASTER_TID(tid)) { // master
539 team = __kmp_threads[gtid]->th.th_team;
540 KMP_DEBUG_ASSERT(team != NULL);
541 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
542 gtid, team->t.t_id, tid, bt));
543#if KMP_BARRIER_ICV_PUSH
544 if (propagate_icvs) { // master already has ICVs in final destination; copy
545 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
546 }
547#endif
548 }
549 else { // Handle fork barrier workers who aren't part of a team yet
550 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
551 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
552 // Wait for parent thread to release us
553 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
554 flag.wait(this_thr, TRUE
555 USE_ITT_BUILD_ARG(itt_sync_obj) );
556#if USE_ITT_BUILD && USE_ITT_NOTIFY
557 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
558 // In fork barrier where we could not get the object reliably
559 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
560 // Cancel wait on previous parallel region...
561 __kmp_itt_task_starting(itt_sync_obj);
562
563 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
564 return;
565
566 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
567 if (itt_sync_obj != NULL)
568 // Call prepare as early as possible for "new" barrier
569 __kmp_itt_task_finished(itt_sync_obj);
570 } else
571#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
572 // Early exit for reaping threads releasing forkjoin barrier
573 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
574 return;
575
576 // The worker thread may now assume that the team is valid.
577 team = __kmp_threads[gtid]->th.th_team;
578 KMP_DEBUG_ASSERT(team != NULL);
579 tid = __kmp_tid_from_gtid(gtid);
580
581 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
582 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
583 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
584 KMP_MB(); // Flush all pending memory write invalidates.
585 }
586 num_threads = this_thr->th.th_team_nproc;
587 other_threads = team->t.t_threads;
588
589#ifdef KMP_REVERSE_HYPER_BAR
590 // Count up to correct level for parent
591 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
592 level+=branch_bits, offset<<=branch_bits);
593
594 // Now go down from there
595 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
596 level-=branch_bits, offset>>=branch_bits)
597#else
598 // Go down the tree, level by level
599 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
600#endif // KMP_REVERSE_HYPER_BAR
601 {
602#ifdef KMP_REVERSE_HYPER_BAR
603 /* Now go in reverse order through the children, highest to lowest.
604 Initial setting of child is conservative here. */
605 child = num_threads >> ((level==0)?level:level-1);
606 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
607 child>=1; child--, child_tid-=(1<<level))
608#else
609 if (((tid >> level) & (branch_factor - 1)) != 0)
610 // No need to go lower than this, since this is the level parent would be notified
611 break;
612 // Iterate through children on this level of the tree
613 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
614 child++, child_tid+=(1<<level))
615#endif // KMP_REVERSE_HYPER_BAR
616 {
617 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
618 else {
619 register kmp_info_t *child_thr = other_threads[child_tid];
620 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
621#if KMP_CACHE_MANAGE
622 register kmp_uint32 next_child_tid = child_tid - (1 << level);
623 // Prefetch next thread's go count
624# ifdef KMP_REVERSE_HYPER_BAR
625 if (child-1 >= 1 && next_child_tid < num_threads)
626# else
627 if (child+1 < branch_factor && next_child_tid < num_threads)
628# endif // KMP_REVERSE_HYPER_BAR
629 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
630#endif /* KMP_CACHE_MANAGE */
631
632#if KMP_BARRIER_ICV_PUSH
633 if (propagate_icvs) // push my fixed ICVs to my child
634 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
635#endif // KMP_BARRIER_ICV_PUSH
636
637 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
638 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
639 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
640 child_tid, &child_bar->b_go, child_bar->b_go,
641 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
642 // Release child from barrier
643 kmp_flag_64 flag(&child_bar->b_go, child_thr);
644 flag.release();
645 }
646 }
647 }
648#if KMP_BARRIER_ICV_PUSH
649 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
650 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
651 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
652 }
653#endif
654 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
655 gtid, team->t.t_id, tid, bt));
656}
657
658// Hierarchical Barrier
659
660// Initialize thread barrier data
661/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
662 minimum amount of initialization required based on how the team has changed. Returns true if
663 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
664 team size increases, threads already in the team will respond to on-core wakeup on their parent
665 thread, but threads newly added to the team will only be listening on their local b_go. */
666static bool
667__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
668 int gtid, int tid, kmp_team_t *team)
669{
670 // Checks to determine if (re-)initialization is needed
671 bool uninitialized = thr_bar->team == NULL;
672 bool team_changed = team != thr_bar->team;
673 bool team_sz_changed = nproc != thr_bar->nproc;
674 bool tid_changed = tid != thr_bar->old_tid;
675 bool retval = false;
676
677 if (uninitialized || team_sz_changed) {
678 __kmp_get_hierarchy(nproc, thr_bar);
679 }
680
681 if (uninitialized || team_sz_changed || tid_changed) {
682 thr_bar->my_level = thr_bar->depth-1; // default for master
683 thr_bar->parent_tid = -1; // default for master
684 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
685 kmp_uint32 d=0;
686 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
687 kmp_uint32 rem;
688 if (d == thr_bar->depth-2) { // reached level right below the master
689 thr_bar->parent_tid = 0;
690 thr_bar->my_level = d;
691 break;
692 }
693 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
694 // thread is not a subtree root at the next level, so d is its maximum level in the hierarchy
695 thr_bar->parent_tid = tid - rem;
696 thr_bar->my_level = d;
697 break;
698 }
699 ++d;
700 }
701 }
702 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
703 thr_bar->old_tid = tid;
704 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
705 }
706 if (uninitialized || team_changed || tid_changed) {
707 thr_bar->team = team;
708 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
709 retval = true;
710 }
711 if (uninitialized || team_sz_changed || tid_changed) {
712 thr_bar->nproc = nproc;
713 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
714 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
715 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
716 thr_bar->leaf_kids = nproc - tid - 1;
717 thr_bar->leaf_state = 0;
718 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
719 }
720 return retval;
721}
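// Note on the leaf bookkeeping set up above (descriptive only): a leaf child at
// tid == parent_tid + k (k >= 1) gets offset = 7 - (k - 1), i.e. the first leaf child owns
// byte 7 of the parent's 64-bit flag, the second owns byte 6, and so on (only 8 bytes exist,
// so at most 8 such children).  leaf_state marks exactly those byte positions, e.g. with
// leaf_kids == 3 the bytes at indices 7, 6 and 5 are set.  During the gather the parent waits
// until those bytes of its own b_arrived have been filled in by the leaves, then clears them
// again with KMP_TEST_THEN_AND64(..., ~leaf_state).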
722
723static void
724__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
725 int gtid, int tid, void (*reduce) (void *, void *)
726 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
727{
728 KMP_TIME_BLOCK(KMP_hier_gather);
729 register kmp_team_t *team = this_thr->th.th_team;
730 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
731 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
732 register kmp_info_t **other_threads = team->t.t_threads;
733 register kmp_uint64 new_state;
734
735 int level = team->t.t_level;
736 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
737 if (this_thr->th.th_teams_size.nteams > 1)
738 ++level; // level was not increased in teams construct for team_of_masters
739 if (level == 1) thr_bar->use_oncore_barrier = 1;
740 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
741
742 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
743 gtid, team->t.t_id, tid, bt));
744 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
745
746#if USE_ITT_BUILD && USE_ITT_NOTIFY
747 // Barrier imbalance - save arrive time to the thread
748 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
749 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
750 }
751#endif
752
753 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
754
755 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
756 register kmp_int32 child_tid;
757 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
758 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
759 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
760 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
761 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
762 flag.wait(this_thr, FALSE
763 USE_ITT_BUILD_ARG(itt_sync_obj) );
764 if (reduce) {
765 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
766 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
767 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
768 team->t.t_id, child_tid));
769 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
770 }
771 }
772 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
773 }
774 // Next, wait for higher level children on each child's b_arrived flag
775 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
776 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
777 if (last > nproc) last = nproc;
778 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
779 register kmp_info_t *child_thr = other_threads[child_tid];
780 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
781 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
782 "arrived(%p) == %u\n",
783 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
784 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
785 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
786 flag.wait(this_thr, FALSE
787 USE_ITT_BUILD_ARG(itt_sync_obj) );
788 if (reduce) {
789 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
790 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
791 team->t.t_id, child_tid));
792 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
793 }
794 }
795 }
796 }
797 else { // Blocktime is not infinite
798 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
799 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
800 if (last > nproc) last = nproc;
801 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
802 register kmp_info_t *child_thr = other_threads[child_tid];
803 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
804 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
805 "arrived(%p) == %u\n",
806 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
807 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
808 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
809 flag.wait(this_thr, FALSE
810 USE_ITT_BUILD_ARG(itt_sync_obj) );
811 if (reduce) {
812 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
813 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
814 team->t.t_id, child_tid));
815 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
816 }
817 }
818 }
819 }
820 }
821 // All subordinates are gathered; now release parent if not master thread
822
823 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
824 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
825 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
826 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
827 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
828 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
829 the team is valid any more - it could be deallocated by the master thread at any time. */
830 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
831 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
832 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
833 flag.release();
834 }
835 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
836 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
837 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
838 flag.set_waiter(other_threads[thr_bar->parent_tid]);
839 flag.release();
840 }
841 } else { // Master thread needs to update the team's b_arrived value
842 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
843 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
844 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
845 }
846 // Is the team access below unsafe or just technically invalid?
847 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
848 gtid, team->t.t_id, tid, bt));
849}
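// Sketch of the gather above (for orientation only; ITT and reduction details omitted):
//
//   if blocktime is infinite and the on-core barrier is usable:
//       wait for all leaf children at once on the byte lanes of my own b_arrived, then clear
//       those lanes; wait for each non-leaf child on that child's own b_arrived
//   else:
//       wait for every child, leaf or not, on its own b_arrived, lowest level first
//   finally:
//       master     -> publish new_state into team->t.t_bar[bt].b_arrived
//       non-master -> release the parent, either via my own b_arrived flag or, for a leaf in
//                     on-core mode, by signalling my byte of the parent's b_arrived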
850
851static void
852__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
853 int propagate_icvs
854 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
855{
856 KMP_TIME_BLOCK(KMP_hier_release);
857 register kmp_team_t *team;
858 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
859 register kmp_uint32 nproc;
860 bool team_change = false; // indicates on-core barrier shouldn't be used
861
862 if (KMP_MASTER_TID(tid)) {
863 team = __kmp_threads[gtid]->th.th_team;
864 KMP_DEBUG_ASSERT(team != NULL);
865 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
866 gtid, team->t.t_id, tid, bt));
867 }
868 else { // Worker threads
869 // Wait for parent thread to release me
870 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
871 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
872 // Use traditional method of waiting on my own b_go flag
873 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
874 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
875 flag.wait(this_thr, TRUE
876 USE_ITT_BUILD_ARG(itt_sync_obj) );
877 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
878 }
879 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
880 // Wait on my "offset" bits on parent's b_go flag
881 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
882 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
883 bt, this_thr
884 USE_ITT_BUILD_ARG(itt_sync_obj) );
885 flag.wait(this_thr, TRUE
886 USE_ITT_BUILD_ARG(itt_sync_obj) );
887 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
888 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
889 }
890 else { // Reset my bits on parent's b_go flag
891 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
892 }
893 }
894 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
895 // Early exit for reaping threads releasing forkjoin barrier
896 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
897 return;
898 // The worker thread may now assume that the team is valid.
899 team = __kmp_threads[gtid]->th.th_team;
900 KMP_DEBUG_ASSERT(team != NULL);
901 tid = __kmp_tid_from_gtid(gtid);
902
903 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
904 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
905 KMP_MB(); // Flush all pending memory write invalidates.
906 }
907
908 int level = team->t.t_level;
909 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
910 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
911 ++level; // level was not increased in teams construct for team_of_workers
912 if( this_thr->th.th_teams_size.nteams > 1 )
913 ++level; // level was not increased in teams construct for team_of_masters
914 }
915 if (level == 1) thr_bar->use_oncore_barrier = 1;
916 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
917 nproc = this_thr->th.th_team_nproc;
918
919 // If the team size has increased, we still communicate with old leaves via oncore barrier.
920 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
921 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
922 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
923 // But if the entire team changes, we won't use oncore barrier at all
924 if (team_change) old_leaf_kids = 0;
925
926#if KMP_BARRIER_ICV_PUSH
927 if (propagate_icvs) {
928 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
929 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
930 }
931 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
932 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
933 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
934 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
935 &thr_bar->parent_bar->th_fixed_icvs);
936 // non-leaves will get ICVs piggybacked with b_go via NGO store
937 }
938 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
939 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
940 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
941 else // leaves copy parent's fixed ICVs directly to local ICV store
942 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
943 &thr_bar->parent_bar->th_fixed_icvs);
944 }
945 }
946#endif // KMP_BARRIER_ICV_PUSH
947
948 // Now, release my children
949 if (thr_bar->my_level) { // not a leaf
950 register kmp_int32 child_tid;
951 kmp_uint32 last;
952 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
953 if (KMP_MASTER_TID(tid)) { // do a flat release
954 // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
955 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
956 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
957 ngo_load(&thr_bar->th_fixed_icvs);
958 // This loops over all the threads skipping only the leaf nodes in the hierarchy
959 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
960 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
961 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
962 " go(%p): %u => %u\n",
963 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
964 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
965 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
966 // Use ngo store (if available) to both store ICVs and release child via child's b_go
967 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
968 }
969 ngo_sync();
970 }
971 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
972 // Now, release leaf children
973 if (thr_bar->leaf_kids) { // if there are any
974 // We test team_change on the off-chance that the level 1 team changed.
975 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
976 if (old_leaf_kids) { // release old leaf kids
977 thr_bar->b_go |= old_leaf_state;
978 }
979 // Release new leaf kids
980 last = tid+thr_bar->skip_per_level[1];
981 if (last > nproc) last = nproc;
982 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
983 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
984 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
986 " T#%d(%d:%d) go(%p): %u => %u\n",
987 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
988 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
989 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
990 // Release child using child's b_go flag
991 kmp_flag_64 flag(&child_bar->b_go, child_thr);
992 flag.release();
993 }
994 }
995 else { // Release all children at once with leaf_state bits on my own b_go flag
996 thr_bar->b_go |= thr_bar->leaf_state;
997 }
998 }
999 }
1000 else { // Blocktime is not infinite; do a simple hierarchical release
1001 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1002 last = tid+thr_bar->skip_per_level[d+1];
1003 kmp_uint32 skip = thr_bar->skip_per_level[d];
1004 if (last > nproc) last = nproc;
1005 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1006 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1007 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1008 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1009 " go(%p): %u => %u\n",
1010 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1011 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1012 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1013 // Release child using child's b_go flag
1014 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1015 flag.release();
1016 }
1017 }
1018 }
1019#if KMP_BARRIER_ICV_PUSH
1020 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1021 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1022#endif // KMP_BARRIER_ICV_PUSH
1023 }
1024 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1025 gtid, team->t.t_id, tid, bt));
1026}
1027
1028// ---------------------------- End of Barrier Algorithms ----------------------------
1029
1030// Internal function to do a barrier.
1031 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1032 If reduce is non-NULL, do a split reduction barrier; otherwise, do a barrier without reduction.
1033 Returns 0 for the master thread, 1 for a worker thread. */
1034int
1035__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1036 void *reduce_data, void (*reduce)(void *, void *))
1037{
1038 KMP_TIME_BLOCK(KMP_barrier);
1039 register int tid = __kmp_tid_from_gtid(gtid);
1040 register kmp_info_t *this_thr = __kmp_threads[gtid];
1041 register kmp_team_t *team = this_thr->th.th_team;
1042 register int status = 0;
1043 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1044#if OMPT_SUPPORT
1045 ompt_task_id_t my_task_id;
1046 ompt_parallel_id_t my_parallel_id;
1047#endif
1048
1049 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1050 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1051
1052#if OMPT_SUPPORT && OMPT_TRACE
1053 if (ompt_status & ompt_status_track) {
1054 if (ompt_status == ompt_status_track_callback) {
1055 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1056 my_parallel_id = team->t.ompt_team_info.parallel_id;
1057
1058 if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1059 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1060 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1061 my_parallel_id, my_task_id);
1062 }
1063 }
1064 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1065 if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1066 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1067 my_parallel_id, my_task_id);
1068 }
1069 } else {
1070 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1071 }
1072 }
1073#endif
1074
1075 if (! team->t.t_serialized) {
1076#if USE_ITT_BUILD
1077 // This value will be used in itt notify events below.
1078 void *itt_sync_obj = NULL;
1079# if USE_ITT_NOTIFY
1080 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1081 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1082# endif
1083#endif /* USE_ITT_BUILD */
1084 if (__kmp_tasking_mode == tskm_extra_barrier) {
1085 __kmp_tasking_barrier(team, this_thr, gtid);
1086 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1087 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1088 }
1089
1090 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1091 the team struct is not guaranteed to exist. */
1092 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1093 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1094 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1095 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1096 }
1097
1098#if USE_ITT_BUILD
1099 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1100 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1101#endif /* USE_ITT_BUILD */
1102
1103 if (reduce != NULL) {
1104 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1105 this_thr->th.th_local.reduce_data = reduce_data;
1106 }
1107 switch (__kmp_barrier_gather_pattern[bt]) {
1108 case bp_hyper_bar: {
1109 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1110 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1111 USE_ITT_BUILD_ARG(itt_sync_obj) );
1112 break;
1113 }
1114 case bp_hierarchical_bar: {
1115 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1116 USE_ITT_BUILD_ARG(itt_sync_obj));
1117 break;
1118 }
1119 case bp_tree_bar: {
1120 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1121 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1122 USE_ITT_BUILD_ARG(itt_sync_obj) );
1123 break;
1124 }
1125 default: {
1126 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1127 USE_ITT_BUILD_ARG(itt_sync_obj) );
1128 }
1129 }
1130
1131 KMP_MB();
1132
1133 if (KMP_MASTER_TID(tid)) {
1134 status = 0;
1135 if (__kmp_tasking_mode != tskm_immediate_exec) {
1136 __kmp_task_team_wait(this_thr, team
1137 USE_ITT_BUILD_ARG(itt_sync_obj) );
1138 __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1139 }
1140
1141
1142#if USE_ITT_BUILD
1143 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1144 before the final summation into the shared variable is done (final summation can be a
1145 long operation for array reductions). */
1146 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1147 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1148#endif /* USE_ITT_BUILD */
1149#if USE_ITT_BUILD && USE_ITT_NOTIFY
1150 // Barrier - report frame end (only if active_level == 1)
1151 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1152#if OMP_40_ENABLED
1153 this_thr->th.th_teams_microtask == NULL &&
1154#endif
1155 team->t.t_active_level == 1)
1156 {
1157 kmp_uint64 cur_time = __itt_get_timestamp();
1158 kmp_info_t **other_threads = team->t.t_threads;
1159 int nproc = this_thr->th.th_team_nproc;
1160 int i;
1161 switch(__kmp_forkjoin_frames_mode) {
1162 case 1:
1163 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1164 this_thr->th.th_frame_time = cur_time;
1165 break;
1166 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1167 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1168 break;
1169 case 3:
1170 if( __itt_metadata_add_ptr ) {
1171 // Initialize with master's wait time
1172 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1173 for (i=1; i<nproc; ++i) {
1174 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1175 }
1176 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1177 }
1178 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1179 this_thr->th.th_frame_time = cur_time;
1180 break;
1181 }
1182 }
1183#endif /* USE_ITT_BUILD */
1184 } else {
1185 status = 1;
1186#if USE_ITT_BUILD
1187 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1188 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1189#endif /* USE_ITT_BUILD */
1190 }
1191 if (status == 1 || ! is_split) {
1192 switch (__kmp_barrier_release_pattern[bt]) {
1193 case bp_hyper_bar: {
1194 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1195 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1196 USE_ITT_BUILD_ARG(itt_sync_obj) );
1197 break;
1198 }
1199 case bp_hierarchical_bar: {
1200 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1201 USE_ITT_BUILD_ARG(itt_sync_obj) );
1202 break;
1203 }
1204 case bp_tree_bar: {
1205 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1206 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1207 USE_ITT_BUILD_ARG(itt_sync_obj) );
1208 break;
1209 }
1210 default: {
1211 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1212 USE_ITT_BUILD_ARG(itt_sync_obj) );
1213 }
1214 }
1215 if (__kmp_tasking_mode != tskm_immediate_exec) {
1216 __kmp_task_team_sync(this_thr, team);
1217 }
1218 }
1219
1220#if USE_ITT_BUILD
1221 /* GEH: TODO: Move this under if-condition above and also include in
1222 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1223 of the threads for split barriers. */
1224 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1225 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1226#endif /* USE_ITT_BUILD */
1227 } else { // Team is serialized.
1228 status = 0;
1229 if (__kmp_tasking_mode != tskm_immediate_exec) {
1230#if OMP_41_ENABLED
1231 if ( this_thr->th.th_task_team != NULL ) {
1232 void *itt_sync_obj = NULL;
1233#if USE_ITT_NOTIFY
1234 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1235 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1236 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1237 }
1238#endif
1239
1240 kmp_task_team_t * task_team = this_thr->th.th_task_team;
1241 KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
1242 __kmp_task_team_wait(this_thr, team
1243 USE_ITT_BUILD_ARG(itt_sync_obj));
1244 __kmp_task_team_setup(this_thr, team, 0, 0);
1245
1246#if USE_ITT_BUILD
1247 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1248 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1249#endif /* USE_ITT_BUILD */
1250 }
1251#else
1252 // The task team should be NULL for serialized code (tasks will be executed immediately)
1253 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1254 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1255#endif
1256 }
1257 }
1258 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1259 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001260
1261#if OMPT_SUPPORT
1262 if (ompt_status & ompt_status_track) {
1263#if OMPT_TRACE
1264 if ((ompt_status == ompt_status_track_callback) &&
1265 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1266 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1267 my_parallel_id, my_task_id);
1268 }
1269#endif
1270 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1271 }
1272#endif
1273
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001274 return status;
1275}
1276
1277
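/* Usage sketch (illustrative only -- the real call sites live elsewhere in the runtime, and the
   argument values below are placeholders):

       __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);   // full, non-split barrier

   For a split barrier (is_split=TRUE), the master performs only the gather phase here and skips
   the release, while workers proceed to the release-phase wait as usual. The master later calls
   __kmp_end_split_barrier() to perform the deferred release and re-sync the task team. */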
void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    KMP_TIME_BLOCK(KMP_end_split_barrier);
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            } // if
        }
    }
}


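// The join barrier is executed by every thread of a team at the end of a parallel region. It
// performs only the gather phase of the fork/join barrier; the matching release happens in
// __kmp_fork_barrier(), where workers wait for the next parallel region. On the master, it also
// waits for the team's outstanding tasks to complete before the team may be deallocated.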
void
__kmp_join_barrier(int gtid)
{
    KMP_TIME_BLOCK(KMP_join_barrier);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

#if OMPT_SUPPORT && OMPT_TRACE
    if ((ompt_status == ompt_status_track_callback) &&
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
#endif

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
    }
# ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                       __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
                       this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
    }
# endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
       team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
       down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
       since the values are not used by __kmp_wait_template() in that case. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

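    // The gather algorithm is chosen at runtime (hyper, hierarchical, or tree); the default case
    // falls back to the linear gather.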
    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads. Any per-team
       data items that need to be referenced before the end of the barrier should be moved to
       the kmp_task_team_t structs. */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            // Master shouldn't call decrease_load().        // TODO: enable master threads.
            // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
            this_thr->th.th_teams_microtask == NULL &&
#endif
            team->t.t_active_level == 1)
        {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t * loc = team->t.t_ident;
            kmp_info_t **other_threads = team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            switch(__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if( __itt_metadata_add_ptr ) {
                    // Initialize with master's wait time
                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                    for (i=1; i<nproc; ++i) {
                        delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    // TODO now, mark worker threads as done so they may be disbanded
    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

#if OMPT_SUPPORT
    if (ompt_status == ompt_status_track) {
#if OMPT_TRACE
        if ((ompt_status == ompt_status_track_callback) &&
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                team->t.ompt_team_info.parallel_id,
                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
        }
#endif

        // return to default state
        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
}


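// The fork barrier is executed at the start of every parallel region (and by workers parked
// between regions). The master sets up the task teams and blocktime ICVs and then signals the
// release; each worker then syncs its task team pointer, copies ICVs from the master when
// KMP_BARRIER_ICV_PULL is enabled, and applies its affinity settings.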
// TODO release worker threads' fork barriers as we are ready instead of all at once
void
__kmp_fork_barrier(int gtid, int tid)
{
    KMP_TIME_BLOCK(KMP_fork_barrier);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void * itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer only valid for master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for(i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1
        }

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
           access it when the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

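    // Workers block here until the master signals the fork/join release; the TRUE argument asks
    // the release algorithm to propagate ICVs to the workers (used with KMP_BARRIER_ICV_PUSH).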
    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        if (this_thr->th.th_task_team != NULL) {
            if (KMP_MASTER_TID(tid)) {
                TCW_PTR(this_thr->th.th_task_team, NULL);
            }
            else {
                __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
            }
        }

#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads. The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null. */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);


#if KMP_BARRIER_ICV_PULL
    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. We cannot modify __kmp_fork_call() to look at
       the fixed ICVs in the master's thread struct, because it is not always the case that the
       threads arrays have been allocated when __kmp_fork_call() is executed. */
    KMP_START_EXPLICIT_TIMER(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
        // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
    }
    KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
#endif // KMP_BARRIER_ICV_PULL
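    // (Under KMP_BARRIER_ICV_PUSH the ICVs are instead propagated by the release algorithm above,
    // via its propagate_icvs argument, so no extra copy is needed at this point.)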

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

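    // Apply per-thread affinity: either the dynamic "balanced" policy, or an OMP 4.0 place
    // assignment when an explicit proc_bind policy is in effect.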
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}


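// Stage the initial ICVs for propagation to the team, using one of three compile-time strategies:
// PULL (workers copy from the master's th_fixed_icvs in the fork barrier), PUSH (the barrier
// release carries the ICVs), or a linear copy into every worker's implicit task done right here.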
void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
{
    KMP_TIME_BLOCK(KMP_setup_icv_copy);
    int f;

    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
    /* Copy the ICVs into th_fixed_icvs in the master's thread structure (which remains untouched),
       where all of the worker threads can access them and make their own copies after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
    for (f=1; f<new_nproc; ++f) { // Skip the master thread
        // TODO: GEH - pass in better source location info since usually NULL here
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}