blob: d72939870fad4423090b4093a7488f4448ad9815 [file] [log] [blame]
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001/*
2 * kmp_barrier.cpp
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_wait_release.h"
18#include "kmp_stats.h"
19#include "kmp_itt.h"
20
21#if KMP_MIC
22#include <immintrin.h>
23#define USE_NGO_STORES 1
24#endif // KMP_MIC
25
26#if KMP_MIC && USE_NGO_STORES
27// ICV copying
28#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
29#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
32#else
33#define ngo_load(src) ((void)0)
34#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
35#define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE)
36#define ngo_sync() ((void)0)
37#endif /* KMP_MIC && USE_NGO_STORES */
38
39void __kmp_print_structure(void); // Forward declaration
40
41// ---------------------------- Barrier Algorithms ----------------------------
42
43// Linear Barrier
44static void
45__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46 void (*reduce)(void *, void *)
47 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48{
49 KMP_TIME_BLOCK(KMP_linear_gather);
50 register kmp_team_t *team = this_thr->th.th_team;
51 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52 register kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55 gtid, team->t.t_id, tid, bt));
56 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57
58#if USE_ITT_BUILD && USE_ITT_NOTIFY
59 // Barrier imbalance - save arrive time to the thread
Andrey Churbanov51aecb82015-05-06 19:22:36 +000060 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +000061 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62 }
63#endif
64 // We now perform a linear reduction to signal that all of the threads have arrived.
65 if (!KMP_MASTER_TID(tid)) {
66 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
67 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
68 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70 // Mark arrival to master thread
71 /* After performing this write, a worker thread may not assume that the team is valid
72 any more - it could be deallocated by the master thread at any time. */
73 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74 flag.release();
75 } else {
76 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77 register int nproc = this_thr->th.th_team_nproc;
78 register int i;
79 // Don't have to worry about sleep bit here or atomic since team setting
80 register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81
82 // Collect all the worker team member threads.
83 for (i=1; i<nproc; ++i) {
84#if KMP_CACHE_MANAGE
85 // Prefetch next thread's arrived count
86 if (i+1 < nproc)
87 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88#endif /* KMP_CACHE_MANAGE */
89 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
91 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93
94 // Wait for worker thread to arrive
95 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96 flag.wait(this_thr, FALSE
97 USE_ITT_BUILD_ARG(itt_sync_obj) );
98#if USE_ITT_BUILD && USE_ITT_NOTIFY
99 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000100 if (__kmp_forkjoin_frames_mode == 2) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000101 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102 other_threads[i]->th.th_bar_min_time);
103 }
104#endif
105 if (reduce) {
106 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108 (*reduce)(this_thr->th.th_local.reduce_data,
109 other_threads[i]->th.th_local.reduce_data);
110 }
111 }
112 // Don't have to worry about sleep bit here or atomic since team setting
113 team_bar->b_arrived = new_state;
114 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
115 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116 }
117 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118 gtid, team->t.t_id, tid, bt));
119}
120
121static void
122__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123 int propagate_icvs
124 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125{
126 KMP_TIME_BLOCK(KMP_linear_release);
127 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128 register kmp_team_t *team;
129
130 if (KMP_MASTER_TID(tid)) {
131 register unsigned int i;
132 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133 register kmp_info_t **other_threads;
134
135 team = __kmp_threads[gtid]->th.th_team;
136 KMP_DEBUG_ASSERT(team != NULL);
137 other_threads = team->t.t_threads;
138
139 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140 gtid, team->t.t_id, tid, bt));
141
142 if (nproc > 1) {
143#if KMP_BARRIER_ICV_PUSH
144 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
145 if (propagate_icvs) {
146 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
147 for (i=1; i<nproc; ++i) {
148 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
149 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
150 &team->t.t_implicit_task_taskdata[0].td_icvs);
151 }
152 ngo_sync();
153 }
154 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
155#endif // KMP_BARRIER_ICV_PUSH
156
157 // Now, release all of the worker threads
158 for (i=1; i<nproc; ++i) {
159#if KMP_CACHE_MANAGE
160 // Prefetch next thread's go flag
161 if (i+1 < nproc)
162 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
163#endif /* KMP_CACHE_MANAGE */
164 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
165 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
166 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
167 &other_threads[i]->th.th_bar[bt].bb.b_go,
168 other_threads[i]->th.th_bar[bt].bb.b_go,
169 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
170 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
171 flag.release();
172 }
173 }
174 } else { // Wait for the MASTER thread to release us
175 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
176 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
177 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
178 flag.wait(this_thr, TRUE
179 USE_ITT_BUILD_ARG(itt_sync_obj) );
180#if USE_ITT_BUILD && USE_ITT_NOTIFY
181 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
182 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
183 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
184 // Cancel wait on previous parallel region...
185 __kmp_itt_task_starting(itt_sync_obj);
186
187 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
188 return;
189
190 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
191 if (itt_sync_obj != NULL)
192 // Call prepare as early as possible for "new" barrier
193 __kmp_itt_task_finished(itt_sync_obj);
194 } else
195#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
196 // Early exit for reaping threads releasing forkjoin barrier
197 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
198 return;
199 // The worker thread may now assume that the team is valid.
200#ifdef KMP_DEBUG
201 tid = __kmp_tid_from_gtid(gtid);
202 team = __kmp_threads[gtid]->th.th_team;
203#endif
204 KMP_DEBUG_ASSERT(team != NULL);
205 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
206 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
207 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
208 KMP_MB(); // Flush all pending memory write invalidates.
209 }
210 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
211 gtid, team->t.t_id, tid, bt));
212}
213
214// Tree barrier
215static void
216__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
217 void (*reduce)(void *, void *)
218 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
219{
220 KMP_TIME_BLOCK(KMP_tree_gather);
221 register kmp_team_t *team = this_thr->th.th_team;
222 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
223 register kmp_info_t **other_threads = team->t.t_threads;
224 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
225 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
226 register kmp_uint32 branch_factor = 1 << branch_bits;
227 register kmp_uint32 child;
228 register kmp_uint32 child_tid;
229 register kmp_uint new_state;
230
231 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
232 gtid, team->t.t_id, tid, bt));
233 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
234
235#if USE_ITT_BUILD && USE_ITT_NOTIFY
236 // Barrier imbalance - save arrive time to the thread
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000237 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000238 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
239 }
240#endif
241 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
242 child_tid = (tid << branch_bits) + 1;
243 if (child_tid < nproc) {
244 // Parent threads wait for all their children to arrive
245 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
246 child = 1;
247 do {
248 register kmp_info_t *child_thr = other_threads[child_tid];
249 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
250#if KMP_CACHE_MANAGE
251 // Prefetch next thread's arrived count
252 if (child+1 <= branch_factor && child_tid+1 < nproc)
253 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
254#endif /* KMP_CACHE_MANAGE */
255 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
256 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
257 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
258 &child_bar->b_arrived, new_state));
259 // Wait for child to arrive
260 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
261 flag.wait(this_thr, FALSE
262 USE_ITT_BUILD_ARG(itt_sync_obj) );
263#if USE_ITT_BUILD && USE_ITT_NOTIFY
264 // Barrier imbalance - write min of the thread time and a child time to the thread.
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000265 if (__kmp_forkjoin_frames_mode == 2) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000266 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
267 child_thr->th.th_bar_min_time);
268 }
269#endif
270 if (reduce) {
271 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
272 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
273 team->t.t_id, child_tid));
274 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
275 }
276 child++;
277 child_tid++;
278 }
279 while (child <= branch_factor && child_tid < nproc);
280 }
281
282 if (!KMP_MASTER_TID(tid)) { // Worker threads
283 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
284
285 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
286 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
287 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
288 &thr_bar->b_arrived, thr_bar->b_arrived,
289 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
290
291 // Mark arrival to parent thread
292 /* After performing this write, a worker thread may not assume that the team is valid
293 any more - it could be deallocated by the master thread at any time. */
294 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
295 flag.release();
296 } else {
297 // Need to update the team arrived pointer if we are the master thread
298 if (nproc > 1) // New value was already computed above
299 team->t.t_bar[bt].b_arrived = new_state;
300 else
301 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
302 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
303 gtid, team->t.t_id, tid, team->t.t_id,
304 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
305 }
306 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
307 gtid, team->t.t_id, tid, bt));
308}
309
310static void
311__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
312 int propagate_icvs
313 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
314{
315 KMP_TIME_BLOCK(KMP_tree_release);
316 register kmp_team_t *team;
317 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
318 register kmp_uint32 nproc;
319 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
320 register kmp_uint32 branch_factor = 1 << branch_bits;
321 register kmp_uint32 child;
322 register kmp_uint32 child_tid;
323
324 // Perform a tree release for all of the threads that have been gathered
325 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
326 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
327 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
328 // Wait for parent thread to release us
329 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
330 flag.wait(this_thr, TRUE
331 USE_ITT_BUILD_ARG(itt_sync_obj) );
332#if USE_ITT_BUILD && USE_ITT_NOTIFY
333 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
334 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
335 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
336 // Cancel wait on previous parallel region...
337 __kmp_itt_task_starting(itt_sync_obj);
338
339 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
340 return;
341
342 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
343 if (itt_sync_obj != NULL)
344 // Call prepare as early as possible for "new" barrier
345 __kmp_itt_task_finished(itt_sync_obj);
346 } else
347#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
348 // Early exit for reaping threads releasing forkjoin barrier
349 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
350 return;
351
352 // The worker thread may now assume that the team is valid.
353 team = __kmp_threads[gtid]->th.th_team;
354 KMP_DEBUG_ASSERT(team != NULL);
355 tid = __kmp_tid_from_gtid(gtid);
356
357 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
358 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
359 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
360 KMP_MB(); // Flush all pending memory write invalidates.
361 } else {
362 team = __kmp_threads[gtid]->th.th_team;
363 KMP_DEBUG_ASSERT(team != NULL);
364 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
365 gtid, team->t.t_id, tid, bt));
366 }
367 nproc = this_thr->th.th_team_nproc;
368 child_tid = (tid << branch_bits) + 1;
369
370 if (child_tid < nproc) {
371 register kmp_info_t **other_threads = team->t.t_threads;
372 child = 1;
373 // Parent threads release all their children
374 do {
375 register kmp_info_t *child_thr = other_threads[child_tid];
376 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
377#if KMP_CACHE_MANAGE
378 // Prefetch next thread's go count
379 if (child+1 <= branch_factor && child_tid+1 < nproc)
380 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
381#endif /* KMP_CACHE_MANAGE */
382
383#if KMP_BARRIER_ICV_PUSH
384 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
385 if (propagate_icvs) {
386 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
387 team, child_tid, FALSE);
388 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
389 &team->t.t_implicit_task_taskdata[0].td_icvs);
390 }
391 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
392#endif // KMP_BARRIER_ICV_PUSH
393 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
394 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
395 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
396 child_tid, &child_bar->b_go, child_bar->b_go,
397 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
398 // Release child from barrier
399 kmp_flag_64 flag(&child_bar->b_go, child_thr);
400 flag.release();
401 child++;
402 child_tid++;
403 }
404 while (child <= branch_factor && child_tid < nproc);
405 }
406 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
407 gtid, team->t.t_id, tid, bt));
408}
409
410
411// Hyper Barrier
412static void
413__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
414 void (*reduce)(void *, void *)
415 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
416{
417 KMP_TIME_BLOCK(KMP_hyper_gather);
418 register kmp_team_t *team = this_thr->th.th_team;
419 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
420 register kmp_info_t **other_threads = team->t.t_threads;
421 register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
422 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
423 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
424 register kmp_uint32 branch_factor = 1 << branch_bits;
425 register kmp_uint32 offset;
426 register kmp_uint32 level;
427
428 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
429 gtid, team->t.t_id, tid, bt));
430
431 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
432
433#if USE_ITT_BUILD && USE_ITT_NOTIFY
434 // Barrier imbalance - save arrive time to the thread
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000435 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000436 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
437 }
438#endif
439 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
440 arrived, and reduce any required data as we go. */
441 kmp_flag_64 p_flag(&thr_bar->b_arrived);
442 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
443 {
444 register kmp_uint32 child;
445 register kmp_uint32 child_tid;
446
447 if (((tid >> level) & (branch_factor - 1)) != 0) {
448 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
449
450 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
451 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
452 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
453 &thr_bar->b_arrived, thr_bar->b_arrived,
454 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
455 // Mark arrival to parent thread
456 /* After performing this write (in the last iteration of the enclosing for loop),
457 a worker thread may not assume that the team is valid any more - it could be
458 deallocated by the master thread at any time. */
459 p_flag.set_waiter(other_threads[parent_tid]);
460 p_flag.release();
461 break;
462 }
463
464 // Parent threads wait for children to arrive
465 if (new_state == KMP_BARRIER_UNUSED_STATE)
466 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
467 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
468 child++, child_tid+=(1 << level))
469 {
470 register kmp_info_t *child_thr = other_threads[child_tid];
471 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472#if KMP_CACHE_MANAGE
473 register kmp_uint32 next_child_tid = child_tid + (1 << level);
474 // Prefetch next thread's arrived count
475 if (child+1 < branch_factor && next_child_tid < num_threads)
476 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
477#endif /* KMP_CACHE_MANAGE */
478 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
479 "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
480 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
481 &child_bar->b_arrived, new_state));
482 // Wait for child to arrive
483 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
484 c_flag.wait(this_thr, FALSE
485 USE_ITT_BUILD_ARG(itt_sync_obj) );
486#if USE_ITT_BUILD && USE_ITT_NOTIFY
487 // Barrier imbalance - write min of the thread time and a child time to the thread.
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000488 if (__kmp_forkjoin_frames_mode == 2) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000489 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
490 child_thr->th.th_bar_min_time);
491 }
492#endif
493 if (reduce) {
494 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
495 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
496 team->t.t_id, child_tid));
497 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
498 }
499 }
500 }
501
502 if (KMP_MASTER_TID(tid)) {
503 // Need to update the team arrived pointer if we are the master thread
504 if (new_state == KMP_BARRIER_UNUSED_STATE)
505 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
506 else
507 team->t.t_bar[bt].b_arrived = new_state;
508 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
509 gtid, team->t.t_id, tid, team->t.t_id,
510 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
511 }
512 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
513 gtid, team->t.t_id, tid, bt));
514}
515
516// The reverse versions seem to beat the forward versions overall
517#define KMP_REVERSE_HYPER_BAR
518static void
519__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
520 int propagate_icvs
521 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
522{
523 KMP_TIME_BLOCK(KMP_hyper_release);
524 register kmp_team_t *team;
525 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
526 register kmp_info_t **other_threads;
527 register kmp_uint32 num_threads;
528 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
529 register kmp_uint32 branch_factor = 1 << branch_bits;
530 register kmp_uint32 child;
531 register kmp_uint32 child_tid;
532 register kmp_uint32 offset;
533 register kmp_uint32 level;
534
535 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
536 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
537 order of the corresponding gather, otherwise threads are released in the same order. */
538 if (KMP_MASTER_TID(tid)) { // master
539 team = __kmp_threads[gtid]->th.th_team;
540 KMP_DEBUG_ASSERT(team != NULL);
541 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
542 gtid, team->t.t_id, tid, bt));
543#if KMP_BARRIER_ICV_PUSH
544 if (propagate_icvs) { // master already has ICVs in final destination; copy
545 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
546 }
547#endif
548 }
549 else { // Handle fork barrier workers who aren't part of a team yet
550 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
551 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
552 // Wait for parent thread to release us
553 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
554 flag.wait(this_thr, TRUE
555 USE_ITT_BUILD_ARG(itt_sync_obj) );
556#if USE_ITT_BUILD && USE_ITT_NOTIFY
557 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
558 // In fork barrier where we could not get the object reliably
559 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
560 // Cancel wait on previous parallel region...
561 __kmp_itt_task_starting(itt_sync_obj);
562
563 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
564 return;
565
566 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
567 if (itt_sync_obj != NULL)
568 // Call prepare as early as possible for "new" barrier
569 __kmp_itt_task_finished(itt_sync_obj);
570 } else
571#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
572 // Early exit for reaping threads releasing forkjoin barrier
573 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
574 return;
575
576 // The worker thread may now assume that the team is valid.
577 team = __kmp_threads[gtid]->th.th_team;
578 KMP_DEBUG_ASSERT(team != NULL);
579 tid = __kmp_tid_from_gtid(gtid);
580
581 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
582 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
583 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
584 KMP_MB(); // Flush all pending memory write invalidates.
585 }
586 num_threads = this_thr->th.th_team_nproc;
587 other_threads = team->t.t_threads;
588
589#ifdef KMP_REVERSE_HYPER_BAR
590 // Count up to correct level for parent
591 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
592 level+=branch_bits, offset<<=branch_bits);
593
594 // Now go down from there
595 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
596 level-=branch_bits, offset>>=branch_bits)
597#else
598 // Go down the tree, level by level
599 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
600#endif // KMP_REVERSE_HYPER_BAR
601 {
602#ifdef KMP_REVERSE_HYPER_BAR
603 /* Now go in reverse order through the children, highest to lowest.
604 Initial setting of child is conservative here. */
605 child = num_threads >> ((level==0)?level:level-1);
606 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
607 child>=1; child--, child_tid-=(1<<level))
608#else
609 if (((tid >> level) & (branch_factor - 1)) != 0)
610 // No need to go lower than this, since this is the level parent would be notified
611 break;
612 // Iterate through children on this level of the tree
613 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
614 child++, child_tid+=(1<<level))
615#endif // KMP_REVERSE_HYPER_BAR
616 {
617 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
618 else {
619 register kmp_info_t *child_thr = other_threads[child_tid];
620 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
621#if KMP_CACHE_MANAGE
622 register kmp_uint32 next_child_tid = child_tid - (1 << level);
623 // Prefetch next thread's go count
624# ifdef KMP_REVERSE_HYPER_BAR
625 if (child-1 >= 1 && next_child_tid < num_threads)
626# else
627 if (child+1 < branch_factor && next_child_tid < num_threads)
628# endif // KMP_REVERSE_HYPER_BAR
629 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
630#endif /* KMP_CACHE_MANAGE */
631
632#if KMP_BARRIER_ICV_PUSH
633 if (propagate_icvs) // push my fixed ICVs to my child
634 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
635#endif // KMP_BARRIER_ICV_PUSH
636
637 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
638 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
639 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
640 child_tid, &child_bar->b_go, child_bar->b_go,
641 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
642 // Release child from barrier
643 kmp_flag_64 flag(&child_bar->b_go, child_thr);
644 flag.release();
645 }
646 }
647 }
648#if KMP_BARRIER_ICV_PUSH
649 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
650 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
651 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
652 }
653#endif
654 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
655 gtid, team->t.t_id, tid, bt));
656}
657
658// Hierarchical Barrier
659
660// Initialize thread barrier data
661/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
662 minimum amount of initialization required based on how the team has changed. Returns true if
663 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
664 team size increases, threads already in the team will respond to on-core wakeup on their parent
665 thread, but threads newly added to the team will only be listening on the their local b_go. */
666static bool
667__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
668 int gtid, int tid, kmp_team_t *team)
669{
670 // Checks to determine if (re-)initialization is needed
671 bool uninitialized = thr_bar->team == NULL;
672 bool team_changed = team != thr_bar->team;
673 bool team_sz_changed = nproc != thr_bar->nproc;
674 bool tid_changed = tid != thr_bar->old_tid;
675 bool retval = false;
676
677 if (uninitialized || team_sz_changed) {
678 __kmp_get_hierarchy(nproc, thr_bar);
679 }
680
681 if (uninitialized || team_sz_changed || tid_changed) {
682 thr_bar->my_level = thr_bar->depth-1; // default for master
683 thr_bar->parent_tid = -1; // default for master
684 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
685 kmp_uint32 d=0;
686 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
687 kmp_uint32 rem;
688 if (d == thr_bar->depth-2) { // reached level right below the master
689 thr_bar->parent_tid = 0;
690 thr_bar->my_level = d;
691 break;
692 }
693 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
694 // thread is not a subtree root at next level, so this is max
695 thr_bar->parent_tid = tid - rem;
696 thr_bar->my_level = d;
697 break;
698 }
699 ++d;
700 }
701 }
702 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
703 thr_bar->old_tid = tid;
704 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
705 }
706 if (uninitialized || team_changed || tid_changed) {
707 thr_bar->team = team;
708 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
709 retval = true;
710 }
711 if (uninitialized || team_sz_changed || tid_changed) {
712 thr_bar->nproc = nproc;
713 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
714 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
715 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
716 thr_bar->leaf_kids = nproc - tid - 1;
717 thr_bar->leaf_state = 0;
718 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
719 }
720 return retval;
721}
722
723static void
724__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
725 int gtid, int tid, void (*reduce) (void *, void *)
726 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
727{
728 KMP_TIME_BLOCK(KMP_hier_gather);
729 register kmp_team_t *team = this_thr->th.th_team;
730 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
731 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
732 register kmp_info_t **other_threads = team->t.t_threads;
733 register kmp_uint64 new_state;
734
Andrey Churbanov42a79212015-01-27 16:50:31 +0000735 int level = team->t.t_level;
736 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
737 if (this_thr->th.th_teams_size.nteams > 1)
738 ++level; // level was not increased in teams construct for team_of_masters
739 if (level == 1) thr_bar->use_oncore_barrier = 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000740 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
741
742 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
743 gtid, team->t.t_id, tid, bt));
744 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
745
Andrey Churbanove6bfb732015-05-06 18:34:15 +0000746#if USE_ITT_BUILD && USE_ITT_NOTIFY
747 // Barrier imbalance - save arrive time to the thread
748 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
749 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
750 }
751#endif
752
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000753 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
754
755 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
756 register kmp_int32 child_tid;
757 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
758 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
759 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
760 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
761 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
762 flag.wait(this_thr, FALSE
763 USE_ITT_BUILD_ARG(itt_sync_obj) );
764 if (reduce) {
765 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
766 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
767 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
768 team->t.t_id, child_tid));
769 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
770 }
771 }
772 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
773 }
774 // Next, wait for higher level children on each child's b_arrived flag
775 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
776 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
777 if (last > nproc) last = nproc;
778 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
779 register kmp_info_t *child_thr = other_threads[child_tid];
780 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
781 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
782 "arrived(%p) == %u\n",
783 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
784 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
785 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
786 flag.wait(this_thr, FALSE
787 USE_ITT_BUILD_ARG(itt_sync_obj) );
788 if (reduce) {
789 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
790 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
791 team->t.t_id, child_tid));
792 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
793 }
794 }
795 }
796 }
797 else { // Blocktime is not infinite
798 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
799 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
800 if (last > nproc) last = nproc;
801 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
802 register kmp_info_t *child_thr = other_threads[child_tid];
803 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
804 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
805 "arrived(%p) == %u\n",
806 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
807 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
808 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
809 flag.wait(this_thr, FALSE
810 USE_ITT_BUILD_ARG(itt_sync_obj) );
811 if (reduce) {
812 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
813 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
814 team->t.t_id, child_tid));
815 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
816 }
817 }
818 }
819 }
820 }
821 // All subordinates are gathered; now release parent if not master thread
822
823 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
824 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
825 "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
826 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
827 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
828 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
829 the team is valid any more - it could be deallocated by the master thread at any time. */
830 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
831 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
832 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
833 flag.release();
834 }
835 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
836 thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
837 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
838 flag.set_waiter(other_threads[thr_bar->parent_tid]);
839 flag.release();
840 }
841 } else { // Master thread needs to update the team's b_arrived value
842 team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
843 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
844 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
845 }
846 // Is the team access below unsafe or just technically invalid?
847 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
848 gtid, team->t.t_id, tid, bt));
849}
850
851static void
852__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
853 int propagate_icvs
854 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
855{
856 KMP_TIME_BLOCK(KMP_hier_release);
857 register kmp_team_t *team;
858 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
859 register kmp_uint32 nproc;
860 bool team_change = false; // indicates on-core barrier shouldn't be used
861
862 if (KMP_MASTER_TID(tid)) {
863 team = __kmp_threads[gtid]->th.th_team;
864 KMP_DEBUG_ASSERT(team != NULL);
865 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
866 gtid, team->t.t_id, tid, bt));
867 }
868 else { // Worker threads
869 // Wait for parent thread to release me
870 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
871 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
872 // Use traditional method of waiting on my own b_go flag
873 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
874 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
875 flag.wait(this_thr, TRUE
876 USE_ITT_BUILD_ARG(itt_sync_obj) );
877 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
878 }
879 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
880 // Wait on my "offset" bits on parent's b_go flag
881 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
882 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
883 bt, this_thr
884 USE_ITT_BUILD_ARG(itt_sync_obj) );
885 flag.wait(this_thr, TRUE
886 USE_ITT_BUILD_ARG(itt_sync_obj) );
887 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
888 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
889 }
890 else { // Reset my bits on parent's b_go flag
891 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
892 }
893 }
894 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
895 // Early exit for reaping threads releasing forkjoin barrier
896 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
897 return;
898 // The worker thread may now assume that the team is valid.
899 team = __kmp_threads[gtid]->th.th_team;
900 KMP_DEBUG_ASSERT(team != NULL);
901 tid = __kmp_tid_from_gtid(gtid);
902
903 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
904 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
905 KMP_MB(); // Flush all pending memory write invalidates.
906 }
907
Andrey Churbanov42a79212015-01-27 16:50:31 +0000908 int level = team->t.t_level;
909 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
910 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
911 ++level; // level was not increased in teams construct for team_of_workers
912 if( this_thr->th.th_teams_size.nteams > 1 )
913 ++level; // level was not increased in teams construct for team_of_masters
914 }
915 if (level == 1) thr_bar->use_oncore_barrier = 1;
916 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000917 nproc = this_thr->th.th_team_nproc;
918
919 // If the team size has increased, we still communicate with old leaves via oncore barrier.
920 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
921 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
922 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
923 // But if the entire team changes, we won't use oncore barrier at all
924 if (team_change) old_leaf_kids = 0;
925
926#if KMP_BARRIER_ICV_PUSH
927 if (propagate_icvs) {
928 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
929 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
930 }
931 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
932 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
933 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
934 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
935 &thr_bar->parent_bar->th_fixed_icvs);
936 // non-leaves will get ICVs piggybacked with b_go via NGO store
937 }
938 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
939 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
940 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
941 else // leaves copy parent's fixed ICVs directly to local ICV store
942 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
943 &thr_bar->parent_bar->th_fixed_icvs);
944 }
945 }
946#endif // KMP_BARRIER_ICV_PUSH
947
948 // Now, release my children
949 if (thr_bar->my_level) { // not a leaf
950 register kmp_int32 child_tid;
951 kmp_uint32 last;
952 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
953 if (KMP_MASTER_TID(tid)) { // do a flat release
954 // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
955 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
956 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
957 ngo_load(&thr_bar->th_fixed_icvs);
958 // This loops over all the threads skipping only the leaf nodes in the hierarchy
959 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
960 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
961 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
962 " go(%p): %u => %u\n",
963 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
964 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
965 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
966 // Use ngo store (if available) to both store ICVs and release child via child's b_go
967 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
968 }
969 ngo_sync();
970 }
971 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
972 // Now, release leaf children
973 if (thr_bar->leaf_kids) { // if there are any
974 // We test team_change on the off-chance that the level 1 team changed.
975 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
976 if (old_leaf_kids) { // release old leaf kids
977 thr_bar->b_go |= old_leaf_state;
978 }
979 // Release new leaf kids
980 last = tid+thr_bar->skip_per_level[1];
981 if (last > nproc) last = nproc;
982 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
983 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
984 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
986 " T#%d(%d:%d) go(%p): %u => %u\n",
987 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
988 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
989 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
990 // Release child using child's b_go flag
991 kmp_flag_64 flag(&child_bar->b_go, child_thr);
992 flag.release();
993 }
994 }
995 else { // Release all children at once with leaf_state bits on my own b_go flag
996 thr_bar->b_go |= thr_bar->leaf_state;
997 }
998 }
999 }
1000 else { // Blocktime is not infinite; do a simple hierarchical release
1001 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1002 last = tid+thr_bar->skip_per_level[d+1];
1003 kmp_uint32 skip = thr_bar->skip_per_level[d];
1004 if (last > nproc) last = nproc;
1005 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1006 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1007 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1008 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1009 " go(%p): %u => %u\n",
1010 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1011 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1012 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1013 // Release child using child's b_go flag
1014 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1015 flag.release();
1016 }
1017 }
1018 }
1019#if KMP_BARRIER_ICV_PUSH
1020 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1021 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1022#endif // KMP_BARRIER_ICV_PUSH
1023 }
1024 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1025 gtid, team->t.t_id, tid, bt));
1026}
1027
1028// ---------------------------- End of Barrier Algorithms ----------------------------
1029
1030// Internal function to do a barrier.
1031/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1032 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1033 Returns 0 if master thread, 1 if worker thread. */
1034int
1035__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1036 void *reduce_data, void (*reduce)(void *, void *))
1037{
1038 KMP_TIME_BLOCK(KMP_barrier);
1039 register int tid = __kmp_tid_from_gtid(gtid);
1040 register kmp_info_t *this_thr = __kmp_threads[gtid];
1041 register kmp_team_t *team = this_thr->th.th_team;
1042 register int status = 0;
1043 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001044#if OMPT_SUPPORT
1045 ompt_task_id_t my_task_id;
1046 ompt_parallel_id_t my_parallel_id;
1047#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001048
1049 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1050 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1051
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001052#if OMPT_SUPPORT && OMPT_TRACE
1053 if (ompt_status & ompt_status_track) {
1054 if (ompt_status == ompt_status_track_callback) {
1055 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1056 my_parallel_id = team->t.ompt_team_info.parallel_id;
1057
1058 if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1059 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1060 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1061 my_parallel_id, my_task_id);
1062 }
1063 }
1064 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1065 if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1066 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1067 my_parallel_id, my_task_id);
1068 }
1069 } else {
1070 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1071 }
1072 }
1073#endif
1074
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001075 if (! team->t.t_serialized) {
1076#if USE_ITT_BUILD
1077 // This value will be used in itt notify events below.
1078 void *itt_sync_obj = NULL;
1079# if USE_ITT_NOTIFY
1080 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1081 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1082# endif
1083#endif /* USE_ITT_BUILD */
1084 if (__kmp_tasking_mode == tskm_extra_barrier) {
1085 __kmp_tasking_barrier(team, this_thr, gtid);
1086 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1087 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1088 }
1089
1090 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1091 the team struct is not guaranteed to exist. */
1092 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1093 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1094 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1095 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1096 }
1097
1098#if USE_ITT_BUILD
1099 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1100 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1101#endif /* USE_ITT_BUILD */
1102
1103 if (reduce != NULL) {
1104 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1105 this_thr->th.th_local.reduce_data = reduce_data;
1106 }
1107 switch (__kmp_barrier_gather_pattern[bt]) {
1108 case bp_hyper_bar: {
1109 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1110 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1111 USE_ITT_BUILD_ARG(itt_sync_obj) );
1112 break;
1113 }
1114 case bp_hierarchical_bar: {
1115 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1116 USE_ITT_BUILD_ARG(itt_sync_obj));
1117 break;
1118 }
1119 case bp_tree_bar: {
1120 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1121 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1122 USE_ITT_BUILD_ARG(itt_sync_obj) );
1123 break;
1124 }
1125 default: {
1126 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1127 USE_ITT_BUILD_ARG(itt_sync_obj) );
1128 }
1129 }
1130
1131 KMP_MB();
1132
1133 if (KMP_MASTER_TID(tid)) {
1134 status = 0;
1135 if (__kmp_tasking_mode != tskm_immediate_exec) {
1136 __kmp_task_team_wait(this_thr, team
1137 USE_ITT_BUILD_ARG(itt_sync_obj) );
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001138 __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001139 }
1140
1141
1142#if USE_ITT_BUILD
1143 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1144 before the final summation into the shared variable is done (final summation can be a
1145 long operation for array reductions). */
1146 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1147 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1148#endif /* USE_ITT_BUILD */
1149#if USE_ITT_BUILD && USE_ITT_NOTIFY
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001150 // Barrier - report frame end (only if active_level == 1)
1151 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1152#if OMP_40_ENABLED
1153 this_thr->th.th_teams_microtask == NULL &&
1154#endif
1155 team->t.t_active_level == 1)
1156 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001157 kmp_uint64 cur_time = __itt_get_timestamp();
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001158 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001159 int nproc = this_thr->th.th_team_nproc;
1160 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001161 switch(__kmp_forkjoin_frames_mode) {
1162 case 1:
1163 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1164 this_thr->th.th_frame_time = cur_time;
1165 break;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001166 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001167 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1168 break;
1169 case 3:
1170 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001171 // Initialize with master's wait time
1172 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001173 for (i=1; i<nproc; ++i) {
1174 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1175 }
1176 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1177 }
1178 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1179 this_thr->th.th_frame_time = cur_time;
1180 break;
1181 }
1182 }
1183#endif /* USE_ITT_BUILD */
1184 } else {
1185 status = 1;
1186#if USE_ITT_BUILD
1187 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1188 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1189#endif /* USE_ITT_BUILD */
1190 }
1191 if (status == 1 || ! is_split) {
1192 switch (__kmp_barrier_release_pattern[bt]) {
1193 case bp_hyper_bar: {
1194 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1195 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1196 USE_ITT_BUILD_ARG(itt_sync_obj) );
1197 break;
1198 }
1199 case bp_hierarchical_bar: {
1200 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1201 USE_ITT_BUILD_ARG(itt_sync_obj) );
1202 break;
1203 }
1204 case bp_tree_bar: {
1205 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1206 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1207 USE_ITT_BUILD_ARG(itt_sync_obj) );
1208 break;
1209 }
1210 default: {
1211 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1212 USE_ITT_BUILD_ARG(itt_sync_obj) );
1213 }
1214 }
1215 if (__kmp_tasking_mode != tskm_immediate_exec) {
1216 __kmp_task_team_sync(this_thr, team);
1217 }
1218 }
1219
1220#if USE_ITT_BUILD
1221 /* GEH: TODO: Move this under if-condition above and also include in
1222 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1223 of the threads for split barriers. */
1224 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1225 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1226#endif /* USE_ITT_BUILD */
1227 } else { // Team is serialized.
1228 status = 0;
1229 if (__kmp_tasking_mode != tskm_immediate_exec) {
1230 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001231 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001232 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1233 }
1234 }
1235 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1236 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001237
1238#if OMPT_SUPPORT
1239 if (ompt_status & ompt_status_track) {
1240#if OMPT_TRACE
1241 if ((ompt_status == ompt_status_track_callback) &&
1242 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1243 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1244 my_parallel_id, my_task_id);
1245 }
1246#endif
1247 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1248 }
1249#endif
1250
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001251 return status;
1252}
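// Usage sketch: __kmp_barrier() is the generic entry point behind the compiler-facing
// barrier wrappers. A plain, non-split call is assumed to take the form
//
//     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
//
// i.e. no reduction size, data, or callback. As the code above shows, workers
// (status == 1) always run the release phase; with is_split set, a thread that skips
// the release here is expected to finish the barrier later via __kmp_end_split_barrier().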
1253
1254
1255void
1256__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1257{
1258 KMP_TIME_BLOCK(KMP_end_split_barrier);
1259 int tid = __kmp_tid_from_gtid(gtid);
1260 kmp_info_t *this_thr = __kmp_threads[gtid];
1261 kmp_team_t *team = this_thr->th.th_team;
1262
1263 if (!team->t.t_serialized) {
1264 if (KMP_MASTER_GTID(gtid)) {
1265 switch (__kmp_barrier_release_pattern[bt]) {
1266 case bp_hyper_bar: {
1267 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1268 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1269 USE_ITT_BUILD_ARG(NULL) );
1270 break;
1271 }
1272 case bp_hierarchical_bar: {
1273 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1274 USE_ITT_BUILD_ARG(NULL));
1275 break;
1276 }
1277 case bp_tree_bar: {
1278 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1279 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1280 USE_ITT_BUILD_ARG(NULL) );
1281 break;
1282 }
1283 default: {
1284 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1285 USE_ITT_BUILD_ARG(NULL) );
1286 }
1287 }
1288 if (__kmp_tasking_mode != tskm_immediate_exec) {
1289 __kmp_task_team_sync(this_thr, team);
1290 } // if
1291 }
1292 }
1293}
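// Usage sketch (call sites assumed, not quoted from the reduction support code): a split
// barrier separates the gather and release phases so that one thread can do work after
// everyone has arrived but before anyone is released, e.g. combining reduction results:
//
//     if (__kmp_barrier(bs_reduction_barrier, gtid, TRUE /*is_split*/,
//                       reduce_size, reduce_data, reduce_func) == 0) {
//         // gather finished, release still pending on this path:
//         // combine the partial results, then let the team go
//         __kmp_end_split_barrier(bs_reduction_barrier, gtid);
//     }
//
// As the checks above show, __kmp_end_split_barrier() is a no-op for serialized teams
// and for non-master threads.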
1294
1295
1296void
1297__kmp_join_barrier(int gtid)
1298{
1299 KMP_TIME_BLOCK(KMP_join_barrier);
1300 register kmp_info_t *this_thr = __kmp_threads[gtid];
1301 register kmp_team_t *team;
1302 register kmp_uint nproc;
1303 kmp_info_t *master_thread;
1304 int tid;
1305#ifdef KMP_DEBUG
1306 int team_id;
1307#endif /* KMP_DEBUG */
1308#if USE_ITT_BUILD
1309 void *itt_sync_obj = NULL;
1310# if USE_ITT_NOTIFY
1311 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1312 // Get object created at fork_barrier
1313 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1314# endif
1315#endif /* USE_ITT_BUILD */
1316 KMP_MB();
1317
1318 // Get current info
1319 team = this_thr->th.th_team;
1320 nproc = this_thr->th.th_team_nproc;
1321 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1322 tid = __kmp_tid_from_gtid(gtid);
1323#ifdef KMP_DEBUG
1324 team_id = team->t.t_id;
1325#endif /* KMP_DEBUG */
1326 master_thread = this_thr->th.th_team_master;
1327#ifdef KMP_DEBUG
1328 if (master_thread != team->t.t_threads[0]) {
1329 __kmp_print_structure();
1330 }
1331#endif /* KMP_DEBUG */
1332 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1333 KMP_MB();
1334
1335 // Verify state
1336 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1337 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1338 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1339 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1340 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1341
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001342#if OMPT_SUPPORT && OMPT_TRACE
1343 if ((ompt_status == ompt_status_track_callback) &&
1344 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1345 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1346 team->t.ompt_team_info.parallel_id,
1347 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1348 }
1349 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1350#endif
1351
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001352 if (__kmp_tasking_mode == tskm_extra_barrier) {
1353 __kmp_tasking_barrier(team, this_thr, gtid);
1354         KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1355 }
1356# ifdef KMP_DEBUG
1357 if (__kmp_tasking_mode != tskm_immediate_exec) {
1358 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001359 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001360 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001361 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001362 }
1363# endif /* KMP_DEBUG */
1364
1365 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1366        team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1367 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1368 since the values are not used by __kmp_wait_template() in that case. */
1369 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1370 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1371 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1372 }
1373
1374#if USE_ITT_BUILD
1375 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1376 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1377#endif /* USE_ITT_BUILD */
1378
1379 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1380 case bp_hyper_bar: {
1381 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1382 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1383 USE_ITT_BUILD_ARG(itt_sync_obj) );
1384 break;
1385 }
1386 case bp_hierarchical_bar: {
1387 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1388 USE_ITT_BUILD_ARG(itt_sync_obj) );
1389 break;
1390 }
1391 case bp_tree_bar: {
1392 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1393 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1394 USE_ITT_BUILD_ARG(itt_sync_obj) );
1395 break;
1396 }
1397 default: {
1398 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1399 USE_ITT_BUILD_ARG(itt_sync_obj) );
1400 }
1401 }
1402
1403 /* From this point on, the team data structure may be deallocated at any time by the
1404 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1405 data items that need to be referenced before the end of the barrier should be moved to
1406 the kmp_task_team_t structs. */
1407 if (KMP_MASTER_TID(tid)) {
1408 if (__kmp_tasking_mode != tskm_immediate_exec) {
1409 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1410 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1411 __kmp_task_team_wait(this_thr, team
1412 USE_ITT_BUILD_ARG(itt_sync_obj) );
1413 }
1414#if USE_ITT_BUILD
1415 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1416 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1417#endif /* USE_ITT_BUILD */
1418
1419# if USE_ITT_BUILD && USE_ITT_NOTIFY
1420 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001421 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1422#if OMP_40_ENABLED
1423 this_thr->th.th_teams_microtask == NULL &&
1424#endif
1425 team->t.t_active_level == 1)
1426 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001427 kmp_uint64 cur_time = __itt_get_timestamp();
1428 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001429 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001430 int nproc = this_thr->th.th_team_nproc;
1431 int i;
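        // The cases below follow __kmp_forkjoin_frames_mode: mode 1 submits a frame spanning
        // th_frame_time..cur_time for the region, mode 2 submits a frame starting at the minimum
        // barrier arrive time (imbalance view), and mode 3 additionally reports the summed
        // per-thread wait time as imbalance metadata before submitting the region frame.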
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001432 switch(__kmp_forkjoin_frames_mode) {
1433 case 1:
1434 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1435 break;
1436 case 2:
1437 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1438 break;
1439 case 3:
1440 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001441 // Initialize with master's wait time
1442 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001443 for (i=1; i<nproc; ++i) {
1444 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1445 }
1446 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1447 }
1448 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1449 this_thr->th.th_frame_time = cur_time;
1450 break;
1451 }
1452 }
1453# endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1454 }
1455#if USE_ITT_BUILD
1456 else {
1457 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1458 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1459 }
1460#endif /* USE_ITT_BUILD */
1461
1462#if KMP_DEBUG
1463 if (KMP_MASTER_TID(tid)) {
1464 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1465 gtid, team_id, tid, nproc));
1466 }
1467#endif /* KMP_DEBUG */
1468
1469     // TODO: mark worker threads as done so they may be disbanded
1470 KMP_MB(); // Flush all pending memory write invalidates.
1471 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001472
1473#if OMPT_SUPPORT
1474     if (ompt_status & ompt_status_track) { // bitwise test, as in __kmp_barrier above, so track_callback also enters
1475#if OMPT_TRACE
1476 if ((ompt_status == ompt_status_track_callback) &&
1477 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1478 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1479 team->t.ompt_team_info.parallel_id,
1480 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1481 }
1482#endif
1483
1484 // return to default state
1485 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1486 }
1487#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001488}
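// Note: __kmp_join_barrier() only invokes a gather; the matching release happens in the next
// __kmp_fork_barrier() call, where workers wait to be released (see the b_go checks below)
// until the master forks another parallel region or __kmp_global.g.g_done signals shutdown.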
1489
1490
1491// TODO release worker threads' fork barriers as we are ready instead of all at once
1492void
1493__kmp_fork_barrier(int gtid, int tid)
1494{
1495 KMP_TIME_BLOCK(KMP_fork_barrier);
1496 kmp_info_t *this_thr = __kmp_threads[gtid];
1497 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1498#if USE_ITT_BUILD
1499 void * itt_sync_obj = NULL;
1500#endif /* USE_ITT_BUILD */
1501
1502 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1503 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1504
1505 // th_team pointer only valid for master thread here
1506 if (KMP_MASTER_TID(tid)) {
1507#if USE_ITT_BUILD && USE_ITT_NOTIFY
1508 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1509 // Create itt barrier object
1510 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1511 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1512 }
1513#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1514
1515#ifdef KMP_DEBUG
1516 register kmp_info_t **other_threads = team->t.t_threads;
1517 register int i;
1518
1519 // Verify state
1520 KMP_MB();
1521
1522 for(i=1; i<team->t.t_nproc; ++i) {
1523 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1524 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1525 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1526 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1527 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1528 & ~(KMP_BARRIER_SLEEP_STATE))
1529 == KMP_INIT_BARRIER_STATE);
1530 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1531 }
1532#endif
1533
1534 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001535 __kmp_task_team_setup(this_thr, team, 1); // 1 indicates setup both task teams
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001536 }
1537
1538 /* The master thread may have changed its blocktime between the join barrier and the
1539 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1540 access it when the team struct is not guaranteed to exist. */
1541 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1542 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1543 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1544 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1545 }
1546 } // master
1547
1548 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1549 case bp_hyper_bar: {
1550 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1551 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1552 USE_ITT_BUILD_ARG(itt_sync_obj) );
1553 break;
1554 }
1555 case bp_hierarchical_bar: {
1556 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1557 USE_ITT_BUILD_ARG(itt_sync_obj) );
1558 break;
1559 }
1560 case bp_tree_bar: {
1561 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1562 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1563 USE_ITT_BUILD_ARG(itt_sync_obj) );
1564 break;
1565 }
1566 default: {
1567 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1568 USE_ITT_BUILD_ARG(itt_sync_obj) );
1569 }
1570 }
1571
1572 // Early exit for reaping threads releasing forkjoin barrier
1573 if (TCR_4(__kmp_global.g.g_done)) {
1574 if (this_thr->th.th_task_team != NULL) {
1575 if (KMP_MASTER_TID(tid)) {
1576 TCW_PTR(this_thr->th.th_task_team, NULL);
1577 }
1578 else {
1579 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1580 }
1581 }
1582
1583#if USE_ITT_BUILD && USE_ITT_NOTIFY
1584 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1585 if (!KMP_MASTER_TID(tid)) {
1586 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1587 if (itt_sync_obj)
1588 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1589 }
1590 }
1591#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1592 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1593 return;
1594 }
1595
1596 /* We can now assume that a valid team structure has been allocated by the master and
1597 propagated to all worker threads. The current thread, however, may not be part of the
1598 team, so we can't blindly assume that the team pointer is non-null. */
1599 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1600 KMP_DEBUG_ASSERT(team != NULL);
1601 tid = __kmp_tid_from_gtid(gtid);
1602
1603
1604#if KMP_BARRIER_ICV_PULL
1605 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1606 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1607 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1608 the fixed ICVs in the master's thread struct, because it is not always the case that the
1609 threads arrays have been allocated when __kmp_fork_call() is executed. */
1610 KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1611 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1612 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1613 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1614 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1615 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1616 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1617 }
1618 KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1619#endif // KMP_BARRIER_ICV_PULL
1620
1621 if (__kmp_tasking_mode != tskm_immediate_exec) {
1622 __kmp_task_team_sync(this_thr, team);
1623 }
1624
1625#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1626 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1627 if (proc_bind == proc_bind_intel) {
1628#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001629#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001630 // Call dynamic affinity settings
1631 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1632 __kmp_balanced_affinity(tid, team->t.t_nproc);
1633 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001634#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001635#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1636 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001637 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001638 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1639 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1640 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1641 }
1642 else {
1643 __kmp_affinity_set_place(gtid);
1644 }
1645 }
1646#endif
1647
1648#if USE_ITT_BUILD && USE_ITT_NOTIFY
1649 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1650 if (!KMP_MASTER_TID(tid)) {
1651 // Get correct barrier object
1652 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1653 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1654 } // (prepare called inside barrier_release)
1655 }
1656#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1658}
1659
1660
1661void
1662__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1663{
1664 KMP_TIME_BLOCK(KMP_setup_icv_copy);
1665 int f;
1666
1667 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1668 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1669
1670 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1671 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1672 this data before this function is called. */
1673#if KMP_BARRIER_ICV_PULL
1674    /* Copy the ICVs into the master thread's th_fixed_icvs (which remains untouched), where
1675       all of the worker threads can access them and make their own copies after the barrier. */
1676 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1677 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1678 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1679 0, team->t.t_threads[0], team));
1680#elif KMP_BARRIER_ICV_PUSH
1681 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1682 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1683 0, team->t.t_threads[0], team));
1684#else
1685 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1686 ngo_load(new_icvs);
1687 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1688 for (f=1; f<new_nproc; ++f) { // Skip the master thread
1689 // TODO: GEH - pass in better source location info since usually NULL here
1690 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1691 f, team->t.t_threads[f], team));
1692 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1693 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1694 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1695 f, team->t.t_threads[f], team));
1696 }
1697 ngo_sync();
1698#endif // KMP_BARRIER_ICV_PULL
1699}