blob: 2e91c14e8d8971e47934a4b66969813bde6cc310 [file] [log] [blame]
Jim Cownie5e8470a2013-09-27 10:38:44 +00001/*
2 * kmp_affinity.cpp -- affinity management
Jim Cownie5e8470a2013-09-27 10:38:44 +00003 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_i18n.h"
18#include "kmp_io.h"
19#include "kmp_str.h"
Jim Cownie4cc4bb42014-10-07 16:25:50 +000020#include "kmp_wrapper_getpid.h"
Jim Cownie5e8470a2013-09-27 10:38:44 +000021
Alp Toker763b9392014-02-28 09:42:41 +000022#if KMP_AFFINITY_SUPPORTED
Jim Cownie5e8470a2013-09-27 10:38:44 +000023
24//
25// Print the affinity mask to the character array in a pretty format.
26//
27char *
28__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29{
30 KMP_ASSERT(buf_len >= 40);
31 char *scan = buf;
32 char *end = buf + buf_len - 1;
33
34 //
35 // Find first element / check for empty set.
36 //
37 size_t i;
38 for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39 if (KMP_CPU_ISSET(i, mask)) {
40 break;
41 }
42 }
43 if (i == KMP_CPU_SETSIZE) {
Andrey Churbanov74bf17b2015-04-02 13:27:08 +000044 KMP_SNPRINTF(scan, buf_len, "{<empty>}");
Jim Cownie5e8470a2013-09-27 10:38:44 +000045 while (*scan != '\0') scan++;
46 KMP_ASSERT(scan <= end);
47 return buf;
48 }
49
Andrey Churbanov74bf17b2015-04-02 13:27:08 +000050 KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
Jim Cownie5e8470a2013-09-27 10:38:44 +000051 while (*scan != '\0') scan++;
52 i++;
53 for (; i < KMP_CPU_SETSIZE; i++) {
54 if (! KMP_CPU_ISSET(i, mask)) {
55 continue;
56 }
57
58 //
59 // Check for buffer overflow. A string of the form ",<n>" will have
60 // at most 10 characters, plus we want to leave room to print ",...}"
61 // if the set is too large to print for a total of 15 characters.
62 // We already left room for '\0' in setting end.
63 //
64 if (end - scan < 15) {
65 break;
66 }
Andrey Churbanov74bf17b2015-04-02 13:27:08 +000067 KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
Jim Cownie5e8470a2013-09-27 10:38:44 +000068 while (*scan != '\0') scan++;
69 }
70 if (i < KMP_CPU_SETSIZE) {
Andrey Churbanov74bf17b2015-04-02 13:27:08 +000071 KMP_SNPRINTF(scan, buf_len, ",...");
Jim Cownie5e8470a2013-09-27 10:38:44 +000072 while (*scan != '\0') scan++;
73 }
Andrey Churbanov74bf17b2015-04-02 13:27:08 +000074 KMP_SNPRINTF(scan, buf_len, "}");
Jim Cownie5e8470a2013-09-27 10:38:44 +000075 while (*scan != '\0') scan++;
76 KMP_ASSERT(scan <= end);
77 return buf;
78}
79
80
81void
82__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
83{
84 KMP_CPU_ZERO(mask);
85
Andrey Churbanov7daf9802015-01-27 16:52:57 +000086# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +000087
88 if (__kmp_num_proc_groups > 1) {
89 int group;
Jim Cownie5e8470a2013-09-27 10:38:44 +000090 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
91 for (group = 0; group < __kmp_num_proc_groups; group++) {
92 int i;
93 int num = __kmp_GetActiveProcessorCount(group);
94 for (i = 0; i < num; i++) {
95 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
96 }
97 }
98 }
99 else
100
Andrey Churbanov7daf9802015-01-27 16:52:57 +0000101# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +0000102
103 {
104 int proc;
105 for (proc = 0; proc < __kmp_xproc; proc++) {
106 KMP_CPU_SET(proc, mask);
107 }
108 }
109}
110
111
112//
113// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114// functions.
115//
116// The icc codegen emits sections with extremely long names, of the form
117// ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
118// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119// some sort of memory corruption or table overflow that is triggered by
120// these long strings. I checked the latest version of the linker -
121// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122// fixed.
123//
124// Unfortunately, my attempts to reproduce it in a smaller example have
125// failed - I'm not sure what the prospects are of getting it fixed
Jonathan Peyton66338292015-06-01 02:37:28 +0000126// properly - but we need a reproducer smaller than all of libomp.
Jim Cownie5e8470a2013-09-27 10:38:44 +0000127//
128// Work around the problem by avoiding inline constructors in such builds.
129// We do this for all platforms, not just Linux* OS - non-inline functions are
130// more debuggable and provide better coverage into than inline functions.
131// Use inline functions in shipping libs, for performance.
132//
133
134# if !defined(KMP_DEBUG) && !defined(COVER)
135
136class Address {
137public:
138 static const unsigned maxDepth = 32;
139 unsigned labels[maxDepth];
140 unsigned childNums[maxDepth];
141 unsigned depth;
142 unsigned leader;
143 Address(unsigned _depth)
144 : depth(_depth), leader(FALSE) {
145 }
146 Address &operator=(const Address &b) {
147 depth = b.depth;
148 for (unsigned i = 0; i < depth; i++) {
149 labels[i] = b.labels[i];
150 childNums[i] = b.childNums[i];
151 }
152 leader = FALSE;
153 return *this;
154 }
155 bool operator==(const Address &b) const {
156 if (depth != b.depth)
157 return false;
158 for (unsigned i = 0; i < depth; i++)
159 if(labels[i] != b.labels[i])
160 return false;
161 return true;
162 }
163 bool isClose(const Address &b, int level) const {
164 if (depth != b.depth)
165 return false;
166 if ((unsigned)level >= depth)
167 return true;
168 for (unsigned i = 0; i < (depth - level); i++)
169 if(labels[i] != b.labels[i])
170 return false;
171 return true;
172 }
173 bool operator!=(const Address &b) const {
174 return !operator==(b);
175 }
176};
177
178class AddrUnsPair {
179public:
180 Address first;
181 unsigned second;
182 AddrUnsPair(Address _first, unsigned _second)
183 : first(_first), second(_second) {
184 }
185 AddrUnsPair &operator=(const AddrUnsPair &b)
186 {
187 first = b.first;
188 second = b.second;
189 return *this;
190 }
191};
192
193# else
194
195class Address {
196public:
197 static const unsigned maxDepth = 32;
198 unsigned labels[maxDepth];
199 unsigned childNums[maxDepth];
200 unsigned depth;
201 unsigned leader;
202 Address(unsigned _depth);
203 Address &operator=(const Address &b);
204 bool operator==(const Address &b) const;
205 bool isClose(const Address &b, int level) const;
206 bool operator!=(const Address &b) const;
207};
208
209Address::Address(unsigned _depth)
210{
211 depth = _depth;
212 leader = FALSE;
213}
214
215Address &Address::operator=(const Address &b) {
216 depth = b.depth;
217 for (unsigned i = 0; i < depth; i++) {
218 labels[i] = b.labels[i];
219 childNums[i] = b.childNums[i];
220 }
221 leader = FALSE;
222 return *this;
223}
224
225bool Address::operator==(const Address &b) const {
226 if (depth != b.depth)
227 return false;
228 for (unsigned i = 0; i < depth; i++)
229 if(labels[i] != b.labels[i])
230 return false;
231 return true;
232}
233
234bool Address::isClose(const Address &b, int level) const {
235 if (depth != b.depth)
236 return false;
237 if ((unsigned)level >= depth)
238 return true;
239 for (unsigned i = 0; i < (depth - level); i++)
240 if(labels[i] != b.labels[i])
241 return false;
242 return true;
243}
244
245bool Address::operator!=(const Address &b) const {
246 return !operator==(b);
247}
248
249class AddrUnsPair {
250public:
251 Address first;
252 unsigned second;
253 AddrUnsPair(Address _first, unsigned _second);
254 AddrUnsPair &operator=(const AddrUnsPair &b);
255};
256
257AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
258 : first(_first), second(_second)
259{
260}
261
262AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263{
264 first = b.first;
265 second = b.second;
266 return *this;
267}
268
269# endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270
271
272static int
273__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274{
275 const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276 ->first);
277 const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278 ->first);
279 unsigned depth = aa->depth;
280 unsigned i;
281 KMP_DEBUG_ASSERT(depth == bb->depth);
282 for (i = 0; i < depth; i++) {
283 if (aa->labels[i] < bb->labels[i]) return -1;
284 if (aa->labels[i] > bb->labels[i]) return 1;
285 }
286 return 0;
287}
288
289
290static int
291__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
292{
293 const Address *aa = (const Address *)&(((AddrUnsPair *)a)
294 ->first);
295 const Address *bb = (const Address *)&(((AddrUnsPair *)b)
296 ->first);
297 unsigned depth = aa->depth;
298 unsigned i;
299 KMP_DEBUG_ASSERT(depth == bb->depth);
300 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
301 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
302 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
303 int j = depth - i - 1;
304 if (aa->childNums[j] < bb->childNums[j]) return -1;
305 if (aa->childNums[j] > bb->childNums[j]) return 1;
306 }
307 for (; i < depth; i++) {
308 int j = i - __kmp_affinity_compact;
309 if (aa->childNums[j] < bb->childNums[j]) return -1;
310 if (aa->childNums[j] > bb->childNums[j]) return 1;
311 }
312 return 0;
313}
314
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000315/** A structure for holding machine-specific hierarchy info to be computed once at init.
316 This structure represents a mapping of threads to the actual machine hierarchy, or to
317 our best guess at what the hierarchy might be, for the purpose of performing an
318 efficient barrier. In the worst case, when there is no machine hierarchy information,
319 it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000320class hierarchy_info {
321public:
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000322 /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
323 or socket, packages/node, nodes/machine, etc. We don't want to get specific with
324 nomenclature. When the machine is oversubscribed we add levels to duplicate the
325 hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
326 kmp_uint32 maxLevels;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000327
328 /** This is specifically the depth of the machine configuration hierarchy, in terms of the
329 number of levels along the longest path from root to any leaf. It corresponds to the
330 number of entries in numPerLevel if we exclude all but one trailing 1. */
331 kmp_uint32 depth;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000332 kmp_uint32 base_num_threads;
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +0000333 volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000334 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000335
336 /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
337 node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
338 and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000339 kmp_uint32 *numPerLevel;
340 kmp_uint32 *skipPerLevel;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000341
342 void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
343 int hier_depth = adr2os[0].first.depth;
344 int level = 0;
345 for (int i=hier_depth-1; i>=0; --i) {
346 int max = -1;
347 for (int j=0; j<num_addrs; ++j) {
348 int next = adr2os[j].first.childNums[i];
349 if (next > max) max = next;
350 }
351 numPerLevel[level] = max+1;
352 ++level;
353 }
354 }
355
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000356 hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}
357
358 // TO FIX: This destructor causes a segfault in the library at shutdown.
359 //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
360
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000361 void init(AddrUnsPair *adr2os, int num_addrs)
362 {
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +0000363 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
364 if (bool_result == 0) { // Wait for initialization
365 while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
366 return;
367 }
368 KMP_DEBUG_ASSERT(bool_result==1);
369
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000370 /* Added explicit initialization of the data fields here to prevent usage of dirty value
Andrey Churbanovb41e62b2015-02-10 20:10:21 +0000371 observed when static library is re-initialized multiple times (e.g. when
372 non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
373 depth = 1;
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000374 resizing = 0;
375 maxLevels = 7;
376 numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
377 skipPerLevel = &(numPerLevel[maxLevels]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000378 for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
379 numPerLevel[i] = 1;
380 skipPerLevel[i] = 1;
381 }
382
383 // Sort table by physical ID
384 if (adr2os) {
385 qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
386 deriveLevels(adr2os, num_addrs);
387 }
388 else {
389 numPerLevel[0] = 4;
390 numPerLevel[1] = num_addrs/4;
391 if (num_addrs%4) numPerLevel[1]++;
392 }
393
394 base_num_threads = num_addrs;
395 for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
396 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
397 depth++;
398
399 kmp_uint32 branch = 4;
400 if (numPerLevel[0] == 1) branch = num_addrs/4;
401 if (branch<4) branch=4;
402 for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
403 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
404 if (numPerLevel[d] & 1) numPerLevel[d]++;
405 numPerLevel[d] = numPerLevel[d] >> 1;
406 if (numPerLevel[d+1] == 1) depth++;
407 numPerLevel[d+1] = numPerLevel[d+1] << 1;
408 }
409 if(numPerLevel[0] == 1) {
410 branch = branch >> 1;
411 if (branch<4) branch = 4;
412 }
413 }
414
415 for (kmp_uint32 i=1; i<depth; ++i)
416 skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +0000417 // Fill in hierarchy in the case of oversubscription
418 for (kmp_uint32 i=depth; i<maxLevels; ++i)
419 skipPerLevel[i] = 2*skipPerLevel[i-1];
420
421 uninitialized = 0; // One writer
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000422
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000423 }
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000424
425 void resize(kmp_uint32 nproc)
426 {
427 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
428 if (bool_result == 0) { // Someone else is resizing
429 while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
430 return;
431 }
432 KMP_DEBUG_ASSERT(bool_result!=0);
433 KMP_DEBUG_ASSERT(nproc > base_num_threads);
434
435 // Calculate new max_levels
436 kmp_uint32 old_sz = skipPerLevel[depth-1];
437 kmp_uint32 incs = 0, old_maxLevels= maxLevels;
438 while (nproc > old_sz) {
439 old_sz *=2;
440 incs++;
441 }
442 maxLevels += incs;
443
444 // Resize arrays
445 kmp_uint32 *old_numPerLevel = numPerLevel;
446 kmp_uint32 *old_skipPerLevel = skipPerLevel;
447 numPerLevel = skipPerLevel = NULL;
448 numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
449 skipPerLevel = &(numPerLevel[maxLevels]);
450
451 // Copy old elements from old arrays
452 for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
453 numPerLevel[i] = old_numPerLevel[i];
454 skipPerLevel[i] = old_skipPerLevel[i];
455 }
456
457 // Init new elements in arrays to 1
458 for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
459 numPerLevel[i] = 1;
460 skipPerLevel[i] = 1;
461 }
462
463 // Free old arrays
464 __kmp_free(old_numPerLevel);
465
466 // Fill in oversubscription levels of hierarchy
467 for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
468 skipPerLevel[i] = 2*skipPerLevel[i-1];
469
470 base_num_threads = nproc;
471 resizing = 0; // One writer
472
473 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000474};
475
476static hierarchy_info machine_hierarchy;
477
478void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
Andrey Churbanov1362ae72015-04-02 13:18:50 +0000479 kmp_uint32 depth;
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +0000480 // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
481 if (TCR_1(machine_hierarchy.uninitialized))
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000482 machine_hierarchy.init(NULL, nproc);
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000483 // Adjust the hierarchy in case num threads exceeds original
484 if (nproc > machine_hierarchy.base_num_threads)
485 machine_hierarchy.resize(nproc);
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000486
Andrey Churbanov1362ae72015-04-02 13:18:50 +0000487 depth = machine_hierarchy.depth;
488 KMP_DEBUG_ASSERT(depth > 0);
Jonathan Peyton7f09a982015-06-22 15:59:18 +0000489 // The loop below adjusts the depth in the case of a resize
490 while (nproc > machine_hierarchy.skipPerLevel[depth-1])
Andrey Churbanov1362ae72015-04-02 13:18:50 +0000491 depth++;
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +0000492
Andrey Churbanov1362ae72015-04-02 13:18:50 +0000493 thr_bar->depth = depth;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000494 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
495 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
496}
Jim Cownie5e8470a2013-09-27 10:38:44 +0000497
498//
499// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
500// called to renumber the labels from [0..n] and place them into the child_num
501// vector of the address object. This is done in case the labels used for
Alp Toker8f2d3f02014-02-24 10:40:15 +0000502// the children at one node of the hierarchy differ from those used for
Jim Cownie5e8470a2013-09-27 10:38:44 +0000503// another node at the same level. Example: suppose the machine has 2 nodes
504// with 2 packages each. The first node contains packages 601 and 602, and
505// second node contains packages 603 and 604. If we try to sort the table
506// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
507// because we are paying attention to the labels themselves, not the ordinal
508// child numbers. By using the child numbers in the sort, the result is
509// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
510//
511static void
512__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
513 int numAddrs)
514{
515 KMP_DEBUG_ASSERT(numAddrs > 0);
516 int depth = address2os->first.depth;
517 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
518 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
519 * sizeof(unsigned));
520 int labCt;
521 for (labCt = 0; labCt < depth; labCt++) {
522 address2os[0].first.childNums[labCt] = counts[labCt] = 0;
523 lastLabel[labCt] = address2os[0].first.labels[labCt];
524 }
525 int i;
526 for (i = 1; i < numAddrs; i++) {
527 for (labCt = 0; labCt < depth; labCt++) {
528 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
529 int labCt2;
530 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
531 counts[labCt2] = 0;
532 lastLabel[labCt2] = address2os[i].first.labels[labCt2];
533 }
534 counts[labCt]++;
535 lastLabel[labCt] = address2os[i].first.labels[labCt];
536 break;
537 }
538 }
539 for (labCt = 0; labCt < depth; labCt++) {
540 address2os[i].first.childNums[labCt] = counts[labCt];
541 }
542 for (; labCt < (int)Address::maxDepth; labCt++) {
543 address2os[i].first.childNums[labCt] = 0;
544 }
545 }
546}
547
548
549//
550// All of the __kmp_affinity_create_*_map() routines should set
551// __kmp_affinity_masks to a vector of affinity mask objects of length
552// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
553// return the number of levels in the machine topology tree (zero if
554// __kmp_affinity_type == affinity_none).
555//
556// All of the __kmp_affinity_create_*_map() routines should set *fullMask
557// to the affinity mask for the initialization thread. They need to save and
558// restore the mask, and it could be needed later, so saving it is just an
559// optimization to avoid calling kmp_get_system_affinity() again.
560//
561static kmp_affin_mask_t *fullMask = NULL;
562
563kmp_affin_mask_t *
564__kmp_affinity_get_fullMask() { return fullMask; }
565
566
567static int nCoresPerPkg, nPackages;
Andrey Churbanovf696c822015-01-27 16:55:43 +0000568static int __kmp_nThreadsPerCore;
569#ifndef KMP_DFLT_NTH_CORES
570static int __kmp_ncores;
571#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000572
573//
574// __kmp_affinity_uniform_topology() doesn't work when called from
575// places which support arbitrarily many levels in the machine topology
576// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
577// __kmp_affinity_create_x2apicid_map().
578//
579inline static bool
580__kmp_affinity_uniform_topology()
581{
582 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
583}
584
585
586//
587// Print out the detailed machine topology map, i.e. the physical locations
588// of each OS proc.
589//
590static void
591__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
592 int pkgLevel, int coreLevel, int threadLevel)
593{
594 int proc;
595
596 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
597 for (proc = 0; proc < len; proc++) {
598 int level;
599 kmp_str_buf_t buf;
600 __kmp_str_buf_init(&buf);
601 for (level = 0; level < depth; level++) {
602 if (level == threadLevel) {
603 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
604 }
605 else if (level == coreLevel) {
606 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
607 }
608 else if (level == pkgLevel) {
609 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
610 }
611 else if (level > pkgLevel) {
612 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
613 level - pkgLevel - 1);
614 }
615 else {
616 __kmp_str_buf_print(&buf, "L%d ", level);
617 }
618 __kmp_str_buf_print(&buf, "%d ",
619 address2os[proc].first.labels[level]);
620 }
621 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
622 buf.str);
623 __kmp_str_buf_free(&buf);
624 }
625}
626
627
628//
629// If we don't know how to retrieve the machine's processor topology, or
630// encounter an error in doing so, this routine is called to form a "flat"
631// mapping of os thread id's <-> processor id's.
632//
633static int
634__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
635 kmp_i18n_id_t *const msg_id)
636{
637 *address2os = NULL;
638 *msg_id = kmp_i18n_null;
639
640 //
641 // Even if __kmp_affinity_type == affinity_none, this routine might still
Andrey Churbanovf696c822015-01-27 16:55:43 +0000642 // called to set __kmp_ncores, as well as
Jim Cownie5e8470a2013-09-27 10:38:44 +0000643 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
644 //
645 if (! KMP_AFFINITY_CAPABLE()) {
646 KMP_ASSERT(__kmp_affinity_type == affinity_none);
647 __kmp_ncores = nPackages = __kmp_xproc;
648 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000649 if (__kmp_affinity_verbose) {
650 KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
651 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
652 KMP_INFORM(Uniform, "KMP_AFFINITY");
653 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
654 __kmp_nThreadsPerCore, __kmp_ncores);
655 }
656 return 0;
657 }
658
659 //
660 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +0000661 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +0000662 // nCoresPerPkg, & nPackages. Make sure all these vars are set
663 // correctly, and return now if affinity is not enabled.
664 //
665 __kmp_ncores = nPackages = __kmp_avail_proc;
666 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000667 if (__kmp_affinity_verbose) {
668 char buf[KMP_AFFIN_MASK_PRINT_LEN];
669 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
670
671 KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
672 if (__kmp_affinity_respect_mask) {
673 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
674 } else {
675 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
676 }
677 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
678 KMP_INFORM(Uniform, "KMP_AFFINITY");
679 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
680 __kmp_nThreadsPerCore, __kmp_ncores);
681 }
682 if (__kmp_affinity_type == affinity_none) {
683 return 0;
684 }
685
686 //
687 // Contruct the data structure to be returned.
688 //
689 *address2os = (AddrUnsPair*)
690 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
691 int avail_ct = 0;
692 unsigned int i;
693 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
694 //
695 // Skip this proc if it is not included in the machine model.
696 //
697 if (! KMP_CPU_ISSET(i, fullMask)) {
698 continue;
699 }
700
701 Address addr(1);
702 addr.labels[0] = i;
703 (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
704 }
705 if (__kmp_affinity_verbose) {
706 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
707 }
708
709 if (__kmp_affinity_gran_levels < 0) {
710 //
711 // Only the package level is modeled in the machine topology map,
712 // so the #levels of granularity is either 0 or 1.
713 //
714 if (__kmp_affinity_gran > affinity_gran_package) {
715 __kmp_affinity_gran_levels = 1;
716 }
717 else {
718 __kmp_affinity_gran_levels = 0;
719 }
720 }
721 return 1;
722}
723
724
Andrey Churbanov7daf9802015-01-27 16:52:57 +0000725# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +0000726
727//
728// If multiple Windows* OS processor groups exist, we can create a 2-level
729// topology map with the groups at level 0 and the individual procs at
730// level 1.
731//
732// This facilitates letting the threads float among all procs in a group,
733// if granularity=group (the default when there are multiple groups).
734//
735static int
736__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
737 kmp_i18n_id_t *const msg_id)
738{
739 *address2os = NULL;
740 *msg_id = kmp_i18n_null;
741
742 //
743 // If we don't have multiple processor groups, return now.
744 // The flat mapping will be used.
745 //
746 if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
747 // FIXME set *msg_id
748 return -1;
749 }
750
751 //
752 // Contruct the data structure to be returned.
753 //
754 *address2os = (AddrUnsPair*)
755 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
756 int avail_ct = 0;
757 int i;
758 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
759 //
760 // Skip this proc if it is not included in the machine model.
761 //
762 if (! KMP_CPU_ISSET(i, fullMask)) {
763 continue;
764 }
765
766 Address addr(2);
767 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
768 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
769 (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
770
771 if (__kmp_affinity_verbose) {
772 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
773 addr.labels[1]);
774 }
775 }
776
777 if (__kmp_affinity_gran_levels < 0) {
778 if (__kmp_affinity_gran == affinity_gran_group) {
779 __kmp_affinity_gran_levels = 1;
780 }
781 else if ((__kmp_affinity_gran == affinity_gran_fine)
782 || (__kmp_affinity_gran == affinity_gran_thread)) {
783 __kmp_affinity_gran_levels = 0;
784 }
785 else {
786 const char *gran_str = NULL;
787 if (__kmp_affinity_gran == affinity_gran_core) {
788 gran_str = "core";
789 }
790 else if (__kmp_affinity_gran == affinity_gran_package) {
791 gran_str = "package";
792 }
793 else if (__kmp_affinity_gran == affinity_gran_node) {
794 gran_str = "node";
795 }
796 else {
797 KMP_ASSERT(0);
798 }
799
800 // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
801 __kmp_affinity_gran_levels = 0;
802 }
803 }
804 return 2;
805}
806
Andrey Churbanov7daf9802015-01-27 16:52:57 +0000807# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +0000808
809
810# if KMP_ARCH_X86 || KMP_ARCH_X86_64
811
812static int
813__kmp_cpuid_mask_width(int count) {
814 int r = 0;
815
816 while((1<<r) < count)
817 ++r;
818 return r;
819}
820
821
822class apicThreadInfo {
823public:
824 unsigned osId; // param to __kmp_affinity_bind_thread
825 unsigned apicId; // from cpuid after binding
826 unsigned maxCoresPerPkg; // ""
827 unsigned maxThreadsPerPkg; // ""
828 unsigned pkgId; // inferred from above values
829 unsigned coreId; // ""
830 unsigned threadId; // ""
831};
832
833
834static int
835__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
836{
837 const apicThreadInfo *aa = (const apicThreadInfo *)a;
838 const apicThreadInfo *bb = (const apicThreadInfo *)b;
839 if (aa->osId < bb->osId) return -1;
840 if (aa->osId > bb->osId) return 1;
841 return 0;
842}
843
844
845static int
846__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
847{
848 const apicThreadInfo *aa = (const apicThreadInfo *)a;
849 const apicThreadInfo *bb = (const apicThreadInfo *)b;
850 if (aa->pkgId < bb->pkgId) return -1;
851 if (aa->pkgId > bb->pkgId) return 1;
852 if (aa->coreId < bb->coreId) return -1;
853 if (aa->coreId > bb->coreId) return 1;
854 if (aa->threadId < bb->threadId) return -1;
855 if (aa->threadId > bb->threadId) return 1;
856 return 0;
857}
858
859
860//
861// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
862// an algorithm which cycles through the available os threads, setting
863// the current thread's affinity mask to that thread, and then retrieves
864// the Apic Id for each thread context using the cpuid instruction.
865//
866static int
867__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
868 kmp_i18n_id_t *const msg_id)
869{
Andrey Churbanov1c331292015-01-27 17:03:42 +0000870 kmp_cpuid buf;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000871 int rc;
872 *address2os = NULL;
873 *msg_id = kmp_i18n_null;
874
Andrey Churbanov1c331292015-01-27 17:03:42 +0000875 //
876 // Check if cpuid leaf 4 is supported.
877 //
Jim Cownie5e8470a2013-09-27 10:38:44 +0000878 __kmp_x86_cpuid(0, 0, &buf);
879 if (buf.eax < 4) {
880 *msg_id = kmp_i18n_str_NoLeaf4Support;
881 return -1;
882 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000883
884 //
Jim Cownie5e8470a2013-09-27 10:38:44 +0000885 // The algorithm used starts by setting the affinity to each available
Andrey Churbanov1c331292015-01-27 17:03:42 +0000886 // thread and retrieving info from the cpuid instruction, so if we are
887 // not capable of calling __kmp_get_system_affinity() and
888 // _kmp_get_system_affinity(), then we need to do something else - use
889 // the defaults that we calculated from issuing cpuid without binding
890 // to each proc.
Jim Cownie5e8470a2013-09-27 10:38:44 +0000891 //
892 if (! KMP_AFFINITY_CAPABLE()) {
893 //
894 // Hack to try and infer the machine topology using only the data
895 // available from cpuid on the current thread, and __kmp_xproc.
896 //
897 KMP_ASSERT(__kmp_affinity_type == affinity_none);
898
899 //
900 // Get an upper bound on the number of threads per package using
901 // cpuid(1).
902 //
903 // On some OS/chps combinations where HT is supported by the chip
904 // but is disabled, this value will be 2 on a single core chip.
905 // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
906 //
Jim Cownie5e8470a2013-09-27 10:38:44 +0000907 __kmp_x86_cpuid(1, 0, &buf);
908 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
909 if (maxThreadsPerPkg == 0) {
910 maxThreadsPerPkg = 1;
911 }
912
913 //
914 // The num cores per pkg comes from cpuid(4).
915 // 1 must be added to the encoded value.
916 //
917 // The author of cpu_count.cpp treated this only an upper bound
918 // on the number of cores, but I haven't seen any cases where it
919 // was greater than the actual number of cores, so we will treat
920 // it as exact in this block of code.
921 //
922 // First, we need to check if cpuid(4) is supported on this chip.
923 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
924 // has the value n or greater.
925 //
926 __kmp_x86_cpuid(0, 0, &buf);
927 if (buf.eax >= 4) {
928 __kmp_x86_cpuid(4, 0, &buf);
929 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
930 }
931 else {
932 nCoresPerPkg = 1;
933 }
934
935 //
936 // There is no way to reliably tell if HT is enabled without issuing
937 // the cpuid instruction from every thread, can correlating the cpuid
938 // info, so if the machine is not affinity capable, we assume that HT
939 // is off. We have seen quite a few machines where maxThreadsPerPkg
940 // is 2, yet the machine does not support HT.
941 //
942 // - Older OSes are usually found on machines with older chips, which
943 // do not support HT.
944 //
945 // - The performance penalty for mistakenly identifying a machine as
946 // HT when it isn't (which results in blocktime being incorrecly set
947 // to 0) is greater than the penalty when for mistakenly identifying
948 // a machine as being 1 thread/core when it is really HT enabled
949 // (which results in blocktime being incorrectly set to a positive
950 // value).
951 //
952 __kmp_ncores = __kmp_xproc;
953 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
954 __kmp_nThreadsPerCore = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000955 if (__kmp_affinity_verbose) {
956 KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
957 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
958 if (__kmp_affinity_uniform_topology()) {
959 KMP_INFORM(Uniform, "KMP_AFFINITY");
960 } else {
961 KMP_INFORM(NonUniform, "KMP_AFFINITY");
962 }
963 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
964 __kmp_nThreadsPerCore, __kmp_ncores);
965 }
966 return 0;
967 }
968
969 //
970 //
971 // From here on, we can assume that it is safe to call
972 // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
973 // even if __kmp_affinity_type = affinity_none.
974 //
975
976 //
977 // Save the affinity mask for the current thread.
978 //
979 kmp_affin_mask_t *oldMask;
980 KMP_CPU_ALLOC(oldMask);
981 KMP_ASSERT(oldMask != NULL);
982 __kmp_get_system_affinity(oldMask, TRUE);
983
984 //
985 // Run through each of the available contexts, binding the current thread
986 // to it, and obtaining the pertinent information using the cpuid instr.
987 //
988 // The relevant information is:
989 //
990 // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
991 // has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
992 //
993 // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
994 // value of this field determines the width of the core# + thread#
995 // fields in the Apic Id. It is also an upper bound on the number
996 // of threads per package, but it has been verified that situations
997 // happen were it is not exact. In particular, on certain OS/chip
998 // combinations where Intel(R) Hyper-Threading Technology is supported
999 // by the chip but has
1000 // been disabled, the value of this field will be 2 (for a single core
1001 // chip). On other OS/chip combinations supporting
1002 // Intel(R) Hyper-Threading Technology, the value of
1003 // this field will be 1 when Intel(R) Hyper-Threading Technology is
1004 // disabled and 2 when it is enabled.
1005 //
1006 // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
1007 // value of this field (+1) determines the width of the core# field in
1008 // the Apic Id. The comments in "cpucount.cpp" say that this value is
1009 // an upper bound, but the IA-32 architecture manual says that it is
1010 // exactly the number of cores per package, and I haven't seen any
1011 // case where it wasn't.
1012 //
1013 // From this information, deduce the package Id, core Id, and thread Id,
1014 // and set the corresponding fields in the apicThreadInfo struct.
1015 //
1016 unsigned i;
1017 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1018 __kmp_avail_proc * sizeof(apicThreadInfo));
1019 unsigned nApics = 0;
1020 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
1021 //
1022 // Skip this proc if it is not included in the machine model.
1023 //
1024 if (! KMP_CPU_ISSET(i, fullMask)) {
1025 continue;
1026 }
1027 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1028
1029 __kmp_affinity_bind_thread(i);
1030 threadInfo[nApics].osId = i;
1031
1032 //
1033 // The apic id and max threads per pkg come from cpuid(1).
1034 //
Jim Cownie5e8470a2013-09-27 10:38:44 +00001035 __kmp_x86_cpuid(1, 0, &buf);
1036 if (! (buf.edx >> 9) & 1) {
1037 __kmp_set_system_affinity(oldMask, TRUE);
1038 __kmp_free(threadInfo);
1039 KMP_CPU_FREE(oldMask);
1040 *msg_id = kmp_i18n_str_ApicNotPresent;
1041 return -1;
1042 }
1043 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1044 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1045 if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1046 threadInfo[nApics].maxThreadsPerPkg = 1;
1047 }
1048
1049 //
1050 // Max cores per pkg comes from cpuid(4).
1051 // 1 must be added to the encoded value.
1052 //
1053 // First, we need to check if cpuid(4) is supported on this chip.
1054 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1055 // has the value n or greater.
1056 //
1057 __kmp_x86_cpuid(0, 0, &buf);
1058 if (buf.eax >= 4) {
1059 __kmp_x86_cpuid(4, 0, &buf);
1060 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1061 }
1062 else {
1063 threadInfo[nApics].maxCoresPerPkg = 1;
1064 }
1065
1066 //
1067 // Infer the pkgId / coreId / threadId using only the info
1068 // obtained locally.
1069 //
1070 int widthCT = __kmp_cpuid_mask_width(
1071 threadInfo[nApics].maxThreadsPerPkg);
1072 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1073
1074 int widthC = __kmp_cpuid_mask_width(
1075 threadInfo[nApics].maxCoresPerPkg);
1076 int widthT = widthCT - widthC;
1077 if (widthT < 0) {
1078 //
1079 // I've never seen this one happen, but I suppose it could, if
1080 // the cpuid instruction on a chip was really screwed up.
1081 // Make sure to restore the affinity mask before the tail call.
1082 //
1083 __kmp_set_system_affinity(oldMask, TRUE);
1084 __kmp_free(threadInfo);
1085 KMP_CPU_FREE(oldMask);
1086 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1087 return -1;
1088 }
1089
1090 int maskC = (1 << widthC) - 1;
1091 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1092 &maskC;
1093
1094 int maskT = (1 << widthT) - 1;
1095 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1096
1097 nApics++;
1098 }
1099
1100 //
1101 // We've collected all the info we need.
1102 // Restore the old affinity mask for this thread.
1103 //
1104 __kmp_set_system_affinity(oldMask, TRUE);
1105
1106 //
1107 // If there's only one thread context to bind to, form an Address object
1108 // with depth 1 and return immediately (or, if affinity is off, set
1109 // address2os to NULL and return).
1110 //
1111 // If it is configured to omit the package level when there is only a
1112 // single package, the logic at the end of this routine won't work if
1113 // there is only a single thread - it would try to form an Address
1114 // object with depth 0.
1115 //
1116 KMP_ASSERT(nApics > 0);
1117 if (nApics == 1) {
1118 __kmp_ncores = nPackages = 1;
1119 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001120 if (__kmp_affinity_verbose) {
1121 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1122 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1123
1124 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1125 if (__kmp_affinity_respect_mask) {
1126 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1127 } else {
1128 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1129 }
1130 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1131 KMP_INFORM(Uniform, "KMP_AFFINITY");
1132 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1133 __kmp_nThreadsPerCore, __kmp_ncores);
1134 }
1135
1136 if (__kmp_affinity_type == affinity_none) {
1137 __kmp_free(threadInfo);
1138 KMP_CPU_FREE(oldMask);
1139 return 0;
1140 }
1141
1142 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1143 Address addr(1);
1144 addr.labels[0] = threadInfo[0].pkgId;
1145 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1146
1147 if (__kmp_affinity_gran_levels < 0) {
1148 __kmp_affinity_gran_levels = 0;
1149 }
1150
1151 if (__kmp_affinity_verbose) {
1152 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1153 }
1154
1155 __kmp_free(threadInfo);
1156 KMP_CPU_FREE(oldMask);
1157 return 1;
1158 }
1159
1160 //
1161 // Sort the threadInfo table by physical Id.
1162 //
1163 qsort(threadInfo, nApics, sizeof(*threadInfo),
1164 __kmp_affinity_cmp_apicThreadInfo_phys_id);
1165
1166 //
1167 // The table is now sorted by pkgId / coreId / threadId, but we really
1168 // don't know the radix of any of the fields. pkgId's may be sparsely
1169 // assigned among the chips on a system. Although coreId's are usually
1170 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1171 // [0..threadsPerCore-1], we don't want to make any such assumptions.
1172 //
1173 // For that matter, we don't know what coresPerPkg and threadsPerCore
1174 // (or the total # packages) are at this point - we want to determine
1175 // that now. We only have an upper bound on the first two figures.
1176 //
1177 // We also perform a consistency check at this point: the values returned
1178 // by the cpuid instruction for any thread bound to a given package had
1179 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1180 //
1181 nPackages = 1;
1182 nCoresPerPkg = 1;
1183 __kmp_nThreadsPerCore = 1;
1184 unsigned nCores = 1;
1185
1186 unsigned pkgCt = 1; // to determine radii
1187 unsigned lastPkgId = threadInfo[0].pkgId;
1188 unsigned coreCt = 1;
1189 unsigned lastCoreId = threadInfo[0].coreId;
1190 unsigned threadCt = 1;
1191 unsigned lastThreadId = threadInfo[0].threadId;
1192
1193 // intra-pkg consist checks
1194 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1195 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1196
1197 for (i = 1; i < nApics; i++) {
1198 if (threadInfo[i].pkgId != lastPkgId) {
1199 nCores++;
1200 pkgCt++;
1201 lastPkgId = threadInfo[i].pkgId;
1202 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1203 coreCt = 1;
1204 lastCoreId = threadInfo[i].coreId;
1205 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1206 threadCt = 1;
1207 lastThreadId = threadInfo[i].threadId;
1208
1209 //
1210 // This is a different package, so go on to the next iteration
1211 // without doing any consistency checks. Reset the consistency
1212 // check vars, though.
1213 //
1214 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1215 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1216 continue;
1217 }
1218
1219 if (threadInfo[i].coreId != lastCoreId) {
1220 nCores++;
1221 coreCt++;
1222 lastCoreId = threadInfo[i].coreId;
1223 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1224 threadCt = 1;
1225 lastThreadId = threadInfo[i].threadId;
1226 }
1227 else if (threadInfo[i].threadId != lastThreadId) {
1228 threadCt++;
1229 lastThreadId = threadInfo[i].threadId;
1230 }
1231 else {
1232 __kmp_free(threadInfo);
1233 KMP_CPU_FREE(oldMask);
1234 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1235 return -1;
1236 }
1237
1238 //
1239 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1240 // fields agree between all the threads bounds to a given package.
1241 //
1242 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1243 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1244 __kmp_free(threadInfo);
1245 KMP_CPU_FREE(oldMask);
1246 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1247 return -1;
1248 }
1249 }
1250 nPackages = pkgCt;
1251 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1252 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1253
1254 //
1255 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +00001256 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +00001257 // nCoresPerPkg, & nPackages. Make sure all these vars are set
1258 // correctly, and return now if affinity is not enabled.
1259 //
Jim Cownie5e8470a2013-09-27 10:38:44 +00001260 __kmp_ncores = nCores;
1261 if (__kmp_affinity_verbose) {
1262 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1263 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1264
1265 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1266 if (__kmp_affinity_respect_mask) {
1267 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1268 } else {
1269 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1270 }
1271 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1272 if (__kmp_affinity_uniform_topology()) {
1273 KMP_INFORM(Uniform, "KMP_AFFINITY");
1274 } else {
1275 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1276 }
1277 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1278 __kmp_nThreadsPerCore, __kmp_ncores);
1279
1280 }
1281
1282 if (__kmp_affinity_type == affinity_none) {
1283 __kmp_free(threadInfo);
1284 KMP_CPU_FREE(oldMask);
1285 return 0;
1286 }
1287
1288 //
1289 // Now that we've determined the number of packages, the number of cores
1290 // per package, and the number of threads per core, we can construct the
1291 // data structure that is to be returned.
1292 //
1293 int pkgLevel = 0;
1294 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1295 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1296 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1297
1298 KMP_ASSERT(depth > 0);
1299 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1300
1301 for (i = 0; i < nApics; ++i) {
1302 Address addr(depth);
1303 unsigned os = threadInfo[i].osId;
1304 int d = 0;
1305
1306 if (pkgLevel >= 0) {
1307 addr.labels[d++] = threadInfo[i].pkgId;
1308 }
1309 if (coreLevel >= 0) {
1310 addr.labels[d++] = threadInfo[i].coreId;
1311 }
1312 if (threadLevel >= 0) {
1313 addr.labels[d++] = threadInfo[i].threadId;
1314 }
1315 (*address2os)[i] = AddrUnsPair(addr, os);
1316 }
1317
1318 if (__kmp_affinity_gran_levels < 0) {
1319 //
1320 // Set the granularity level based on what levels are modeled
1321 // in the machine topology map.
1322 //
1323 __kmp_affinity_gran_levels = 0;
1324 if ((threadLevel >= 0)
1325 && (__kmp_affinity_gran > affinity_gran_thread)) {
1326 __kmp_affinity_gran_levels++;
1327 }
1328 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1329 __kmp_affinity_gran_levels++;
1330 }
1331 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1332 __kmp_affinity_gran_levels++;
1333 }
1334 }
1335
1336 if (__kmp_affinity_verbose) {
1337 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1338 coreLevel, threadLevel);
1339 }
1340
1341 __kmp_free(threadInfo);
1342 KMP_CPU_FREE(oldMask);
1343 return depth;
1344}
1345
1346
1347//
1348// Intel(R) microarchitecture code name Nehalem, Dunnington and later
1349// architectures support a newer interface for specifying the x2APIC Ids,
1350// based on cpuid leaf 11.
1351//
1352static int
1353__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1354 kmp_i18n_id_t *const msg_id)
1355{
1356 kmp_cpuid buf;
1357
1358 *address2os = NULL;
1359 *msg_id = kmp_i18n_null;
1360
1361 //
1362 // Check to see if cpuid leaf 11 is supported.
1363 //
1364 __kmp_x86_cpuid(0, 0, &buf);
1365 if (buf.eax < 11) {
1366 *msg_id = kmp_i18n_str_NoLeaf11Support;
1367 return -1;
1368 }
1369 __kmp_x86_cpuid(11, 0, &buf);
1370 if (buf.ebx == 0) {
1371 *msg_id = kmp_i18n_str_NoLeaf11Support;
1372 return -1;
1373 }
1374
1375 //
1376 // Find the number of levels in the machine topology. While we're at it,
1377 // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1378 // try to get more accurate values later by explicitly counting them,
1379 // but get reasonable defaults now, in case we return early.
1380 //
1381 int level;
1382 int threadLevel = -1;
1383 int coreLevel = -1;
1384 int pkgLevel = -1;
1385 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1386
1387 for (level = 0;; level++) {
1388 if (level > 31) {
1389 //
1390 // FIXME: Hack for DPD200163180
1391 //
1392 // If level is big then something went wrong -> exiting
1393 //
1394 // There could actually be 32 valid levels in the machine topology,
1395 // but so far, the only machine we have seen which does not exit
1396 // this loop before iteration 32 has fubar x2APIC settings.
1397 //
1398 // For now, just reject this case based upon loop trip count.
1399 //
1400 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1401 return -1;
1402 }
1403 __kmp_x86_cpuid(11, level, &buf);
1404 if (buf.ebx == 0) {
1405 if (pkgLevel < 0) {
1406 //
1407 // Will infer nPackages from __kmp_xproc
1408 //
1409 pkgLevel = level;
1410 level++;
1411 }
1412 break;
1413 }
1414 int kind = (buf.ecx >> 8) & 0xff;
1415 if (kind == 1) {
1416 //
1417 // SMT level
1418 //
1419 threadLevel = level;
1420 coreLevel = -1;
1421 pkgLevel = -1;
1422 __kmp_nThreadsPerCore = buf.ebx & 0xff;
1423 if (__kmp_nThreadsPerCore == 0) {
1424 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1425 return -1;
1426 }
1427 }
1428 else if (kind == 2) {
1429 //
1430 // core level
1431 //
1432 coreLevel = level;
1433 pkgLevel = -1;
1434 nCoresPerPkg = buf.ebx & 0xff;
1435 if (nCoresPerPkg == 0) {
1436 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1437 return -1;
1438 }
1439 }
1440 else {
1441 if (level <= 0) {
1442 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1443 return -1;
1444 }
1445 if (pkgLevel >= 0) {
1446 continue;
1447 }
1448 pkgLevel = level;
1449 nPackages = buf.ebx & 0xff;
1450 if (nPackages == 0) {
1451 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1452 return -1;
1453 }
1454 }
1455 }
1456 int depth = level;
1457
1458 //
1459 // In the above loop, "level" was counted from the finest level (usually
1460 // thread) to the coarsest. The caller expects that we will place the
1461 // labels in (*address2os)[].first.labels[] in the inverse order, so
1462 // we need to invert the vars saying which level means what.
1463 //
1464 if (threadLevel >= 0) {
1465 threadLevel = depth - threadLevel - 1;
1466 }
1467 if (coreLevel >= 0) {
1468 coreLevel = depth - coreLevel - 1;
1469 }
1470 KMP_DEBUG_ASSERT(pkgLevel >= 0);
1471 pkgLevel = depth - pkgLevel - 1;
1472
1473 //
1474 // The algorithm used starts by setting the affinity to each available
Andrey Churbanov1c331292015-01-27 17:03:42 +00001475 // thread and retrieving info from the cpuid instruction, so if we are
1476 // not capable of calling __kmp_get_system_affinity() and
1477 // _kmp_get_system_affinity(), then we need to do something else - use
1478 // the defaults that we calculated from issuing cpuid without binding
1479 // to each proc.
Jim Cownie5e8470a2013-09-27 10:38:44 +00001480 //
1481 if (! KMP_AFFINITY_CAPABLE())
1482 {
1483 //
1484 // Hack to try and infer the machine topology using only the data
1485 // available from cpuid on the current thread, and __kmp_xproc.
1486 //
1487 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1488
1489 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1490 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001491 if (__kmp_affinity_verbose) {
1492 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1493 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1494 if (__kmp_affinity_uniform_topology()) {
1495 KMP_INFORM(Uniform, "KMP_AFFINITY");
1496 } else {
1497 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1498 }
1499 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1500 __kmp_nThreadsPerCore, __kmp_ncores);
1501 }
1502 return 0;
1503 }
1504
1505 //
1506 //
1507 // From here on, we can assume that it is safe to call
1508 // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1509 // even if __kmp_affinity_type = affinity_none.
1510 //
1511
1512 //
1513 // Save the affinity mask for the current thread.
1514 //
1515 kmp_affin_mask_t *oldMask;
1516 KMP_CPU_ALLOC(oldMask);
1517 __kmp_get_system_affinity(oldMask, TRUE);
1518
1519 //
1520 // Allocate the data structure to be returned.
1521 //
1522 AddrUnsPair *retval = (AddrUnsPair *)
1523 __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1524
1525 //
1526 // Run through each of the available contexts, binding the current thread
1527 // to it, and obtaining the pertinent information using the cpuid instr.
1528 //
1529 unsigned int proc;
1530 int nApics = 0;
1531 for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1532 //
1533 // Skip this proc if it is not included in the machine model.
1534 //
1535 if (! KMP_CPU_ISSET(proc, fullMask)) {
1536 continue;
1537 }
1538 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1539
1540 __kmp_affinity_bind_thread(proc);
1541
1542 //
1543 // Extrach the labels for each level in the machine topology map
1544 // from the Apic ID.
1545 //
1546 Address addr(depth);
1547 int prev_shift = 0;
1548
1549 for (level = 0; level < depth; level++) {
1550 __kmp_x86_cpuid(11, level, &buf);
1551 unsigned apicId = buf.edx;
1552 if (buf.ebx == 0) {
1553 if (level != depth - 1) {
1554 KMP_CPU_FREE(oldMask);
1555 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1556 return -1;
1557 }
1558 addr.labels[depth - level - 1] = apicId >> prev_shift;
1559 level++;
1560 break;
1561 }
1562 int shift = buf.eax & 0x1f;
1563 int mask = (1 << shift) - 1;
1564 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1565 prev_shift = shift;
1566 }
1567 if (level != depth) {
1568 KMP_CPU_FREE(oldMask);
1569 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1570 return -1;
1571 }
1572
1573 retval[nApics] = AddrUnsPair(addr, proc);
1574 nApics++;
1575 }
1576
1577 //
1578 // We've collected all the info we need.
1579 // Restore the old affinity mask for this thread.
1580 //
1581 __kmp_set_system_affinity(oldMask, TRUE);
1582
1583 //
1584 // If there's only one thread context to bind to, return now.
1585 //
1586 KMP_ASSERT(nApics > 0);
1587 if (nApics == 1) {
1588 __kmp_ncores = nPackages = 1;
1589 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001590 if (__kmp_affinity_verbose) {
1591 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1592 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1593
1594 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1595 if (__kmp_affinity_respect_mask) {
1596 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1597 } else {
1598 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1599 }
1600 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1601 KMP_INFORM(Uniform, "KMP_AFFINITY");
1602 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1603 __kmp_nThreadsPerCore, __kmp_ncores);
1604 }
1605
1606 if (__kmp_affinity_type == affinity_none) {
1607 __kmp_free(retval);
1608 KMP_CPU_FREE(oldMask);
1609 return 0;
1610 }
1611
1612 //
1613 // Form an Address object which only includes the package level.
1614 //
1615 Address addr(1);
1616 addr.labels[0] = retval[0].first.labels[pkgLevel];
1617 retval[0].first = addr;
1618
1619 if (__kmp_affinity_gran_levels < 0) {
1620 __kmp_affinity_gran_levels = 0;
1621 }
1622
1623 if (__kmp_affinity_verbose) {
1624 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1625 }
1626
1627 *address2os = retval;
1628 KMP_CPU_FREE(oldMask);
1629 return 1;
1630 }
1631
1632 //
1633 // Sort the table by physical Id.
1634 //
1635 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1636
1637 //
1638 // Find the radix at each of the levels.
1639 //
1640 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1641 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1642 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1643 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1644 for (level = 0; level < depth; level++) {
1645 totals[level] = 1;
1646 maxCt[level] = 1;
1647 counts[level] = 1;
1648 last[level] = retval[0].first.labels[level];
1649 }
1650
1651 //
1652 // From here on, the iteration variable "level" runs from the finest
1653 // level to the coarsest, i.e. we iterate forward through
1654 // (*address2os)[].first.labels[] - in the previous loops, we iterated
1655 // backwards.
1656 //
1657 for (proc = 1; (int)proc < nApics; proc++) {
1658 int level;
1659 for (level = 0; level < depth; level++) {
1660 if (retval[proc].first.labels[level] != last[level]) {
1661 int j;
1662 for (j = level + 1; j < depth; j++) {
1663 totals[j]++;
1664 counts[j] = 1;
1665                    // The line below causes incorrect topology information to be
1666                    // printed when the maximum value for some level (maxCt[level]) is
1667                    // encountered earlier than a smaller value while walking the array.
1668                    // For example, if pkg0 has 4 cores and pkg1 has 2 cores, then
1669                    // maxCt[1] == 2 whereas it should be 4.
1670                    // TODO!!! Check if it can be commented out safely
1671 //maxCt[j] = 1;
1672 last[j] = retval[proc].first.labels[j];
1673 }
1674 totals[level]++;
1675 counts[level]++;
1676 if (counts[level] > maxCt[level]) {
1677 maxCt[level] = counts[level];
1678 }
1679 last[level] = retval[proc].first.labels[level];
1680 break;
1681 }
1682 else if (level == depth - 1) {
1683 __kmp_free(last);
1684 __kmp_free(maxCt);
1685 __kmp_free(counts);
1686 __kmp_free(totals);
1687 __kmp_free(retval);
1688 KMP_CPU_FREE(oldMask);
1689 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1690 return -1;
1691 }
1692 }
1693 }
1694
1695 //
1696 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +00001697 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +00001698 // nCoresPerPkg, & nPackages. Make sure all these vars are set
1699 // correctly, and return if affinity is not enabled.
1700 //
1701 if (threadLevel >= 0) {
1702 __kmp_nThreadsPerCore = maxCt[threadLevel];
1703 }
1704 else {
1705 __kmp_nThreadsPerCore = 1;
1706 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00001707 nPackages = totals[pkgLevel];
1708
1709 if (coreLevel >= 0) {
1710 __kmp_ncores = totals[coreLevel];
1711 nCoresPerPkg = maxCt[coreLevel];
1712 }
1713 else {
1714 __kmp_ncores = nPackages;
1715 nCoresPerPkg = 1;
1716 }
1717
1718 //
1719 // Check to see if the machine topology is uniform
1720 //
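    // The topology is uniform iff the product of the per-level radixes
    // equals the total number of leaf objects: e.g. 2 packages x 4 cores x
    // 2 threads gives prod == 16 == totals[depth - 1], whereas a package
    // with fewer cores would make prod exceed that total.
    //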
1721 unsigned prod = maxCt[0];
1722 for (level = 1; level < depth; level++) {
1723 prod *= maxCt[level];
1724 }
1725 bool uniform = (prod == totals[level - 1]);
1726
1727 //
1728 // Print the machine topology summary.
1729 //
1730 if (__kmp_affinity_verbose) {
1731 char mask[KMP_AFFIN_MASK_PRINT_LEN];
1732 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1733
1734 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1735 if (__kmp_affinity_respect_mask) {
1736 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1737 } else {
1738 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1739 }
1740 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1741 if (uniform) {
1742 KMP_INFORM(Uniform, "KMP_AFFINITY");
1743 } else {
1744 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1745 }
1746
1747 kmp_str_buf_t buf;
1748 __kmp_str_buf_init(&buf);
1749
1750 __kmp_str_buf_print(&buf, "%d", totals[0]);
1751 for (level = 1; level <= pkgLevel; level++) {
1752 __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1753 }
1754 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1755 __kmp_nThreadsPerCore, __kmp_ncores);
1756
1757 __kmp_str_buf_free(&buf);
1758 }
1759
1760 if (__kmp_affinity_type == affinity_none) {
1761 __kmp_free(last);
1762 __kmp_free(maxCt);
1763 __kmp_free(counts);
1764 __kmp_free(totals);
1765 __kmp_free(retval);
1766 KMP_CPU_FREE(oldMask);
1767 return 0;
1768 }
1769
1770 //
1771    // Find any levels with radix 1, and remove them from the map
1772 // (except for the package level).
1773 //
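    // For example, if hyper-threading is disabled, maxCt[threadLevel] == 1,
    // so the thread level is dropped and the map reduces to a package/core
    // topology of depth 2.
    //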
1774 int new_depth = 0;
1775 for (level = 0; level < depth; level++) {
1776 if ((maxCt[level] == 1) && (level != pkgLevel)) {
1777 continue;
1778 }
1779 new_depth++;
1780 }
1781
1782 //
1783 // If we are removing any levels, allocate a new vector to return,
1784 // and copy the relevant information to it.
1785 //
1786 if (new_depth != depth) {
1787 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1788 sizeof(AddrUnsPair) * nApics);
1789 for (proc = 0; (int)proc < nApics; proc++) {
1790 Address addr(new_depth);
1791 new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1792 }
1793 int new_level = 0;
Jonathan Peyton62f38402015-08-25 18:44:41 +00001794 int newPkgLevel = -1;
1795 int newCoreLevel = -1;
1796 int newThreadLevel = -1;
1797 int i;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001798 for (level = 0; level < depth; level++) {
Jonathan Peyton62f38402015-08-25 18:44:41 +00001799 if ((maxCt[level] == 1)
1800 && (level != pkgLevel)) {
1801 //
1802 // Remove this level. Never remove the package level
1803 //
1804 continue;
1805 }
1806 if (level == pkgLevel) {
1807 newPkgLevel = level;
1808 }
1809 if (level == coreLevel) {
1810 newCoreLevel = level;
1811 }
1812 if (level == threadLevel) {
1813 newThreadLevel = level;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001814 }
1815 for (proc = 0; (int)proc < nApics; proc++) {
1816 new_retval[proc].first.labels[new_level]
1817 = retval[proc].first.labels[level];
1818 }
1819 new_level++;
1820 }
1821
1822 __kmp_free(retval);
1823 retval = new_retval;
1824 depth = new_depth;
Jonathan Peyton62f38402015-08-25 18:44:41 +00001825 pkgLevel = newPkgLevel;
1826 coreLevel = newCoreLevel;
1827 threadLevel = newThreadLevel;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001828 }
1829
1830 if (__kmp_affinity_gran_levels < 0) {
1831 //
1832 // Set the granularity level based on what levels are modeled
1833 // in the machine topology map.
1834 //
1835 __kmp_affinity_gran_levels = 0;
1836 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1837 __kmp_affinity_gran_levels++;
1838 }
1839 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1840 __kmp_affinity_gran_levels++;
1841 }
1842 if (__kmp_affinity_gran > affinity_gran_package) {
1843 __kmp_affinity_gran_levels++;
1844 }
1845 }
1846
1847 if (__kmp_affinity_verbose) {
1848 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1849 coreLevel, threadLevel);
1850 }
1851
1852 __kmp_free(last);
1853 __kmp_free(maxCt);
1854 __kmp_free(counts);
1855 __kmp_free(totals);
1856 KMP_CPU_FREE(oldMask);
1857 *address2os = retval;
1858 return depth;
1859}
1860
1861
1862# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1863
1864
1865#define osIdIndex 0
1866#define threadIdIndex 1
1867#define coreIdIndex 2
1868#define pkgIdIndex 3
1869#define nodeIdIndex 4
1870
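//
// Each /proc/cpuinfo record is stored as an array of unsigneds indexed by
// the constants above, ordered from the least significant field (the OS proc
// id) to the most significant (node ids), so that comparing fields from
// maxIndex down to osIdIndex sorts the records into physical order.
//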
1871typedef unsigned *ProcCpuInfo;
1872static unsigned maxIndex = pkgIdIndex;
1873
1874
1875static int
1876__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1877{
1878 const unsigned *aa = (const unsigned *)a;
1879 const unsigned *bb = (const unsigned *)b;
1880 if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1881 if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1882 return 0;
1883};
1884
1885
1886static int
1887__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1888{
1889 unsigned i;
1890 const unsigned *aa = *((const unsigned **)a);
1891 const unsigned *bb = *((const unsigned **)b);
1892 for (i = maxIndex; ; i--) {
1893 if (aa[i] < bb[i]) return -1;
1894 if (aa[i] > bb[i]) return 1;
1895 if (i == osIdIndex) break;
1896 }
1897 return 0;
1898}
1899
1900
1901//
1902// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1903// affinity map.
1904//
1905static int
1906__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1907 kmp_i18n_id_t *const msg_id, FILE *f)
1908{
1909 *address2os = NULL;
1910 *msg_id = kmp_i18n_null;
1911
1912 //
1913    // Scan the file and count the number of "processor" (osId) fields,
Alp Toker8f2d3f02014-02-24 10:40:15 +00001914 // and find the highest value of <n> for a node_<n> field.
Jim Cownie5e8470a2013-09-27 10:38:44 +00001915 //
1916 char buf[256];
1917 unsigned num_records = 0;
1918 while (! feof(f)) {
1919 buf[sizeof(buf) - 1] = 1;
1920 if (! fgets(buf, sizeof(buf), f)) {
1921 //
1922 // Read errors presumably because of EOF
1923 //
1924 break;
1925 }
1926
1927 char s1[] = "processor";
1928 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1929 num_records++;
1930 continue;
1931 }
1932
1933 //
1934 // FIXME - this will match "node_<n> <garbage>"
1935 //
1936 unsigned level;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00001937 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00001938 if (nodeIdIndex + level >= maxIndex) {
1939 maxIndex = nodeIdIndex + level;
1940 }
1941 continue;
1942 }
1943 }
1944
1945 //
1946 // Check for empty file / no valid processor records, or too many.
1947 // The number of records can't exceed the number of valid bits in the
1948 // affinity mask.
1949 //
1950 if (num_records == 0) {
1951 *line = 0;
1952 *msg_id = kmp_i18n_str_NoProcRecords;
1953 return -1;
1954 }
1955 if (num_records > (unsigned)__kmp_xproc) {
1956 *line = 0;
1957 *msg_id = kmp_i18n_str_TooManyProcRecords;
1958 return -1;
1959 }
1960
1961 //
1962    // Set the file pointer back to the beginning, so that we can scan the
1963 // file again, this time performing a full parse of the data.
1964    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1965 // Adding an extra element at the end allows us to remove a lot of extra
1966 // checks for termination conditions.
1967 //
1968 if (fseek(f, 0, SEEK_SET) != 0) {
1969 *line = 0;
1970 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1971 return -1;
1972 }
1973
1974 //
1975 // Allocate the array of records to store the proc info in. The dummy
1976 // element at the end makes the logic in filling them out easier to code.
1977 //
1978 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1979 * sizeof(unsigned *));
1980 unsigned i;
1981 for (i = 0; i <= num_records; i++) {
1982 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1983 * sizeof(unsigned));
1984 }
1985
1986#define CLEANUP_THREAD_INFO \
1987 for (i = 0; i <= num_records; i++) { \
1988 __kmp_free(threadInfo[i]); \
1989 } \
1990 __kmp_free(threadInfo);
1991
1992 //
1993 // A value of UINT_MAX means that we didn't find the field
1994 //
1995 unsigned __index;
1996
1997#define INIT_PROC_INFO(p) \
1998 for (__index = 0; __index <= maxIndex; __index++) { \
1999 (p)[__index] = UINT_MAX; \
2000 }
2001
2002 for (i = 0; i <= num_records; i++) {
2003 INIT_PROC_INFO(threadInfo[i]);
2004 }
2005
2006 unsigned num_avail = 0;
2007 *line = 0;
2008 while (! feof(f)) {
2009 //
2010 // Create an inner scoping level, so that all the goto targets at the
2011 // end of the loop appear in an outer scoping level. This avoids
2012 // warnings about jumping past an initialization to a target in the
2013 // same block.
2014 //
2015 {
2016 buf[sizeof(buf) - 1] = 1;
2017 bool long_line = false;
2018 if (! fgets(buf, sizeof(buf), f)) {
2019 //
2020 // Read errors presumably because of EOF
2021 //
2022 // If there is valid data in threadInfo[num_avail], then fake
2023                // a blank line to ensure that the last address gets parsed.
2024 //
2025 bool valid = false;
2026 for (i = 0; i <= maxIndex; i++) {
2027 if (threadInfo[num_avail][i] != UINT_MAX) {
2028 valid = true;
2029 }
2030 }
2031 if (! valid) {
2032 break;
2033 }
2034 buf[0] = 0;
2035 } else if (!buf[sizeof(buf) - 1]) {
2036 //
2037 // The line is longer than the buffer. Set a flag and don't
2038                // emit an error if we were going to ignore the line anyway.
2039 //
2040 long_line = true;
2041
2042#define CHECK_LINE \
2043 if (long_line) { \
2044 CLEANUP_THREAD_INFO; \
2045 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2046 return -1; \
2047 }
2048 }
2049 (*line)++;
2050
2051 char s1[] = "processor";
2052 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2053 CHECK_LINE;
2054 char *p = strchr(buf + sizeof(s1) - 1, ':');
2055 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002056 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002057 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2058 threadInfo[num_avail][osIdIndex] = val;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002059#if KMP_OS_LINUX && USE_SYSFS_INFO
2060 char path[256];
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002061 KMP_SNPRINTF(path, sizeof(path),
Jim Cownie181b4bb2013-12-23 17:28:57 +00002062 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2063 threadInfo[num_avail][osIdIndex]);
2064 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2065
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002066 KMP_SNPRINTF(path, sizeof(path),
Jim Cownie181b4bb2013-12-23 17:28:57 +00002067 "/sys/devices/system/cpu/cpu%u/topology/core_id",
2068 threadInfo[num_avail][osIdIndex]);
2069 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002070 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002071#else
Jim Cownie5e8470a2013-09-27 10:38:44 +00002072 }
2073 char s2[] = "physical id";
2074 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2075 CHECK_LINE;
2076 char *p = strchr(buf + sizeof(s2) - 1, ':');
2077 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002078 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002079 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2080 threadInfo[num_avail][pkgIdIndex] = val;
2081 continue;
2082 }
2083 char s3[] = "core id";
2084 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2085 CHECK_LINE;
2086 char *p = strchr(buf + sizeof(s3) - 1, ':');
2087 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002088 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002089 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2090 threadInfo[num_avail][coreIdIndex] = val;
2091 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002092#endif // KMP_OS_LINUX && USE_SYSFS_INFO
Jim Cownie5e8470a2013-09-27 10:38:44 +00002093 }
2094 char s4[] = "thread id";
2095 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2096 CHECK_LINE;
2097 char *p = strchr(buf + sizeof(s4) - 1, ':');
2098 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002099 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002100 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2101 threadInfo[num_avail][threadIdIndex] = val;
2102 continue;
2103 }
2104 unsigned level;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002105 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002106 CHECK_LINE;
2107 char *p = strchr(buf + sizeof(s4) - 1, ':');
2108 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002109 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002110 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2111 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2112 threadInfo[num_avail][nodeIdIndex + level] = val;
2113 continue;
2114 }
2115
2116 //
2117 // We didn't recognize the leading token on the line.
2118 // There are lots of leading tokens that we don't recognize -
2119 // if the line isn't empty, go on to the next line.
2120 //
2121 if ((*buf != 0) && (*buf != '\n')) {
2122 //
2123 // If the line is longer than the buffer, read characters
2124 // until we find a newline.
2125 //
2126 if (long_line) {
2127 int ch;
2128 while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2129 }
2130 continue;
2131 }
2132
2133 //
2134 // A newline has signalled the end of the processor record.
2135 // Check that there aren't too many procs specified.
2136 //
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002137 if ((int)num_avail == __kmp_xproc) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002138 CLEANUP_THREAD_INFO;
2139 *msg_id = kmp_i18n_str_TooManyEntries;
2140 return -1;
2141 }
2142
2143 //
2144 // Check for missing fields. The osId field must be there, and we
2145 // currently require that the physical id field is specified, also.
2146 //
2147 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2148 CLEANUP_THREAD_INFO;
2149 *msg_id = kmp_i18n_str_MissingProcField;
2150 return -1;
2151 }
2152 if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2153 CLEANUP_THREAD_INFO;
2154 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2155 return -1;
2156 }
2157
2158 //
2159 // Skip this proc if it is not included in the machine model.
2160 //
2161 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2162 INIT_PROC_INFO(threadInfo[num_avail]);
2163 continue;
2164 }
2165
2166 //
2167 // We have a successful parse of this proc's info.
2168 // Increment the counter, and prepare for the next proc.
2169 //
2170 num_avail++;
2171 KMP_ASSERT(num_avail <= num_records);
2172 INIT_PROC_INFO(threadInfo[num_avail]);
2173 }
2174 continue;
2175
2176 no_val:
2177 CLEANUP_THREAD_INFO;
2178 *msg_id = kmp_i18n_str_MissingValCpuinfo;
2179 return -1;
2180
2181 dup_field:
2182 CLEANUP_THREAD_INFO;
2183 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2184 return -1;
2185 }
2186 *line = 0;
2187
2188# if KMP_MIC && REDUCE_TEAM_SIZE
2189 unsigned teamSize = 0;
2190# endif // KMP_MIC && REDUCE_TEAM_SIZE
2191
2192 // check for num_records == __kmp_xproc ???
2193
2194 //
2195 // If there's only one thread context to bind to, form an Address object
2196 // with depth 1 and return immediately (or, if affinity is off, set
2197 // address2os to NULL and return).
2198 //
2199 // If it is configured to omit the package level when there is only a
2200 // single package, the logic at the end of this routine won't work if
2201 // there is only a single thread - it would try to form an Address
2202 // object with depth 0.
2203 //
2204 KMP_ASSERT(num_avail > 0);
2205 KMP_ASSERT(num_avail <= num_records);
2206 if (num_avail == 1) {
2207 __kmp_ncores = 1;
2208 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002209 if (__kmp_affinity_verbose) {
2210 if (! KMP_AFFINITY_CAPABLE()) {
2211 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2212 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2213 KMP_INFORM(Uniform, "KMP_AFFINITY");
2214 }
2215 else {
2216 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2217 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2218 fullMask);
2219 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2220 if (__kmp_affinity_respect_mask) {
2221 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2222 } else {
2223 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2224 }
2225 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2226 KMP_INFORM(Uniform, "KMP_AFFINITY");
2227 }
2228 int index;
2229 kmp_str_buf_t buf;
2230 __kmp_str_buf_init(&buf);
2231 __kmp_str_buf_print(&buf, "1");
2232 for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2233 __kmp_str_buf_print(&buf, " x 1");
2234 }
2235 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2236 __kmp_str_buf_free(&buf);
2237 }
2238
2239 if (__kmp_affinity_type == affinity_none) {
2240 CLEANUP_THREAD_INFO;
2241 return 0;
2242 }
2243
2244 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2245 Address addr(1);
2246 addr.labels[0] = threadInfo[0][pkgIdIndex];
2247 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2248
2249 if (__kmp_affinity_gran_levels < 0) {
2250 __kmp_affinity_gran_levels = 0;
2251 }
2252
2253 if (__kmp_affinity_verbose) {
2254 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2255 }
2256
2257 CLEANUP_THREAD_INFO;
2258 return 1;
2259 }
2260
2261 //
2262 // Sort the threadInfo table by physical Id.
2263 //
2264 qsort(threadInfo, num_avail, sizeof(*threadInfo),
2265 __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2266
2267 //
2268 // The table is now sorted by pkgId / coreId / threadId, but we really
2269 // don't know the radix of any of the fields. pkgId's may be sparsely
2270 // assigned among the chips on a system. Although coreId's are usually
2271 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2272 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2273 //
2274 // For that matter, we don't know what coresPerPkg and threadsPerCore
2275 // (or the total # packages) are at this point - we want to determine
2276 // that now. We only have an upper bound on the first two figures.
2277 //
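    // As in the x2APIC path, counts[] tracks the children of the current
    // parent at each field, maxCt[] the largest such count (the radix),
    // totals[] the number of distinct objects, and lastId[] the previous
    // record's ids, indexed here by field (threadIdIndex .. maxIndex).
    //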
2278 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2279 * sizeof(unsigned));
2280 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2281 * sizeof(unsigned));
2282 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2283 * sizeof(unsigned));
2284 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2285 * sizeof(unsigned));
2286
2287 bool assign_thread_ids = false;
2288 unsigned threadIdCt;
2289 unsigned index;
2290
2291 restart_radix_check:
2292 threadIdCt = 0;
2293
2294 //
2295 // Initialize the counter arrays with data from threadInfo[0].
2296 //
2297 if (assign_thread_ids) {
2298 if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2299 threadInfo[0][threadIdIndex] = threadIdCt++;
2300 }
2301 else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2302 threadIdCt = threadInfo[0][threadIdIndex] + 1;
2303 }
2304 }
2305 for (index = 0; index <= maxIndex; index++) {
2306 counts[index] = 1;
2307 maxCt[index] = 1;
2308 totals[index] = 1;
2309        lastId[index] = threadInfo[0][index];
2310 }
2311
2312 //
2313 // Run through the rest of the OS procs.
2314 //
2315 for (i = 1; i < num_avail; i++) {
2316 //
2317 // Find the most significant index whose id differs
2318 // from the id for the previous OS proc.
2319 //
2320 for (index = maxIndex; index >= threadIdIndex; index--) {
2321 if (assign_thread_ids && (index == threadIdIndex)) {
2322 //
2323 // Auto-assign the thread id field if it wasn't specified.
2324 //
2325 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2326 threadInfo[i][threadIdIndex] = threadIdCt++;
2327 }
2328
2329 //
2330                // Apparently the thread id field was specified for some
2331 // entries and not others. Start the thread id counter
2332 // off at the next higher thread id.
2333 //
2334 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2335 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2336 }
2337 }
2338 if (threadInfo[i][index] != lastId[index]) {
2339 //
2340 // Run through all indices which are less significant,
2341 // and reset the counts to 1.
2342 //
2343 // At all levels up to and including index, we need to
2344 // increment the totals and record the last id.
2345 //
2346 unsigned index2;
2347 for (index2 = threadIdIndex; index2 < index; index2++) {
2348 totals[index2]++;
2349 if (counts[index2] > maxCt[index2]) {
2350 maxCt[index2] = counts[index2];
2351 }
2352 counts[index2] = 1;
2353 lastId[index2] = threadInfo[i][index2];
2354 }
2355 counts[index]++;
2356 totals[index]++;
2357 lastId[index] = threadInfo[i][index];
2358
2359 if (assign_thread_ids && (index > threadIdIndex)) {
2360
2361# if KMP_MIC && REDUCE_TEAM_SIZE
2362 //
2363 // The default team size is the total #threads in the machine
2364 // minus 1 thread for every core that has 3 or more threads.
2365 //
2366 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2367# endif // KMP_MIC && REDUCE_TEAM_SIZE
2368
2369 //
2370 // Restart the thread counter, as we are on a new core.
2371 //
2372 threadIdCt = 0;
2373
2374 //
2375 // Auto-assign the thread id field if it wasn't specified.
2376 //
2377 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2378 threadInfo[i][threadIdIndex] = threadIdCt++;
2379 }
2380
2381 //
2382                    // Apparently the thread id field was specified for some
2383 // entries and not others. Start the thread id counter
2384 // off at the next higher thread id.
2385 //
2386 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2387 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2388 }
2389 }
2390 break;
2391 }
2392 }
2393 if (index < threadIdIndex) {
2394 //
2395 // If thread ids were specified, it is an error if they are not
2396            // unique.  Also, check that we haven't already restarted the
2397 // loop (to be safe - shouldn't need to).
2398 //
2399 if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2400 || assign_thread_ids) {
2401 __kmp_free(lastId);
2402 __kmp_free(totals);
2403 __kmp_free(maxCt);
2404 __kmp_free(counts);
2405 CLEANUP_THREAD_INFO;
2406 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2407 return -1;
2408 }
2409
2410 //
2411            // If the thread ids were not specified and we see entries
2412            // that are duplicates, start the loop over and
2413 // assign the thread ids manually.
2414 //
2415 assign_thread_ids = true;
2416 goto restart_radix_check;
2417 }
2418 }
2419
2420# if KMP_MIC && REDUCE_TEAM_SIZE
2421 //
2422 // The default team size is the total #threads in the machine
2423 // minus 1 thread for every core that has 3 or more threads.
2424 //
2425 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2426# endif // KMP_MIC && REDUCE_TEAM_SIZE
2427
2428 for (index = threadIdIndex; index <= maxIndex; index++) {
2429 if (counts[index] > maxCt[index]) {
2430 maxCt[index] = counts[index];
2431 }
2432 }
2433
2434 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2435 nCoresPerPkg = maxCt[coreIdIndex];
2436 nPackages = totals[pkgIdIndex];
2437
2438 //
2439 // Check to see if the machine topology is uniform
2440 //
2441 unsigned prod = totals[maxIndex];
2442 for (index = threadIdIndex; index < maxIndex; index++) {
2443 prod *= maxCt[index];
2444 }
2445 bool uniform = (prod == totals[threadIdIndex]);
2446
2447 //
2448 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +00002449 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +00002450 // nCoresPerPkg, & nPackages. Make sure all these vars are set
2451 // correctly, and return now if affinity is not enabled.
2452 //
Jim Cownie5e8470a2013-09-27 10:38:44 +00002453 __kmp_ncores = totals[coreIdIndex];
2454
2455 if (__kmp_affinity_verbose) {
2456 if (! KMP_AFFINITY_CAPABLE()) {
2457 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2458 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2459 if (uniform) {
2460 KMP_INFORM(Uniform, "KMP_AFFINITY");
2461 } else {
2462 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2463 }
2464 }
2465 else {
2466 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2467 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2468 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2469 if (__kmp_affinity_respect_mask) {
2470 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2471 } else {
2472 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2473 }
2474 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2475 if (uniform) {
2476 KMP_INFORM(Uniform, "KMP_AFFINITY");
2477 } else {
2478 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2479 }
2480 }
2481 kmp_str_buf_t buf;
2482 __kmp_str_buf_init(&buf);
2483
2484 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2485 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2486 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2487 }
2488 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2489 maxCt[threadIdIndex], __kmp_ncores);
2490
2491 __kmp_str_buf_free(&buf);
2492 }
2493
2494# if KMP_MIC && REDUCE_TEAM_SIZE
2495 //
2496 // Set the default team size.
2497 //
2498 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2499 __kmp_dflt_team_nth = teamSize;
2500 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2501 __kmp_dflt_team_nth));
2502 }
2503# endif // KMP_MIC && REDUCE_TEAM_SIZE
2504
2505 if (__kmp_affinity_type == affinity_none) {
2506 __kmp_free(lastId);
2507 __kmp_free(totals);
2508 __kmp_free(maxCt);
2509 __kmp_free(counts);
2510 CLEANUP_THREAD_INFO;
2511 return 0;
2512 }
2513
2514 //
2515 // Count the number of levels which have more nodes at that level than
2516    // at the parent's level (with an implicit root node above the
2517    // top level).  This is equivalent to saying that there is at least
2518 // one node at this level which has a sibling. These levels are in the
2519 // map, and the package level is always in the map.
2520 //
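    // For example, on a machine with one hardware thread per core,
    // totals[threadIdIndex] == totals[coreIdIndex], so the thread level has
    // no siblings and is left out of the map.
    //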
2521 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2522 int level = 0;
2523 for (index = threadIdIndex; index < maxIndex; index++) {
2524 KMP_ASSERT(totals[index] >= totals[index + 1]);
2525 inMap[index] = (totals[index] > totals[index + 1]);
2526 }
2527 inMap[maxIndex] = (totals[maxIndex] > 1);
2528 inMap[pkgIdIndex] = true;
2529
2530 int depth = 0;
2531 for (index = threadIdIndex; index <= maxIndex; index++) {
2532 if (inMap[index]) {
2533 depth++;
2534 }
2535 }
2536 KMP_ASSERT(depth > 0);
2537
2538 //
2539 // Construct the data structure that is to be returned.
2540 //
2541 *address2os = (AddrUnsPair*)
2542 __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2543 int pkgLevel = -1;
2544 int coreLevel = -1;
2545 int threadLevel = -1;
2546
2547 for (i = 0; i < num_avail; ++i) {
2548 Address addr(depth);
2549 unsigned os = threadInfo[i][osIdIndex];
2550 int src_index;
2551 int dst_index = 0;
2552
2553 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2554 if (! inMap[src_index]) {
2555 continue;
2556 }
2557 addr.labels[dst_index] = threadInfo[i][src_index];
2558 if (src_index == pkgIdIndex) {
2559 pkgLevel = dst_index;
2560 }
2561 else if (src_index == coreIdIndex) {
2562 coreLevel = dst_index;
2563 }
2564 else if (src_index == threadIdIndex) {
2565 threadLevel = dst_index;
2566 }
2567 dst_index++;
2568 }
2569 (*address2os)[i] = AddrUnsPair(addr, os);
2570 }
2571
2572 if (__kmp_affinity_gran_levels < 0) {
2573 //
2574 // Set the granularity level based on what levels are modeled
2575 // in the machine topology map.
2576 //
2577 unsigned src_index;
2578 __kmp_affinity_gran_levels = 0;
2579 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2580 if (! inMap[src_index]) {
2581 continue;
2582 }
2583 switch (src_index) {
2584 case threadIdIndex:
2585 if (__kmp_affinity_gran > affinity_gran_thread) {
2586 __kmp_affinity_gran_levels++;
2587 }
2588
2589 break;
2590 case coreIdIndex:
2591 if (__kmp_affinity_gran > affinity_gran_core) {
2592 __kmp_affinity_gran_levels++;
2593 }
2594 break;
2595
2596 case pkgIdIndex:
2597 if (__kmp_affinity_gran > affinity_gran_package) {
2598 __kmp_affinity_gran_levels++;
2599 }
2600 break;
2601 }
2602 }
2603 }
2604
2605 if (__kmp_affinity_verbose) {
2606 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2607 coreLevel, threadLevel);
2608 }
2609
2610 __kmp_free(inMap);
2611 __kmp_free(lastId);
2612 __kmp_free(totals);
2613 __kmp_free(maxCt);
2614 __kmp_free(counts);
2615 CLEANUP_THREAD_INFO;
2616 return depth;
2617}
2618
2619
2620//
2621// Create and return a table of affinity masks, indexed by OS thread ID.
2622// This routine handles OR'ing together all the affinity masks of threads
2623// that are sufficiently close, if granularity > fine.
2624//
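// For example, with a granularity of "core" on a machine with two hardware
// threads per core, the two OS procs sharing a core compare as "close"
// within __kmp_affinity_gran_levels, so both of their entries in the table
// receive the same two-bit mask covering that core.
//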
2625static kmp_affin_mask_t *
2626__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2627 AddrUnsPair *address2os, unsigned numAddrs)
2628{
2629 //
2630 // First form a table of affinity masks in order of OS thread id.
2631 //
2632 unsigned depth;
2633 unsigned maxOsId;
2634 unsigned i;
2635
2636 KMP_ASSERT(numAddrs > 0);
2637 depth = address2os[0].first.depth;
2638
2639 maxOsId = 0;
2640 for (i = 0; i < numAddrs; i++) {
2641 unsigned osId = address2os[i].second;
2642 if (osId > maxOsId) {
2643 maxOsId = osId;
2644 }
2645 }
2646 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2647 (maxOsId + 1) * __kmp_affin_mask_size);
2648
2649 //
2650 // Sort the address2os table according to physical order. Doing so
2651 // will put all threads on the same core/package/node in consecutive
2652 // locations.
2653 //
2654 qsort(address2os, numAddrs, sizeof(*address2os),
2655 __kmp_affinity_cmp_Address_labels);
2656
2657 KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2658 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2659 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2660 }
2661 if (__kmp_affinity_gran_levels >= (int)depth) {
2662 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2663 && (__kmp_affinity_type != affinity_none))) {
2664 KMP_WARNING(AffThreadsMayMigrate);
2665 }
2666 }
2667
2668 //
2669 // Run through the table, forming the masks for all threads on each
2670 // core. Threads on the same core will have identical "Address"
2671 // objects, not considering the last level, which must be the thread
2672 // id. All threads on a core will appear consecutively.
2673 //
2674 unsigned unique = 0;
2675 unsigned j = 0; // index of 1st thread on core
2676 unsigned leader = 0;
2677 Address *leaderAddr = &(address2os[0].first);
2678 kmp_affin_mask_t *sum
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002679 = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002680 KMP_CPU_ZERO(sum);
2681 KMP_CPU_SET(address2os[0].second, sum);
2682 for (i = 1; i < numAddrs; i++) {
2683 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00002684 // If this thread is sufficiently close to the leader (within the
Jim Cownie5e8470a2013-09-27 10:38:44 +00002685 // granularity setting), then set the bit for this os thread in the
2686 // affinity mask for this group, and go on to the next thread.
2687 //
2688 if (leaderAddr->isClose(address2os[i].first,
2689 __kmp_affinity_gran_levels)) {
2690 KMP_CPU_SET(address2os[i].second, sum);
2691 continue;
2692 }
2693
2694 //
2695 // For every thread in this group, copy the mask to the thread's
2696 // entry in the osId2Mask table. Mark the first address as a
2697 // leader.
2698 //
2699 for (; j < i; j++) {
2700 unsigned osId = address2os[j].second;
2701 KMP_DEBUG_ASSERT(osId <= maxOsId);
2702 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2703 KMP_CPU_COPY(mask, sum);
2704 address2os[j].first.leader = (j == leader);
2705 }
2706 unique++;
2707
2708 //
2709 // Start a new mask.
2710 //
2711 leader = i;
2712 leaderAddr = &(address2os[i].first);
2713 KMP_CPU_ZERO(sum);
2714 KMP_CPU_SET(address2os[i].second, sum);
2715 }
2716
2717 //
2718 // For every thread in last group, copy the mask to the thread's
2719 // entry in the osId2Mask table.
2720 //
2721 for (; j < i; j++) {
2722 unsigned osId = address2os[j].second;
2723 KMP_DEBUG_ASSERT(osId <= maxOsId);
2724 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2725 KMP_CPU_COPY(mask, sum);
2726 address2os[j].first.leader = (j == leader);
2727 }
2728 unique++;
2729
2730 *maxIndex = maxOsId;
2731 *numUnique = unique;
2732 return osId2Mask;
2733}
2734
2735
2736//
2737// Stuff for the affinity proclist parsers. It's easier to declare these vars
2738// as file-static than to try and pass them through the calling sequence of
2739// the recursive-descent OMP_PLACES parser.
2740//
2741static kmp_affin_mask_t *newMasks;
2742static int numNewMasks;
2743static int nextNewMask;
2744
2745#define ADD_MASK(_mask) \
2746 { \
2747 if (nextNewMask >= numNewMasks) { \
2748 numNewMasks *= 2; \
2749 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2750 numNewMasks * __kmp_affin_mask_size); \
2751 } \
2752 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2753 nextNewMask++; \
2754 }
2755
2756#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2757 { \
2758 if (((_osId) > _maxOsId) || \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002759 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
Jim Cownie5e8470a2013-09-27 10:38:44 +00002760 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2761 && (__kmp_affinity_type != affinity_none))) { \
2762 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2763 } \
2764 } \
2765 else { \
2766 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2767 } \
2768 }
2769
2770
2771//
2772// Re-parse the proclist (for the explicit affinity type), and form the list
2773// of affinity newMasks indexed by gtid.
2774//
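// For example, a proclist of "0,3-5,{8,9}" produces five masks: {0}, {3},
// {4}, {5}, and the combined set {8,9} (assuming all of those OS procs are
// present in the machine model).
//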
2775static void
2776__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2777 unsigned int *out_numMasks, const char *proclist,
2778 kmp_affin_mask_t *osId2Mask, int maxOsId)
2779{
2780 const char *scan = proclist;
2781 const char *next = proclist;
2782
2783 //
2784 // We use malloc() for the temporary mask vector,
2785 // so that we can use realloc() to extend it.
2786 //
2787 numNewMasks = 2;
2788 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2789 * __kmp_affin_mask_size);
2790 nextNewMask = 0;
2791 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2792 __kmp_affin_mask_size);
2793 int setSize = 0;
2794
2795 for (;;) {
2796 int start, end, stride;
2797
2798 SKIP_WS(scan);
2799 next = scan;
2800 if (*next == '\0') {
2801 break;
2802 }
2803
2804 if (*next == '{') {
2805 int num;
2806 setSize = 0;
2807 next++; // skip '{'
2808 SKIP_WS(next);
2809 scan = next;
2810
2811 //
2812 // Read the first integer in the set.
2813 //
2814 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2815 "bad proclist");
2816 SKIP_DIGITS(next);
2817 num = __kmp_str_to_int(scan, *next);
2818 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2819
2820 //
2821 // Copy the mask for that osId to the sum (union) mask.
2822 //
2823 if ((num > maxOsId) ||
2824 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2825 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2826 && (__kmp_affinity_type != affinity_none))) {
2827 KMP_WARNING(AffIgnoreInvalidProcID, num);
2828 }
2829 KMP_CPU_ZERO(sumMask);
2830 }
2831 else {
2832 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2833 setSize = 1;
2834 }
2835
2836 for (;;) {
2837 //
2838 // Check for end of set.
2839 //
2840 SKIP_WS(next);
2841 if (*next == '}') {
2842 next++; // skip '}'
2843 break;
2844 }
2845
2846 //
2847 // Skip optional comma.
2848 //
2849 if (*next == ',') {
2850 next++;
2851 }
2852 SKIP_WS(next);
2853
2854 //
2855 // Read the next integer in the set.
2856 //
2857 scan = next;
2858 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2859 "bad explicit proc list");
2860
2861 SKIP_DIGITS(next);
2862 num = __kmp_str_to_int(scan, *next);
2863 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2864
2865 //
2866 // Add the mask for that osId to the sum mask.
2867 //
2868 if ((num > maxOsId) ||
2869 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2870 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2871 && (__kmp_affinity_type != affinity_none))) {
2872 KMP_WARNING(AffIgnoreInvalidProcID, num);
2873 }
2874 }
2875 else {
2876 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2877 setSize++;
2878 }
2879 }
2880 if (setSize > 0) {
2881 ADD_MASK(sumMask);
2882 }
2883
2884 SKIP_WS(next);
2885 if (*next == ',') {
2886 next++;
2887 }
2888 scan = next;
2889 continue;
2890 }
2891
2892 //
2893 // Read the first integer.
2894 //
2895 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2896 SKIP_DIGITS(next);
2897 start = __kmp_str_to_int(scan, *next);
2898 KMP_ASSERT2(start >= 0, "bad explicit proc list");
2899 SKIP_WS(next);
2900
2901 //
2902 // If this isn't a range, then add a mask to the list and go on.
2903 //
2904 if (*next != '-') {
2905 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2906
2907 //
2908 // Skip optional comma.
2909 //
2910 if (*next == ',') {
2911 next++;
2912 }
2913 scan = next;
2914 continue;
2915 }
2916
2917 //
2918 // This is a range. Skip over the '-' and read in the 2nd int.
2919 //
2920 next++; // skip '-'
2921 SKIP_WS(next);
2922 scan = next;
2923 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2924 SKIP_DIGITS(next);
2925 end = __kmp_str_to_int(scan, *next);
2926 KMP_ASSERT2(end >= 0, "bad explicit proc list");
2927
2928 //
2929 // Check for a stride parameter
2930 //
2931 stride = 1;
2932 SKIP_WS(next);
2933 if (*next == ':') {
2934 //
2935            // A stride is specified.  Skip over the ':' and read the 3rd int.
2936 //
2937 int sign = +1;
2938 next++; // skip ':'
2939 SKIP_WS(next);
2940 scan = next;
2941 if (*next == '-') {
2942 sign = -1;
2943 next++;
2944 SKIP_WS(next);
2945 scan = next;
2946 }
2947 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2948 "bad explicit proc list");
2949 SKIP_DIGITS(next);
2950 stride = __kmp_str_to_int(scan, *next);
2951 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2952 stride *= sign;
2953 }
2954
2955 //
2956 // Do some range checks.
2957 //
2958 KMP_ASSERT2(stride != 0, "bad explicit proc list");
2959 if (stride > 0) {
2960 KMP_ASSERT2(start <= end, "bad explicit proc list");
2961 }
2962 else {
2963 KMP_ASSERT2(start >= end, "bad explicit proc list");
2964 }
2965 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2966
2967 //
2968 // Add the mask for each OS proc # to the list.
2969 //
2970 if (stride > 0) {
2971 do {
2972 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2973 start += stride;
2974 } while (start <= end);
2975 }
2976 else {
2977 do {
2978 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2979 start += stride;
2980 } while (start >= end);
2981 }
2982
2983 //
2984 // Skip optional comma.
2985 //
2986 SKIP_WS(next);
2987 if (*next == ',') {
2988 next++;
2989 }
2990 scan = next;
2991 }
2992
2993 *out_numMasks = nextNewMask;
2994 if (nextNewMask == 0) {
2995 *out_masks = NULL;
2996 KMP_INTERNAL_FREE(newMasks);
2997 return;
2998 }
2999 *out_masks
3000 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00003001 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003002 __kmp_free(sumMask);
3003 KMP_INTERNAL_FREE(newMasks);
3004}
3005
3006
3007# if OMP_40_ENABLED
3008
3009/*-----------------------------------------------------------------------------
3010
3011Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3012places.  Again, here is the grammar:
3013
3014place_list := place
3015place_list := place , place_list
3016place := num
3017place := place : num
3018place := place : num : signed
3019place := { subplacelist }
3020place := ! place // (lowest priority)
3021subplace_list := subplace
3022subplace_list := subplace , subplace_list
3023subplace := num
3024subplace := num : num
3025subplace := num : num : signed
3026signed := num
3027signed := + signed
3028signed := - signed
3029
3030-----------------------------------------------------------------------------*/
3031
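//
// For example, "{0,1},{2,3}" describes two places of two procs each, and
// "{0:2}:4:2" expands to the four places {0,1}, {2,3}, {4,5}, {6,7}
// (assuming those OS procs are all present in the machine model).
//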
3032static void
3033__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3034 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3035{
3036 const char *next;
3037
3038 for (;;) {
3039 int start, count, stride, i;
3040
3041 //
3042 // Read in the starting proc id
3043 //
3044 SKIP_WS(*scan);
3045 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3046 "bad explicit places list");
3047 next = *scan;
3048 SKIP_DIGITS(next);
3049 start = __kmp_str_to_int(*scan, *next);
3050 KMP_ASSERT(start >= 0);
3051 *scan = next;
3052
3053 //
3054 // valid follow sets are ',' ':' and '}'
3055 //
3056 SKIP_WS(*scan);
3057 if (**scan == '}' || **scan == ',') {
3058 if ((start > maxOsId) ||
3059 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3060 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3061 && (__kmp_affinity_type != affinity_none))) {
3062 KMP_WARNING(AffIgnoreInvalidProcID, start);
3063 }
3064 }
3065 else {
3066 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3067 (*setSize)++;
3068 }
3069 if (**scan == '}') {
3070 break;
3071 }
3072 (*scan)++; // skip ','
3073 continue;
3074 }
3075 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3076 (*scan)++; // skip ':'
3077
3078 //
3079 // Read count parameter
3080 //
3081 SKIP_WS(*scan);
3082 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3083 "bad explicit places list");
3084 next = *scan;
3085 SKIP_DIGITS(next);
3086 count = __kmp_str_to_int(*scan, *next);
3087 KMP_ASSERT(count >= 0);
3088 *scan = next;
3089
3090 //
3091 // valid follow sets are ',' ':' and '}'
3092 //
3093 SKIP_WS(*scan);
3094 if (**scan == '}' || **scan == ',') {
3095 for (i = 0; i < count; i++) {
3096 if ((start > maxOsId) ||
3097 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3098 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3099 && (__kmp_affinity_type != affinity_none))) {
3100 KMP_WARNING(AffIgnoreInvalidProcID, start);
3101 }
3102 break; // don't proliferate warnings for large count
3103 }
3104 else {
3105 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3106 start++;
3107 (*setSize)++;
3108 }
3109 }
3110 if (**scan == '}') {
3111 break;
3112 }
3113 (*scan)++; // skip ','
3114 continue;
3115 }
3116 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3117 (*scan)++; // skip ':'
3118
3119 //
3120 // Read stride parameter
3121 //
3122 int sign = +1;
3123 for (;;) {
3124 SKIP_WS(*scan);
3125 if (**scan == '+') {
3126 (*scan)++; // skip '+'
3127 continue;
3128 }
3129 if (**scan == '-') {
3130 sign *= -1;
3131 (*scan)++; // skip '-'
3132 continue;
3133 }
3134 break;
3135 }
3136 SKIP_WS(*scan);
3137 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3138 "bad explicit places list");
3139 next = *scan;
3140 SKIP_DIGITS(next);
3141 stride = __kmp_str_to_int(*scan, *next);
3142 KMP_ASSERT(stride >= 0);
3143 *scan = next;
3144 stride *= sign;
3145
3146 //
3147 // valid follow sets are ',' and '}'
3148 //
3149 SKIP_WS(*scan);
3150 if (**scan == '}' || **scan == ',') {
3151 for (i = 0; i < count; i++) {
3152 if ((start > maxOsId) ||
3153 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3154 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3155 && (__kmp_affinity_type != affinity_none))) {
3156 KMP_WARNING(AffIgnoreInvalidProcID, start);
3157 }
3158 break; // don't proliferate warnings for large count
3159 }
3160 else {
3161 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3162 start += stride;
3163 (*setSize)++;
3164 }
3165 }
3166 if (**scan == '}') {
3167 break;
3168 }
3169 (*scan)++; // skip ','
3170 continue;
3171 }
3172
3173 KMP_ASSERT2(0, "bad explicit places list");
3174 }
3175}
3176
3177
3178static void
3179__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3180 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3181{
3182 const char *next;
3183
3184 //
3185 // valid follow sets are '{' '!' and num
3186 //
3187 SKIP_WS(*scan);
3188 if (**scan == '{') {
3189 (*scan)++; // skip '{'
3190 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3191 setSize);
3192 KMP_ASSERT2(**scan == '}', "bad explicit places list");
3193 (*scan)++; // skip '}'
3194 }
3195 else if (**scan == '!') {
3196 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3197 KMP_CPU_COMPLEMENT(tempMask);
3198 (*scan)++; // skip '!'
3199 }
3200 else if ((**scan >= '0') && (**scan <= '9')) {
3201 next = *scan;
3202 SKIP_DIGITS(next);
3203 int num = __kmp_str_to_int(*scan, *next);
3204 KMP_ASSERT(num >= 0);
3205 if ((num > maxOsId) ||
3206 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3207 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3208 && (__kmp_affinity_type != affinity_none))) {
3209 KMP_WARNING(AffIgnoreInvalidProcID, num);
3210 }
3211 }
3212 else {
3213 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3214 (*setSize)++;
3215 }
3216 *scan = next; // skip num
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003217 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003218 else {
3219 KMP_ASSERT2(0, "bad explicit places list");
3220 }
3221}
3222
3223
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003224//static void
3225void
Jim Cownie5e8470a2013-09-27 10:38:44 +00003226__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3227 unsigned int *out_numMasks, const char *placelist,
3228 kmp_affin_mask_t *osId2Mask, int maxOsId)
3229{
3230 const char *scan = placelist;
3231 const char *next = placelist;
3232
3233 numNewMasks = 2;
3234 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3235 * __kmp_affin_mask_size);
3236 nextNewMask = 0;
3237
3238 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3239 __kmp_affin_mask_size);
3240 KMP_CPU_ZERO(tempMask);
3241 int setSize = 0;
3242
3243 for (;;) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003244 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3245
3246 //
3247 // valid follow sets are ',' ':' and EOL
3248 //
3249 SKIP_WS(scan);
3250 if (*scan == '\0' || *scan == ',') {
3251 if (setSize > 0) {
3252 ADD_MASK(tempMask);
3253 }
3254 KMP_CPU_ZERO(tempMask);
3255 setSize = 0;
3256 if (*scan == '\0') {
3257 break;
3258 }
3259 scan++; // skip ','
3260 continue;
3261 }
3262
3263 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3264 scan++; // skip ':'
3265
3266 //
3267 // Read count parameter
3268 //
3269 SKIP_WS(scan);
3270 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3271 "bad explicit places list");
3272 next = scan;
3273 SKIP_DIGITS(next);
Jim Cownie181b4bb2013-12-23 17:28:57 +00003274 int count = __kmp_str_to_int(scan, *next);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003275 KMP_ASSERT(count >= 0);
3276 scan = next;
3277
3278 //
3279 // valid follow sets are ',' ':' and EOL
3280 //
3281 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003282 int stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003283 if (*scan == '\0' || *scan == ',') {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003284 stride = +1;
3285 }
3286 else {
3287 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3288 scan++; // skip ':'
Jim Cownie5e8470a2013-09-27 10:38:44 +00003289
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003290 //
3291 // Read stride parameter
3292 //
3293 int sign = +1;
3294 for (;;) {
3295 SKIP_WS(scan);
3296 if (*scan == '+') {
3297 scan++; // skip '+'
3298 continue;
3299 }
3300 if (*scan == '-') {
3301 sign *= -1;
3302 scan++; // skip '-'
3303 continue;
3304 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003305 break;
3306 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003307 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003308 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3309 "bad explicit places list");
3310 next = scan;
3311 SKIP_DIGITS(next);
3312 stride = __kmp_str_to_int(scan, *next);
3313 KMP_DEBUG_ASSERT(stride >= 0);
3314 scan = next;
3315 stride *= sign;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003316 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003317
3318 if (stride > 0) {
3319 int i;
3320 for (i = 0; i < count; i++) {
3321 int j;
3322 if (setSize == 0) {
3323 break;
3324 }
3325 ADD_MASK(tempMask);
3326 setSize = 0;
3327 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003328 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3329 KMP_CPU_CLR(j, tempMask);
3330 }
3331 else if ((j > maxOsId) ||
3332 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
Andrey Churbanov16a14322015-03-10 09:34:38 +00003333 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3334 && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003335 KMP_WARNING(AffIgnoreInvalidProcID, j);
3336 }
3337 KMP_CPU_CLR(j, tempMask);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003338 }
3339 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003340 KMP_CPU_SET(j, tempMask);
3341 setSize++;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003342 }
3343 }
3344 for (; j >= 0; j--) {
3345 KMP_CPU_CLR(j, tempMask);
3346 }
3347 }
3348 }
3349 else {
3350 int i;
3351 for (i = 0; i < count; i++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003352 int j;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003353 if (setSize == 0) {
3354 break;
3355 }
3356 ADD_MASK(tempMask);
3357 setSize = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003358 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003359 j++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003360 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3361 KMP_CPU_CLR(j, tempMask);
3362 }
3363 else if ((j > maxOsId) ||
3364 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
Andrey Churbanov16a14322015-03-10 09:34:38 +00003365 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3366 && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003367 KMP_WARNING(AffIgnoreInvalidProcID, j);
3368 }
3369 KMP_CPU_CLR(j, tempMask);
3370 }
3371 else {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003372 KMP_CPU_SET(j, tempMask);
3373 setSize++;
3374 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003375 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003376 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003377 KMP_CPU_CLR(j, tempMask);
3378 }
3379 }
3380 }
3381 KMP_CPU_ZERO(tempMask);
3382 setSize = 0;
3383
3384 //
3385 // valid follow sets are ',' and EOL
3386 //
3387 SKIP_WS(scan);
3388 if (*scan == '\0') {
3389 break;
3390 }
3391 if (*scan == ',') {
3392 scan++; // skip ','
3393 continue;
3394 }
3395
3396 KMP_ASSERT2(0, "bad explicit places list");
3397 }
3398
3399 *out_numMasks = nextNewMask;
3400 if (nextNewMask == 0) {
3401 *out_masks = NULL;
3402 KMP_INTERNAL_FREE(newMasks);
3403 return;
3404 }
3405 *out_masks
3406 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00003407 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003408 __kmp_free(tempMask);
3409 KMP_INTERNAL_FREE(newMasks);
3410}
3411
3412# endif /* OMP_40_ENABLED */
3413
3414#undef ADD_MASK
3415#undef ADD_MASK_OSID
3416
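//
// Trim the topology map so that at most __kmp_place_num_cores cores per
// package (starting at __kmp_place_core_offset) and
// __kmp_place_num_threads_per_core threads per core remain (presumably the
// limits requested via the KMP_PLACE_THREADS setting), and adjust
// nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_avail_proc and __kmp_ncores to
// match the reduced machine.
//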
Jim Cownie5e8470a2013-09-27 10:38:44 +00003417static void
3418__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3419{
3420 if ( __kmp_place_num_cores == 0 ) {
3421 if ( __kmp_place_num_threads_per_core == 0 ) {
3422 return; // no cores limiting actions requested, exit
3423 }
3424 __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3425 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003426 if ( !__kmp_affinity_uniform_topology() ) {
3427 KMP_WARNING( AffThrPlaceNonUniform );
3428 return; // don't support non-uniform topology
3429 }
3430 if ( depth != 3 ) {
3431 KMP_WARNING( AffThrPlaceNonThreeLevel );
3432        return; // don't support non-3-level topology
Jim Cownie5e8470a2013-09-27 10:38:44 +00003433 }
3434 if ( __kmp_place_num_threads_per_core == 0 ) {
3435 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3436 }
Andrey Churbanov12875572015-03-10 09:00:36 +00003437 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003438 KMP_WARNING( AffThrPlaceManyCores );
3439 return;
3440 }
3441
3442 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3443 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3444 int i, j, k, n_old = 0, n_new = 0;
3445 for ( i = 0; i < nPackages; ++i ) {
3446 for ( j = 0; j < nCoresPerPkg; ++j ) {
Andrey Churbanov12875572015-03-10 09:00:36 +00003447 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003448 n_old += __kmp_nThreadsPerCore; // skip not-requested core
3449 } else {
3450 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
Andrey Churbanov12875572015-03-10 09:00:36 +00003451 if ( k < __kmp_place_num_threads_per_core ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003452 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location
3453 n_new++;
3454 }
3455 n_old++;
3456 }
3457 }
3458 }
3459 }
3460 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3461 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3462 __kmp_avail_proc = n_new; // correct avail_proc
3463 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3464
3465 __kmp_free( *pAddr );
3466 *pAddr = newAddr; // replace old topology with new one
3467}
3468
Jim Cownie5e8470a2013-09-27 10:38:44 +00003469
3470static AddrUnsPair *address2os = NULL;
3471static int * procarr = NULL;
3472static int __kmp_aff_depth = 0;
3473
3474static void
3475__kmp_aux_affinity_initialize(void)
3476{
3477 if (__kmp_affinity_masks != NULL) {
3478 KMP_ASSERT(fullMask != NULL);
3479 return;
3480 }
3481
3482 //
3483 // Create the "full" mask - this defines all of the processors that we
3484 // consider to be in the machine model. If respect is set, then it is
3485 // the initialization thread's affinity mask. Otherwise, it is all
3486 // processors that we know about on the machine.
3487 //
3488 if (fullMask == NULL) {
3489 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3490 }
3491 if (KMP_AFFINITY_CAPABLE()) {
3492 if (__kmp_affinity_respect_mask) {
3493 __kmp_get_system_affinity(fullMask, TRUE);
3494
3495 //
3496 // Count the number of available processors.
3497 //
3498 unsigned i;
3499 __kmp_avail_proc = 0;
3500 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3501 if (! KMP_CPU_ISSET(i, fullMask)) {
3502 continue;
3503 }
3504 __kmp_avail_proc++;
3505 }
3506 if (__kmp_avail_proc > __kmp_xproc) {
3507 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3508 && (__kmp_affinity_type != affinity_none))) {
3509 KMP_WARNING(ErrorInitializeAffinity);
3510 }
3511 __kmp_affinity_type = affinity_none;
Andrey Churbanov1f037e42015-03-10 09:15:26 +00003512 KMP_AFFINITY_DISABLE();
Jim Cownie5e8470a2013-09-27 10:38:44 +00003513 return;
3514 }
3515 }
3516 else {
3517 __kmp_affinity_entire_machine_mask(fullMask);
3518 __kmp_avail_proc = __kmp_xproc;
3519 }
3520 }
3521
3522 int depth = -1;
3523 kmp_i18n_id_t msg_id = kmp_i18n_null;
3524
3525 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00003526 // For backward compatibility, setting KMP_CPUINFO_FILE =>
Jim Cownie5e8470a2013-09-27 10:38:44 +00003527 // KMP_TOPOLOGY_METHOD=cpuinfo
3528 //
3529 if ((__kmp_cpuinfo_file != NULL) &&
3530 (__kmp_affinity_top_method == affinity_top_method_all)) {
3531 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3532 }
3533
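// In the default ("all") method the discovery mechanisms below are tried in
// order until one succeeds: x2APIC ids, then legacy APIC ids (x86/x86_64
// only), then parsing /proc/cpuinfo (Linux), then Windows processor groups,
// and finally the flat OS-proc map as a last resort.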
3534 if (__kmp_affinity_top_method == affinity_top_method_all) {
3535 //
3536 // In the default code path, errors are not fatal - we just try using
3537 // another method. We only emit a warning message if affinity is on,
3538 // or the verbose flag is set, and the nowarnings flag was not set.
3539 //
3540 const char *file_name = NULL;
3541 int line = 0;
3542
3543# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3544
3545 if (__kmp_affinity_verbose) {
3546 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3547 }
3548
3549 file_name = NULL;
3550 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3551 if (depth == 0) {
3552 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3553 KMP_ASSERT(address2os == NULL);
3554 return;
3555 }
3556
3557 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003558 if (__kmp_affinity_verbose) {
3559 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003560 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3561 KMP_I18N_STR(DecodingLegacyAPIC));
3562 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003563 else {
3564 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3565 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003566 }
3567
3568 file_name = NULL;
3569 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3570 if (depth == 0) {
3571 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3572 KMP_ASSERT(address2os == NULL);
3573 return;
3574 }
3575 }
3576
3577# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3578
3579# if KMP_OS_LINUX
3580
3581 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003582 if (__kmp_affinity_verbose) {
3583 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003584 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3585 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003586 else {
3587 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3588 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003589 }
3590
3591 FILE *f = fopen("/proc/cpuinfo", "r");
3592 if (f == NULL) {
3593 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3594 }
3595 else {
3596 file_name = "/proc/cpuinfo";
3597 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3598 fclose(f);
3599 if (depth == 0) {
3600 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3601 KMP_ASSERT(address2os == NULL);
3602 return;
3603 }
3604 }
3605 }
3606
3607# endif /* KMP_OS_LINUX */
3608
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003609# if KMP_GROUP_AFFINITY
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003610
3611 if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3612 if (__kmp_affinity_verbose) {
3613 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3614 }
3615
3616 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3617 KMP_ASSERT(depth != 0);
3618 }
3619
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003620# endif /* KMP_GROUP_AFFINITY */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003621
Jim Cownie5e8470a2013-09-27 10:38:44 +00003622 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003623 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003624 if (file_name == NULL) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003625 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003626 }
3627 else if (line == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003628 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003629 }
3630 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003631 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003632 }
3633 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003634 // FIXME - print msg if msg_id = kmp_i18n_null ???
Jim Cownie5e8470a2013-09-27 10:38:44 +00003635
3636 file_name = "";
3637 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3638 if (depth == 0) {
3639 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3640 KMP_ASSERT(address2os == NULL);
3641 return;
3642 }
3643 KMP_ASSERT(depth > 0);
3644 KMP_ASSERT(address2os != NULL);
3645 }
3646 }
3647
3648 //
3649 // If the user has specified that a particular topology discovery method
3650 // is to be used, then we abort if that method fails. The exception is
3651 // group affinity, which might have been implicitly set.
3652 //
3653
3654# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3655
3656 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3657 if (__kmp_affinity_verbose) {
3658 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3659 KMP_I18N_STR(Decodingx2APIC));
3660 }
3661
3662 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3663 if (depth == 0) {
3664 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3665 KMP_ASSERT(address2os == NULL);
3666 return;
3667 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003668 if (depth < 0) {
3669 KMP_ASSERT(msg_id != kmp_i18n_null);
3670 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3671 }
3672 }
3673 else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3674 if (__kmp_affinity_verbose) {
3675 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3676 KMP_I18N_STR(DecodingLegacyAPIC));
3677 }
3678
3679 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3680 if (depth == 0) {
3681 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3682 KMP_ASSERT(address2os == NULL);
3683 return;
3684 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003685 if (depth < 0) {
3686 KMP_ASSERT(msg_id != kmp_i18n_null);
3687 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3688 }
3689 }
3690
3691# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3692
3693 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3694 const char *filename;
3695 if (__kmp_cpuinfo_file != NULL) {
3696 filename = __kmp_cpuinfo_file;
3697 }
3698 else {
3699 filename = "/proc/cpuinfo";
3700 }
3701
3702 if (__kmp_affinity_verbose) {
3703 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3704 }
3705
3706 FILE *f = fopen(filename, "r");
3707 if (f == NULL) {
3708 int code = errno;
3709 if (__kmp_cpuinfo_file != NULL) {
3710 __kmp_msg(
3711 kmp_ms_fatal,
3712 KMP_MSG(CantOpenFileForReading, filename),
3713 KMP_ERR(code),
3714 KMP_HNT(NameComesFrom_CPUINFO_FILE),
3715 __kmp_msg_null
3716 );
3717 }
3718 else {
3719 __kmp_msg(
3720 kmp_ms_fatal,
3721 KMP_MSG(CantOpenFileForReading, filename),
3722 KMP_ERR(code),
3723 __kmp_msg_null
3724 );
3725 }
3726 }
3727 int line = 0;
3728 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3729 fclose(f);
3730 if (depth < 0) {
3731 KMP_ASSERT(msg_id != kmp_i18n_null);
3732 if (line > 0) {
3733 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3734 }
3735 else {
3736 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3737 }
3738 }
3739 if (__kmp_affinity_type == affinity_none) {
3740 KMP_ASSERT(depth == 0);
3741 KMP_ASSERT(address2os == NULL);
3742 return;
3743 }
3744 }
3745
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003746# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00003747
3748 else if (__kmp_affinity_top_method == affinity_top_method_group) {
3749 if (__kmp_affinity_verbose) {
3750 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3751 }
3752
3753 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3754 KMP_ASSERT(depth != 0);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003755 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003756 KMP_ASSERT(msg_id != kmp_i18n_null);
3757 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003758 }
3759 }
3760
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003761# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +00003762
3763 else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3764 if (__kmp_affinity_verbose) {
3765 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3766 }
3767
3768 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3769 if (depth == 0) {
3770 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3771 KMP_ASSERT(address2os == NULL);
3772 return;
3773 }
3774 // should not fail
3775 KMP_ASSERT(depth > 0);
3776 KMP_ASSERT(address2os != NULL);
3777 }
3778
3779 if (address2os == NULL) {
3780 if (KMP_AFFINITY_CAPABLE()
3781 && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3782 && (__kmp_affinity_type != affinity_none)))) {
3783 KMP_WARNING(ErrorInitializeAffinity);
3784 }
3785 __kmp_affinity_type = affinity_none;
Andrey Churbanov1f037e42015-03-10 09:15:26 +00003786 KMP_AFFINITY_DISABLE();
Jim Cownie5e8470a2013-09-27 10:38:44 +00003787 return;
3788 }
3789
Jim Cownie5e8470a2013-09-27 10:38:44 +00003790 __kmp_apply_thread_places(&address2os, depth);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003791
3792 //
3793 // Create the table of masks, indexed by thread Id.
3794 //
3795 unsigned maxIndex;
3796 unsigned numUnique;
3797 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3798 address2os, __kmp_avail_proc);
3799 if (__kmp_affinity_gran_levels == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003800 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003801 }
3802
3803 //
3804 // Set the childNums vector in all Address objects. This must be done
3805 // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3806 // which takes into account the setting of __kmp_affinity_compact.
3807 //
3808 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3809
3810 switch (__kmp_affinity_type) {
3811
3812 case affinity_explicit:
3813 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3814# if OMP_40_ENABLED
3815 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3816# endif
3817 {
3818 __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3819 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3820 maxIndex);
3821 }
3822# if OMP_40_ENABLED
3823 else {
3824 __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3825 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3826 maxIndex);
3827 }
3828# endif
3829 if (__kmp_affinity_num_masks == 0) {
3830 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3831 && (__kmp_affinity_type != affinity_none))) {
3832 KMP_WARNING(AffNoValidProcID);
3833 }
3834 __kmp_affinity_type = affinity_none;
3835 return;
3836 }
3837 break;
3838
3839 //
3840 // The other affinity types rely on sorting the Addresses according
3841 // to some permutation of the machine topology tree. Set
3842 // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3843 // then jump to a common code fragment to do the sort and create
3844 // the array of affinity masks.
3845 //
3846
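// The cases below only differ in how __kmp_affinity_compact and
// __kmp_affinity_offset are chosen before the common sort at sortAddresses:
// "compact" keeps topological neighbors in consecutive masks, while "scatter"
// inverts the level significance (depth - 1 - compact) so that consecutive
// masks are spread across packages first.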
3847 case affinity_logical:
3848 __kmp_affinity_compact = 0;
3849 if (__kmp_affinity_offset) {
3850 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3851 % __kmp_avail_proc;
3852 }
3853 goto sortAddresses;
3854
3855 case affinity_physical:
3856 if (__kmp_nThreadsPerCore > 1) {
3857 __kmp_affinity_compact = 1;
3858 if (__kmp_affinity_compact >= depth) {
3859 __kmp_affinity_compact = 0;
3860 }
3861 } else {
3862 __kmp_affinity_compact = 0;
3863 }
3864 if (__kmp_affinity_offset) {
3865 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3866 % __kmp_avail_proc;
3867 }
3868 goto sortAddresses;
3869
3870 case affinity_scatter:
3871 if (__kmp_affinity_compact >= depth) {
3872 __kmp_affinity_compact = 0;
3873 }
3874 else {
3875 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3876 }
3877 goto sortAddresses;
3878
3879 case affinity_compact:
3880 if (__kmp_affinity_compact >= depth) {
3881 __kmp_affinity_compact = depth - 1;
3882 }
3883 goto sortAddresses;
3884
Jim Cownie5e8470a2013-09-27 10:38:44 +00003885 case affinity_balanced:
Jonathan Peytoncaf09fe2015-05-27 23:27:33 +00003886 // Balanced works only for the case of a single package
Jim Cownie5e8470a2013-09-27 10:38:44 +00003887 if( nPackages > 1 ) {
3888 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3889 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3890 }
3891 __kmp_affinity_type = affinity_none;
3892 return;
3893 } else if( __kmp_affinity_uniform_topology() ) {
3894 break;
3895 } else { // Non-uniform topology
3896
3897 // Save the depth for further usage
3898 __kmp_aff_depth = depth;
3899
3900 // Number of hyper threads per core in HT machine
3901 int nth_per_core = __kmp_nThreadsPerCore;
3902
3903 int core_level;
3904 if( nth_per_core > 1 ) {
3905 core_level = depth - 2;
3906 } else {
3907 core_level = depth - 1;
3908 }
3909 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3910 int nproc = nth_per_core * ncores;
3911
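// procarr[] is a dense core-major table: slot core * nth_per_core + thread
// holds the OS proc id bound to that hardware context, or -1 if the context
// does not exist (possible on a non-uniform machine).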
3912 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3913 for( int i = 0; i < nproc; i++ ) {
3914 procarr[ i ] = -1;
3915 }
3916
3917 for( int i = 0; i < __kmp_avail_proc; i++ ) {
3918 int proc = address2os[ i ].second;
3919 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3920 // If there is only one thread per core then depth == 2: level 0 - package,
3921 // level 1 - core.
3922 int level = depth - 1;
3923
3924 // Defaults for the single-context case (nth_per_core == 1)
3925 int thread = 0;
3926 int core = address2os[ i ].first.labels[ level ];
3927 // If the thread level exists, that is we have more than one thread context per core
3928 if( nth_per_core > 1 ) {
3929 thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3930 core = address2os[ i ].first.labels[ level - 1 ];
3931 }
3932 procarr[ core * nth_per_core + thread ] = proc;
3933 }
3934
3935 break;
3936 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003937
3938 sortAddresses:
3939 //
3940 // Allocate the gtid->affinity mask table.
3941 //
3942 if (__kmp_affinity_dups) {
3943 __kmp_affinity_num_masks = __kmp_avail_proc;
3944 }
3945 else {
3946 __kmp_affinity_num_masks = numUnique;
3947 }
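// With duplicates allowed there is one mask per available OS proc (several
// gtids may receive identical masks); otherwise only the numUnique distinct
// masks computed by __kmp_create_masks are used.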
3948
3949# if OMP_40_ENABLED
3950 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3951 && ( __kmp_affinity_num_places > 0 )
3952 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3953 __kmp_affinity_num_masks = __kmp_affinity_num_places;
3954 }
3955# endif
3956
3957 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3958 __kmp_affinity_num_masks * __kmp_affin_mask_size);
3959
3960 //
3961 // Sort the address2os table according to the current setting of
3962 // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3963 //
3964 qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3965 __kmp_affinity_cmp_Address_child_num);
3966 {
3967 int i;
3968 unsigned j;
3969 for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3970 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3971 continue;
3972 }
3973 unsigned osId = address2os[i].second;
3974 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3975 kmp_affin_mask_t *dest
3976 = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3977 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3978 KMP_CPU_COPY(dest, src);
3979 if (++j >= __kmp_affinity_num_masks) {
3980 break;
3981 }
3982 }
3983 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3984 }
3985 break;
3986
3987 default:
3988 KMP_ASSERT2(0, "Unexpected affinity setting");
3989 }
3990
3991 __kmp_free(osId2Mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003992 machine_hierarchy.init(address2os, __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003993}
3994
3995
3996void
3997__kmp_affinity_initialize(void)
3998{
3999 //
4000 // Much of the code above was written assuming that if a machine was not
4001 // affinity capable, then __kmp_affinity_type == affinity_none. We now
4002 // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4003 //
4004 // There are too many checks for __kmp_affinity_type == affinity_none
4005 // in this code. Instead of trying to change them all, check if
4006 // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4007 // affinity_none, call the real initialization routine, then restore
4008 // __kmp_affinity_type to affinity_disabled.
4009 //
4010 int disabled = (__kmp_affinity_type == affinity_disabled);
4011 if (! KMP_AFFINITY_CAPABLE()) {
4012 KMP_ASSERT(disabled);
4013 }
4014 if (disabled) {
4015 __kmp_affinity_type = affinity_none;
4016 }
4017 __kmp_aux_affinity_initialize();
4018 if (disabled) {
4019 __kmp_affinity_type = affinity_disabled;
4020 }
4021}
4022
4023
4024void
4025__kmp_affinity_uninitialize(void)
4026{
4027 if (__kmp_affinity_masks != NULL) {
4028 __kmp_free(__kmp_affinity_masks);
4029 __kmp_affinity_masks = NULL;
4030 }
4031 if (fullMask != NULL) {
4032 KMP_CPU_FREE(fullMask);
4033 fullMask = NULL;
4034 }
4035 __kmp_affinity_num_masks = 0;
4036# if OMP_40_ENABLED
4037 __kmp_affinity_num_places = 0;
4038# endif
4039 if (__kmp_affinity_proclist != NULL) {
4040 __kmp_free(__kmp_affinity_proclist);
4041 __kmp_affinity_proclist = NULL;
4042 }
4043 if( address2os != NULL ) {
4044 __kmp_free( address2os );
4045 address2os = NULL;
4046 }
4047 if( procarr != NULL ) {
4048 __kmp_free( procarr );
4049 procarr = NULL;
4050 }
4051}
4052
4053
4054void
4055__kmp_affinity_set_init_mask(int gtid, int isa_root)
4056{
4057 if (! KMP_AFFINITY_CAPABLE()) {
4058 return;
4059 }
4060
4061 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4062 if (th->th.th_affin_mask == NULL) {
4063 KMP_CPU_ALLOC(th->th.th_affin_mask);
4064 }
4065 else {
4066 KMP_CPU_ZERO(th->th.th_affin_mask);
4067 }
4068
4069 //
4070 // Copy the thread mask to the kmp_info_t structure.
4071 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4072 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
4073 // is set, then the full mask is the same as the mask of the initialization
4074 // thread.
4075 //
4076 kmp_affin_mask_t *mask;
4077 int i;
4078
4079# if OMP_40_ENABLED
4080 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4081# endif
4082 {
Andrey Churbanovf28f6132015-01-13 14:54:00 +00004083 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004084 ) {
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004085# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004086 if (__kmp_num_proc_groups > 1) {
4087 return;
4088 }
4089# endif
4090 KMP_ASSERT(fullMask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004091 i = KMP_PLACE_ALL;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004092 mask = fullMask;
4093 }
4094 else {
4095 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
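// Masks are handed out round-robin by gtid, rotated by the user-requested
// __kmp_affinity_offset.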
4096 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4097 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4098 }
4099 }
4100# if OMP_40_ENABLED
4101 else {
4102 if ((! isa_root)
4103 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004104# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004105 if (__kmp_num_proc_groups > 1) {
4106 return;
4107 }
4108# endif
4109 KMP_ASSERT(fullMask != NULL);
4110 i = KMP_PLACE_ALL;
4111 mask = fullMask;
4112 }
4113 else {
4114 //
4115 // int i = some hash function or just a counter that doesn't
4116 // always start at 0. Use gtid for now.
4117 //
4118 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4119 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4120 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4121 }
4122 }
4123# endif
4124
4125# if OMP_40_ENABLED
4126 th->th.th_current_place = i;
4127 if (isa_root) {
4128 th->th.th_new_place = i;
4129 th->th.th_first_place = 0;
4130 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4131 }
4132
4133 if (i == KMP_PLACE_ALL) {
4134 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4135 gtid));
4136 }
4137 else {
4138 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4139 gtid, i));
4140 }
4141# else
4142 if (i == -1) {
4143 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4144 gtid));
4145 }
4146 else {
4147 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4148 gtid, i));
4149 }
4150# endif /* OMP_40_ENABLED */
4151
4152 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4153
4154 if (__kmp_affinity_verbose) {
4155 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4156 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4157 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004158 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4159 buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004160 }
4161
4162# if KMP_OS_WINDOWS
4163 //
4164 // On Windows* OS, the process affinity mask might have changed.
4165 // If the user didn't request affinity and this call fails,
4166 // just continue silently. See CQ171393.
4167 //
4168 if ( __kmp_affinity_type == affinity_none ) {
4169 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4170 }
4171 else
4172# endif
4173 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4174}
4175
4176
4177# if OMP_40_ENABLED
4178
4179void
4180__kmp_affinity_set_place(int gtid)
4181{
4182 int retval;
4183
4184 if (! KMP_AFFINITY_CAPABLE()) {
4185 return;
4186 }
4187
4188 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4189
4190 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4191 gtid, th->th.th_new_place, th->th.th_current_place));
4192
4193 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00004194 // Check that the new place is within this thread's partition.
Jim Cownie5e8470a2013-09-27 10:38:44 +00004195 //
4196 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004197 KMP_ASSERT(th->th.th_new_place >= 0);
4198 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004199 if (th->th.th_first_place <= th->th.th_last_place) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004200 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004201 && (th->th.th_new_place <= th->th.th_last_place));
4202 }
4203 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004204 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004205 || (th->th.th_new_place >= th->th.th_last_place));
4206 }
4207
4208 //
4209 // Copy the thread mask to the kmp_info_t structure,
4210 // and set this thread's affinity.
4211 //
4212 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4213 th->th.th_new_place);
4214 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4215 th->th.th_current_place = th->th.th_new_place;
4216
4217 if (__kmp_affinity_verbose) {
4218 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4219 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4220 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004221 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4222 gtid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004223 }
4224 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4225}
4226
4227# endif /* OMP_40_ENABLED */
4228
4229
4230int
4231__kmp_aux_set_affinity(void **mask)
4232{
4233 int gtid;
4234 kmp_info_t *th;
4235 int retval;
4236
4237 if (! KMP_AFFINITY_CAPABLE()) {
4238 return -1;
4239 }
4240
4241 gtid = __kmp_entry_gtid();
4242 KA_TRACE(1000, ;{
4243 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4244 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4245 (kmp_affin_mask_t *)(*mask));
4246 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4247 gtid, buf);
4248 });
4249
4250 if (__kmp_env_consistency_check) {
4251 if ((mask == NULL) || (*mask == NULL)) {
4252 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4253 }
4254 else {
4255 unsigned proc;
4256 int num_procs = 0;
4257
4258 for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4259 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4260 continue;
4261 }
4262 num_procs++;
4263 if (! KMP_CPU_ISSET(proc, fullMask)) {
4264 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4265 break;
4266 }
4267 }
4268 if (num_procs == 0) {
4269 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4270 }
4271
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004272# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004273 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4274 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4275 }
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004276# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +00004277
4278 }
4279 }
4280
4281 th = __kmp_threads[gtid];
4282 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4283 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4284 if (retval == 0) {
4285 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4286 }
4287
4288# if OMP_40_ENABLED
4289 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4290 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4291 th->th.th_first_place = 0;
4292 th->th.th_last_place = __kmp_affinity_num_masks - 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004293
4294 //
4295 // Turn off 4.0 affinity for the current tread at this parallel level.
4296 // Turn off 4.0 affinity for the current thread at this parallel level.
4297 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004298# endif
4299
4300 return retval;
4301}
4302
4303
4304int
4305__kmp_aux_get_affinity(void **mask)
4306{
4307 int gtid;
4308 int retval;
4309 kmp_info_t *th;
4310
4311 if (! KMP_AFFINITY_CAPABLE()) {
4312 return -1;
4313 }
4314
4315 gtid = __kmp_entry_gtid();
4316 th = __kmp_threads[gtid];
4317 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4318
4319 KA_TRACE(1000, ;{
4320 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4321 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4322 th->th.th_affin_mask);
4323 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4324 });
4325
4326 if (__kmp_env_consistency_check) {
4327 if ((mask == NULL) || (*mask == NULL)) {
4328 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4329 }
4330 }
4331
4332# if !KMP_OS_WINDOWS
4333
4334 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4335 KA_TRACE(1000, ;{
4336 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4337 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4338 (kmp_affin_mask_t *)(*mask));
4339 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4340 });
4341 return retval;
4342
4343# else
4344
4345 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4346 return 0;
4347
4348# endif /* KMP_OS_WINDOWS */
4349
4350}
4351
Jim Cownie5e8470a2013-09-27 10:38:44 +00004352int
4353__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4354{
4355 int retval;
4356
4357 if (! KMP_AFFINITY_CAPABLE()) {
4358 return -1;
4359 }
4360
4361 KA_TRACE(1000, ;{
4362 int gtid = __kmp_entry_gtid();
4363 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4364 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4365 (kmp_affin_mask_t *)(*mask));
4366 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4367 proc, gtid, buf);
4368 });
4369
4370 if (__kmp_env_consistency_check) {
4371 if ((mask == NULL) || (*mask == NULL)) {
4372 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4373 }
4374 }
4375
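// Out-of-range procs yield -1, and procs outside the machine's full mask
// yield -2; otherwise the bit is simply set in the user-supplied mask.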
4376 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4377 return -1;
4378 }
4379 if (! KMP_CPU_ISSET(proc, fullMask)) {
4380 return -2;
4381 }
4382
4383 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4384 return 0;
4385}
4386
4387
4388int
4389__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4390{
4391 int retval;
4392
4393 if (! KMP_AFFINITY_CAPABLE()) {
4394 return -1;
4395 }
4396
4397 KA_TRACE(1000, ;{
4398 int gtid = __kmp_entry_gtid();
4399 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4400 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4401 (kmp_affin_mask_t *)(*mask));
4402 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4403 proc, gtid, buf);
4404 });
4405
4406 if (__kmp_env_consistency_check) {
4407 if ((mask == NULL) || (*mask == NULL)) {
4408 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4409 }
4410 }
4411
4412 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4413 return -1;
4414 }
4415 if (! KMP_CPU_ISSET(proc, fullMask)) {
4416 return -2;
4417 }
4418
4419 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4420 return 0;
4421}
4422
4423
4424int
4425__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4426{
4427 int retval;
4428
4429 if (! KMP_AFFINITY_CAPABLE()) {
4430 return -1;
4431 }
4432
4433 KA_TRACE(1000, ;{
4434 int gtid = __kmp_entry_gtid();
4435 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4436 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4437 (kmp_affin_mask_t *)(*mask));
4438 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4439 proc, gtid, buf);
4440 });
4441
4442 if (__kmp_env_consistency_check) {
4443 if ((mask == NULL) || (*mask == NULL)) {
Andrey Churbanov4b2f17a2015-01-29 15:49:22 +00004444 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
Jim Cownie5e8470a2013-09-27 10:38:44 +00004445 }
4446 }
4447
4448 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4449 return 0;
4450 }
4451 if (! KMP_CPU_ISSET(proc, fullMask)) {
4452 return 0;
4453 }
4454
4455 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4456}
4457
Jim Cownie5e8470a2013-09-27 10:38:44 +00004458
4459// Dynamic affinity settings - Affinity balanced
4460void __kmp_balanced_affinity( int tid, int nthreads )
4461{
4462 if( __kmp_affinity_uniform_topology() ) {
4463 int coreID;
4464 int threadID;
4465 // Number of hyper threads per core in HT machine
4466 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4467 // Number of cores
4468 int ncores = __kmp_ncores;
4469 // How many threads will be bound to each core
4470 int chunk = nthreads / ncores;
4471 // How many cores will have an additional thread bound to them - "big" cores
4472 int big_cores = nthreads % ncores;
4473 // Number of threads on the big cores
4474 int big_nth = ( chunk + 1 ) * big_cores;
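// Illustrative example (numbers not from the source): nthreads=10 on ncores=4
// gives chunk=2, big_cores=2, big_nth=6, so threads 0-5 land on the two "big"
// cores (3 threads each) and threads 6-9 on the remaining two cores (2 each).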
4475 if( tid < big_nth ) {
4476 coreID = tid / (chunk + 1 );
4477 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4478 } else { //tid >= big_nth
4479 coreID = ( tid - big_cores ) / chunk;
4480 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4481 }
4482
4483 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4484 "Illegal set affinity operation when not capable");
4485
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00004486 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004487 KMP_CPU_ZERO(mask);
4488
4489 // Granularity == thread
4490 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4491 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4492 KMP_CPU_SET( osID, mask);
4493 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4494 for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4495 int osID;
4496 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4497 KMP_CPU_SET( osID, mask);
4498 }
4499 }
4500 if (__kmp_affinity_verbose) {
4501 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4502 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004503 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4504 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004505 }
4506 __kmp_set_system_affinity( mask, TRUE );
4507 } else { // Non-uniform topology
4508
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00004509 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004510 KMP_CPU_ZERO(mask);
4511
4512 // Number of hyper threads per core in HT machine
4513 int nth_per_core = __kmp_nThreadsPerCore;
4514 int core_level;
4515 if( nth_per_core > 1 ) {
4516 core_level = __kmp_aff_depth - 2;
4517 } else {
4518 core_level = __kmp_aff_depth - 1;
4519 }
4520
4521 // Number of cores - maximum value; it does not count trailing cores with 0 processors
4522 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4523
4524 // For performance gain consider the special case nthreads == __kmp_avail_proc
4525 if( nthreads == __kmp_avail_proc ) {
4526 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4527 int osID = address2os[ tid ].second;
4528 KMP_CPU_SET( osID, mask);
4529 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4530 int coreID = address2os[ tid ].first.labels[ core_level ];
4531 // Count the osIDs found for the current core; there can be no more than nth_per_core of them;
4532 // since address2os is sorted we can break once cnt == nth_per_core
4533 int cnt = 0;
4534 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4535 int osID = address2os[ i ].second;
4536 int core = address2os[ i ].first.labels[ core_level ];
4537 if( core == coreID ) {
4538 KMP_CPU_SET( osID, mask);
4539 cnt++;
4540 if( cnt == nth_per_core ) {
4541 break;
4542 }
4543 }
4544 }
4545 }
4546 } else if( nthreads <= __kmp_ncores ) {
4547
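// At most one thread per occupied core: the tid-th core that has at least one
// OS proc in procarr[] gets this thread, bound either to the whole core or
// (for granularity=thread/fine) to the first available context on it.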
4548 int core = 0;
4549 for( int i = 0; i < ncores; i++ ) {
4550 // Check if this core from procarr[] is in the mask
4551 int in_mask = 0;
4552 for( int j = 0; j < nth_per_core; j++ ) {
4553 if( procarr[ i * nth_per_core + j ] != - 1 ) {
4554 in_mask = 1;
4555 break;
4556 }
4557 }
4558 if( in_mask ) {
4559 if( tid == core ) {
4560 for( int j = 0; j < nth_per_core; j++ ) {
4561 int osID = procarr[ i * nth_per_core + j ];
4562 if( osID != -1 ) {
4563 KMP_CPU_SET( osID, mask );
4564 // For granularity=thread it is enough to set the first available osID for this core
4565 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4566 break;
4567 }
4568 }
4569 }
4570 break;
4571 } else {
4572 core++;
4573 }
4574 }
4575 }
4576
4577 } else { // nthreads > __kmp_ncores
4578
4579 // Array to save the number of processors at each core
Jonathan Peyton7be075332015-06-22 15:53:50 +00004580 int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004581 // Array to save the number of cores with "x" available processors;
Jonathan Peyton7be075332015-06-22 15:53:50 +00004582 int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
Jim Cownie5e8470a2013-09-27 10:38:44 +00004583 // Array to save the number of cores with # procs from x to nth_per_core
Jonathan Peyton7be075332015-06-22 15:53:50 +00004584 int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
Jim Cownie5e8470a2013-09-27 10:38:44 +00004585
4586 for( int i = 0; i <= nth_per_core; i++ ) {
4587 ncores_with_x_procs[ i ] = 0;
4588 ncores_with_x_to_max_procs[ i ] = 0;
4589 }
4590
4591 for( int i = 0; i < ncores; i++ ) {
4592 int cnt = 0;
4593 for( int j = 0; j < nth_per_core; j++ ) {
4594 if( procarr[ i * nth_per_core + j ] != -1 ) {
4595 cnt++;
4596 }
4597 }
4598 nproc_at_core[ i ] = cnt;
4599 ncores_with_x_procs[ cnt ]++;
4600 }
4601
4602 for( int i = 0; i <= nth_per_core; i++ ) {
4603 for( int j = i; j <= nth_per_core; j++ ) {
4604 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4605 }
4606 }
4607
4608 // Max number of processors
4609 int nproc = nth_per_core * ncores;
4610 // An array to keep the number of threads assigned to each context
4611 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4612 for( int i = 0; i < nproc; i++ ) {
4613 newarr[ i ] = 0;
4614 }
4615
4616 int nth = nthreads;
4617 int flag = 0;
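// Spread the threads over the contexts recorded in procarr[]: on the first
// sweep (flag == 0) every available context receives at most one thread, so
// all contexts are used before any is doubled up; subsequent sweeps
// (flag == 1) keep adding one more thread per context until nth reaches 0.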
4618 while( nth > 0 ) {
4619 for( int j = 1; j <= nth_per_core; j++ ) {
4620 int cnt = ncores_with_x_to_max_procs[ j ];
4621 for( int i = 0; i < ncores; i++ ) {
4622 // Skip the core with 0 processors
4623 if( nproc_at_core[ i ] == 0 ) {
4624 continue;
4625 }
4626 for( int k = 0; k < nth_per_core; k++ ) {
4627 if( procarr[ i * nth_per_core + k ] != -1 ) {
4628 if( newarr[ i * nth_per_core + k ] == 0 ) {
4629 newarr[ i * nth_per_core + k ] = 1;
4630 cnt--;
4631 nth--;
4632 break;
4633 } else {
4634 if( flag != 0 ) {
4635 newarr[ i * nth_per_core + k ] ++;
4636 cnt--;
4637 nth--;
4638 break;
4639 }
4640 }
4641 }
4642 }
4643 if( cnt == 0 || nth == 0 ) {
4644 break;
4645 }
4646 }
4647 if( nth == 0 ) {
4648 break;
4649 }
4650 }
4651 flag = 1;
4652 }
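// Now pick this thread's slot: walk newarr[] accumulating the per-context
// thread counts until the running sum exceeds tid; that context (or its whole
// core when granularity=core) is the one this thread binds to.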
4653 int sum = 0;
4654 for( int i = 0; i < nproc; i++ ) {
4655 sum += newarr[ i ];
4656 if( sum > tid ) {
4657 // Granularity == thread
4658 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4659 int osID = procarr[ i ];
4660 KMP_CPU_SET( osID, mask);
4661 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4662 int coreID = i / nth_per_core;
4663 for( int ii = 0; ii < nth_per_core; ii++ ) {
4664 int osID = procarr[ coreID * nth_per_core + ii ];
4665 if( osID != -1 ) {
4666 KMP_CPU_SET( osID, mask);
4667 }
4668 }
4669 }
4670 break;
4671 }
4672 }
4673 __kmp_free( newarr );
4674 }
4675
4676 if (__kmp_affinity_verbose) {
4677 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4678 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004679 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4680 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004681 }
4682 __kmp_set_system_affinity( mask, TRUE );
4683 }
4684}
4685
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004686#else
4687 // affinity not supported
4688
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +00004689static const kmp_uint32 noaff_maxLevels=7;
4690kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
4691kmp_uint32 noaff_depth;
4692kmp_uint8 noaff_leaf_kids;
4693kmp_int8 noaff_uninitialized=1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004694
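// With affinity unsupported, the hierarchical barrier still needs a machine
// hierarchy, so noaff_init() fabricates one: a leaf branching factor of 4,
// nprocs/4 (rounded up) nodes above it, then level widths are repeatedly
// halved (and pushed up a level) until no level exceeds the branching factor.
// __kmp_get_hierarchy() copies the result into the caller's barrier state.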
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +00004695void noaff_init(int nprocs)
4696{
4697 kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
4698 if (result == 0) return; // Already initialized
4699 else if (result == 2) { // Someone else is initializing
4700 while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
4701 return;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004702 }
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +00004703 KMP_DEBUG_ASSERT(result==1);
4704
4705 kmp_uint32 numPerLevel[noaff_maxLevels];
4706 noaff_depth = 1;
4707 for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4708 numPerLevel[i] = 1;
4709 noaff_skipPerLevel[i] = 1;
4710 }
4711
4712 numPerLevel[0] = 4;
4713 numPerLevel[1] = nprocs/4;
4714 if (nprocs%4) numPerLevel[1]++;
4715
4716 for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
4717 if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
4718 noaff_depth++;
4719
4720 kmp_uint32 branch = 4;
4721 if (numPerLevel[0] == 1) branch = nprocs/4;
4722 if (branch<4) branch=4;
4723 for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
4724 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4725 if (numPerLevel[d] & 1) numPerLevel[d]++;
4726 numPerLevel[d] = numPerLevel[d] >> 1;
4727 if (numPerLevel[d+1] == 1) noaff_depth++;
4728 numPerLevel[d+1] = numPerLevel[d+1] << 1;
4729 }
4730 if(numPerLevel[0] == 1) {
4731 branch = branch >> 1;
4732 if (branch<4) branch = 4;
4733 }
4734 }
4735
4736 for (kmp_uint32 i=1; i<noaff_depth; ++i)
4737 noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
4738 // Fill in hierarchy in the case of oversubscription
4739 for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
4740 noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
4741 noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4742 noaff_uninitialized = 0; // One writer
4743
4744}
4745
4746void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4747 if (noaff_uninitialized)
4748 noaff_init(nproc);
4749
4750 thr_bar->depth = noaff_depth;
4751 thr_bar->base_leaf_kids = noaff_leaf_kids;
4752 thr_bar->skip_per_level = noaff_skipPerLevel;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004753}
4754
Alp Toker763b9392014-02-28 09:42:41 +00004755#endif // KMP_AFFINITY_SUPPORTED