/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, buf_len, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow. A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, buf_len, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, buf_len, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}


//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings. I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */


static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
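
// Illustration: with __kmp_affinity_compact == 1 and a depth-3 hierarchy
// (package, core, thread), the comparator above builds its sort key from the
// innermost level first, so addresses are ordered by (thread, package, core)
// child numbers rather than (package, core, thread).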

/** A structure for holding machine-specific hierarchy info to be computed once at init.
    This structure represents a mapping of threads to the actual machine hierarchy, or to
    our best guess at what the hierarchy might be, for the purpose of performing an
    efficient barrier. In the worst case, when there is no machine hierarchy information,
    it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
    /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
        or socket, packages/node, nodes/machine, etc. We don't want to get specific with
        nomenclature. When the machine is oversubscribed we add levels to duplicate the
        hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
    kmp_uint32 maxLevels;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;
    volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
    volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 *numPerLevel;
    kmp_uint32 *skipPerLevel;

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}

    // TO FIX: This destructor causes a segfault in the library at shutdown.
    //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }

    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result==1);

        /* Added explicit initialization of the data fields here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        resizing = 0;
        maxLevels = 7;
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if (numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

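        // skipPerLevel[i] is the cumulative product numPerLevel[0]*...*numPerLevel[i-1],
        // i.e. the number of leaves spanned by a single node at level i. For example,
        // with numPerLevel = {2, 4, 4, 1, ...}, skipPerLevel becomes {1, 2, 8, 32, ...}.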
        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i=depth; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        uninitialized = 0; // One writer

    }

    void resize(kmp_uint32 nproc)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        if (bool_result == 0) { // Someone else is resizing
            while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result!=0);
        KMP_DEBUG_ASSERT(nproc > base_num_threads);

        // Calculate new maxLevels
        kmp_uint32 old_sz = skipPerLevel[depth-1];
        kmp_uint32 incs = 0, old_maxLevels = maxLevels;
        while (nproc > old_sz) {
            old_sz *= 2;
            incs++;
        }
        maxLevels += incs;

        // Resize arrays
        kmp_uint32 *old_numPerLevel = numPerLevel;
        kmp_uint32 *old_skipPerLevel = skipPerLevel;
        numPerLevel = skipPerLevel = NULL;
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);

        // Copy old elements from old arrays
        for (kmp_uint32 i=0; i<old_maxLevels; ++i) {
            numPerLevel[i] = old_numPerLevel[i];
            skipPerLevel[i] = old_skipPerLevel[i];
        }

        // Init new elements in arrays to 1
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) {
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Free old arrays
        __kmp_free(old_numPerLevel);

        // Fill in oversubscription levels of hierarchy
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        base_num_threads = nproc;
        resizing = 0; // One writer

    }
};

static hierarchy_info machine_hierarchy;

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none". Need to init
    // on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);
    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // The loop below adjusts the depth in the case of a resize
    while (nproc > machine_hierarchy.skipPerLevel[depth-1])
        depth++;

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread. They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while ((1<<r) < count)
        ++r;
    return r;
}


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off. We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id. It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact. In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be 2
    //    (for a single core chip). On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id. The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
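    // Illustrative example (hypothetical values, not from any particular machine):
    // with maxThreadsPerPkg == 8 and maxCoresPerPkg == 4, the mask widths computed
    // below come out as widthCT == 3, widthC == 2 and widthT == 1, so an Apic Id of
    // 0b1101 decodes to pkgId == 1, coreId == 2 and threadId == 1.
    //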
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (! (buf.edx >> 9) & 1) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields. pkgId's may be sparsely
    // assigned among the chips on a system. Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now. We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1; // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consist checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks. Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology. While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
    // try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest. The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
        __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

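        // Illustration (hypothetical values): cpuid leaf 11 reports, for each level,
        // the width in bits ("shift") of the x2APIC Id sub-field up to and including
        // that level. If level 0 (SMT) reports shift 1 and level 1 (core) reports
        // shift 4, then for x2APIC Id 0b10110 the thread label is bit [0] = 0, the
        // core label is bits [3:1] = 0b011, and the package label is the remaining
        // bits 0b1 - which is what the masking and shifting below compute.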
1549 for (level = 0; level < depth; level++) {
1550 __kmp_x86_cpuid(11, level, &buf);
1551 unsigned apicId = buf.edx;
1552 if (buf.ebx == 0) {
1553 if (level != depth - 1) {
1554 KMP_CPU_FREE(oldMask);
1555 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1556 return -1;
1557 }
1558 addr.labels[depth - level - 1] = apicId >> prev_shift;
1559 level++;
1560 break;
1561 }
1562 int shift = buf.eax & 0x1f;
1563 int mask = (1 << shift) - 1;
1564 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1565 prev_shift = shift;
1566 }
1567 if (level != depth) {
1568 KMP_CPU_FREE(oldMask);
1569 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1570 return -1;
1571 }
1572
1573 retval[nApics] = AddrUnsPair(addr, proc);
1574 nApics++;
1575 }
1576
1577 //
1578 // We've collected all the info we need.
1579 // Restore the old affinity mask for this thread.
1580 //
1581 __kmp_set_system_affinity(oldMask, TRUE);
1582
1583 //
1584 // If there's only one thread context to bind to, return now.
1585 //
1586 KMP_ASSERT(nApics > 0);
1587 if (nApics == 1) {
1588 __kmp_ncores = nPackages = 1;
1589 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001590 if (__kmp_affinity_verbose) {
1591 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1592 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1593
1594 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1595 if (__kmp_affinity_respect_mask) {
1596 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1597 } else {
1598 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1599 }
1600 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1601 KMP_INFORM(Uniform, "KMP_AFFINITY");
1602 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1603 __kmp_nThreadsPerCore, __kmp_ncores);
1604 }
1605
1606 if (__kmp_affinity_type == affinity_none) {
1607 __kmp_free(retval);
1608 KMP_CPU_FREE(oldMask);
1609 return 0;
1610 }
1611
1612 //
1613 // Form an Address object which only includes the package level.
1614 //
1615 Address addr(1);
1616 addr.labels[0] = retval[0].first.labels[pkgLevel];
1617 retval[0].first = addr;
1618
1619 if (__kmp_affinity_gran_levels < 0) {
1620 __kmp_affinity_gran_levels = 0;
1621 }
1622
1623 if (__kmp_affinity_verbose) {
1624 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1625 }
1626
1627 *address2os = retval;
1628 KMP_CPU_FREE(oldMask);
1629 return 1;
1630 }
1631
1632 //
1633 // Sort the table by physical Id.
1634 //
1635 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1636
1637 //
1638 // Find the radix at each of the levels.
1639 //
1640 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1641 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1642 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1643 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1644 for (level = 0; level < depth; level++) {
1645 totals[level] = 1;
1646 maxCt[level] = 1;
1647 counts[level] = 1;
1648 last[level] = retval[0].first.labels[level];
1649 }
1650
1651 //
1652 // From here on, the iteration variable "level" runs from the finest
1653 // level to the coarsest, i.e. we iterate forward through
1654 // (*address2os)[].first.labels[] - in the previous loops, we iterated
1655 // backwards.
1656 //
1657 for (proc = 1; (int)proc < nApics; proc++) {
1658 int level;
1659 for (level = 0; level < depth; level++) {
1660 if (retval[proc].first.labels[level] != last[level]) {
1661 int j;
1662 for (j = level + 1; j < depth; j++) {
1663 totals[j]++;
1664 counts[j] = 1;
1665                    // If enabled, the commented-out line below would cause incorrect
1666                    // topology information to be printed whenever the maximum count for
1667                    // a level (maxCt[level]) appears earlier in the array than a smaller count.
1668                    // For example, if pkg0 has 4 cores and pkg1 has 2 cores, maxCt[1]
1669                    // would end up as 2 when it should be 4.
1670                    // TODO: check whether it is safe to leave the line commented out.
1671 //maxCt[j] = 1;
1672 last[j] = retval[proc].first.labels[j];
1673 }
1674 totals[level]++;
1675 counts[level]++;
1676 if (counts[level] > maxCt[level]) {
1677 maxCt[level] = counts[level];
1678 }
1679 last[level] = retval[proc].first.labels[level];
1680 break;
1681 }
1682 else if (level == depth - 1) {
1683 __kmp_free(last);
1684 __kmp_free(maxCt);
1685 __kmp_free(counts);
1686 __kmp_free(totals);
1687 __kmp_free(retval);
1688 KMP_CPU_FREE(oldMask);
1689 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1690 return -1;
1691 }
1692 }
1693 }
1694
1695 //
1696 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +00001697 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +00001698 // nCoresPerPkg, & nPackages. Make sure all these vars are set
1699 // correctly, and return if affinity is not enabled.
1700 //
1701 if (threadLevel >= 0) {
1702 __kmp_nThreadsPerCore = maxCt[threadLevel];
1703 }
1704 else {
1705 __kmp_nThreadsPerCore = 1;
1706 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00001707 nPackages = totals[pkgLevel];
1708
1709 if (coreLevel >= 0) {
1710 __kmp_ncores = totals[coreLevel];
1711 nCoresPerPkg = maxCt[coreLevel];
1712 }
1713 else {
1714 __kmp_ncores = nPackages;
1715 nCoresPerPkg = 1;
1716 }
1717
1718 //
1719 // Check to see if the machine topology is uniform
1720 //
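    //
    // For example (hypothetical machine): if one package has 4 cores and the
    // other has 2, with 1 thread per core, then prod = 2 x 4 x 1 = 8 but only
    // 6 leaf entries exist, so the topology is reported as non-uniform.
    //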
1721 unsigned prod = maxCt[0];
1722 for (level = 1; level < depth; level++) {
1723 prod *= maxCt[level];
1724 }
1725 bool uniform = (prod == totals[level - 1]);
1726
1727 //
1728 // Print the machine topology summary.
1729 //
1730 if (__kmp_affinity_verbose) {
1731 char mask[KMP_AFFIN_MASK_PRINT_LEN];
1732 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1733
1734 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1735 if (__kmp_affinity_respect_mask) {
1736 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1737 } else {
1738 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1739 }
1740 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1741 if (uniform) {
1742 KMP_INFORM(Uniform, "KMP_AFFINITY");
1743 } else {
1744 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1745 }
1746
1747 kmp_str_buf_t buf;
1748 __kmp_str_buf_init(&buf);
1749
1750 __kmp_str_buf_print(&buf, "%d", totals[0]);
1751 for (level = 1; level <= pkgLevel; level++) {
1752 __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1753 }
1754 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1755 __kmp_nThreadsPerCore, __kmp_ncores);
1756
1757 __kmp_str_buf_free(&buf);
1758 }
1759
1760 if (__kmp_affinity_type == affinity_none) {
1761 __kmp_free(last);
1762 __kmp_free(maxCt);
1763 __kmp_free(counts);
1764 __kmp_free(totals);
1765 __kmp_free(retval);
1766 KMP_CPU_FREE(oldMask);
1767 return 0;
1768 }
1769
1770 //
1771    // Find any levels with radix 1, and remove them from the map
1772 // (except for the package level).
1773 //
1774 int new_depth = 0;
1775 for (level = 0; level < depth; level++) {
1776 if ((maxCt[level] == 1) && (level != pkgLevel)) {
1777 continue;
1778 }
1779 new_depth++;
1780 }
1781
1782 //
1783 // If we are removing any levels, allocate a new vector to return,
1784 // and copy the relevant information to it.
1785 //
1786 if (new_depth != depth) {
1787 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1788 sizeof(AddrUnsPair) * nApics);
1789 for (proc = 0; (int)proc < nApics; proc++) {
1790 Address addr(new_depth);
1791 new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1792 }
1793 int new_level = 0;
1794 for (level = 0; level < depth; level++) {
1795 if ((maxCt[level] == 1) && (level != pkgLevel)) {
1796 if (level == threadLevel) {
1797 threadLevel = -1;
1798 }
1799 else if ((threadLevel >= 0) && (level < threadLevel)) {
1800 threadLevel--;
1801 }
1802 if (level == coreLevel) {
1803 coreLevel = -1;
1804 }
1805 else if ((coreLevel >= 0) && (level < coreLevel)) {
1806 coreLevel--;
1807 }
1808 if (level < pkgLevel) {
1809 pkgLevel--;
1810 }
1811 continue;
1812 }
1813 for (proc = 0; (int)proc < nApics; proc++) {
1814 new_retval[proc].first.labels[new_level]
1815 = retval[proc].first.labels[level];
1816 }
1817 new_level++;
1818 }
1819
1820 __kmp_free(retval);
1821 retval = new_retval;
1822 depth = new_depth;
1823 }
1824
1825 if (__kmp_affinity_gran_levels < 0) {
1826 //
1827 // Set the granularity level based on what levels are modeled
1828 // in the machine topology map.
1829 //
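        //
        // For example, if the requested granularity is coarser than the HW
        // thread level but not coarser than the core level, one level is
        // ignored, so all HW threads on a core later share one affinity mask.
        //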
1830 __kmp_affinity_gran_levels = 0;
1831 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1832 __kmp_affinity_gran_levels++;
1833 }
1834 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1835 __kmp_affinity_gran_levels++;
1836 }
1837 if (__kmp_affinity_gran > affinity_gran_package) {
1838 __kmp_affinity_gran_levels++;
1839 }
1840 }
1841
1842 if (__kmp_affinity_verbose) {
1843 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1844 coreLevel, threadLevel);
1845 }
1846
1847 __kmp_free(last);
1848 __kmp_free(maxCt);
1849 __kmp_free(counts);
1850 __kmp_free(totals);
1851 KMP_CPU_FREE(oldMask);
1852 *address2os = retval;
1853 return depth;
1854}
1855
1856
1857# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1858
1859
1860#define osIdIndex 0
1861#define threadIdIndex 1
1862#define coreIdIndex 2
1863#define pkgIdIndex 3
1864#define nodeIdIndex 4
1865
1866typedef unsigned *ProcCpuInfo;
1867static unsigned maxIndex = pkgIdIndex;
1868
1869
1870static int
1871__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1872{
1873 const unsigned *aa = (const unsigned *)a;
1874 const unsigned *bb = (const unsigned *)b;
1875 if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1876 if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1877 return 0;
1878}
1879
1880
1881static int
1882__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1883{
1884 unsigned i;
1885 const unsigned *aa = *((const unsigned **)a);
1886 const unsigned *bb = *((const unsigned **)b);
1887 for (i = maxIndex; ; i--) {
1888 if (aa[i] < bb[i]) return -1;
1889 if (aa[i] > bb[i]) return 1;
1890 if (i == osIdIndex) break;
1891 }
1892 return 0;
1893}
1894
1895
1896//
1897// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1898// affinity map.
1899//
1900static int
1901__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1902 kmp_i18n_id_t *const msg_id, FILE *f)
1903{
1904 *address2os = NULL;
1905 *msg_id = kmp_i18n_null;
1906
1907 //
1908    // Scan the file, and count the number of "processor" (osId) fields,
Alp Toker8f2d3f02014-02-24 10:40:15 +00001909 // and find the highest value of <n> for a node_<n> field.
Jim Cownie5e8470a2013-09-27 10:38:44 +00001910 //
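    //
    // For illustration, a record in the file is expected to look roughly like
    // (only the fields parsed below matter; anything else is ignored):
    //
    //     processor       : 3
    //     physical id     : 0
    //     core id         : 3
    //
    // with optional "thread id" and "node_<n> id" fields.
    //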
1911 char buf[256];
1912 unsigned num_records = 0;
1913 while (! feof(f)) {
1914 buf[sizeof(buf) - 1] = 1;
1915 if (! fgets(buf, sizeof(buf), f)) {
1916 //
1917            // Read errors are presumably due to EOF
1918 //
1919 break;
1920 }
1921
1922 char s1[] = "processor";
1923 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1924 num_records++;
1925 continue;
1926 }
1927
1928 //
1929 // FIXME - this will match "node_<n> <garbage>"
1930 //
1931 unsigned level;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00001932 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00001933 if (nodeIdIndex + level >= maxIndex) {
1934 maxIndex = nodeIdIndex + level;
1935 }
1936 continue;
1937 }
1938 }
1939
1940 //
1941 // Check for empty file / no valid processor records, or too many.
1942 // The number of records can't exceed the number of valid bits in the
1943 // affinity mask.
1944 //
1945 if (num_records == 0) {
1946 *line = 0;
1947 *msg_id = kmp_i18n_str_NoProcRecords;
1948 return -1;
1949 }
1950 if (num_records > (unsigned)__kmp_xproc) {
1951 *line = 0;
1952 *msg_id = kmp_i18n_str_TooManyProcRecords;
1953 return -1;
1954 }
1955
1956 //
1957    // Set the file pointer back to the beginning, so that we can scan the
1958    // file again, this time performing a full parse of the data.
1959    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1960 // Adding an extra element at the end allows us to remove a lot of extra
1961 // checks for termination conditions.
1962 //
1963 if (fseek(f, 0, SEEK_SET) != 0) {
1964 *line = 0;
1965 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1966 return -1;
1967 }
1968
1969 //
1970 // Allocate the array of records to store the proc info in. The dummy
1971 // element at the end makes the logic in filling them out easier to code.
1972 //
1973 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1974 * sizeof(unsigned *));
1975 unsigned i;
1976 for (i = 0; i <= num_records; i++) {
1977 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1978 * sizeof(unsigned));
1979 }
1980
1981#define CLEANUP_THREAD_INFO \
1982 for (i = 0; i <= num_records; i++) { \
1983 __kmp_free(threadInfo[i]); \
1984 } \
1985 __kmp_free(threadInfo);
1986
1987 //
1988 // A value of UINT_MAX means that we didn't find the field
1989 //
1990 unsigned __index;
1991
1992#define INIT_PROC_INFO(p) \
1993 for (__index = 0; __index <= maxIndex; __index++) { \
1994 (p)[__index] = UINT_MAX; \
1995 }
1996
1997 for (i = 0; i <= num_records; i++) {
1998 INIT_PROC_INFO(threadInfo[i]);
1999 }
2000
2001 unsigned num_avail = 0;
2002 *line = 0;
2003 while (! feof(f)) {
2004 //
2005 // Create an inner scoping level, so that all the goto targets at the
2006 // end of the loop appear in an outer scoping level. This avoids
2007 // warnings about jumping past an initialization to a target in the
2008 // same block.
2009 //
2010 {
2011 buf[sizeof(buf) - 1] = 1;
2012 bool long_line = false;
2013 if (! fgets(buf, sizeof(buf), f)) {
2014 //
2015 // Read errors presumably because of EOF
2016                // Read errors are presumably due to EOF
2017 // If there is valid data in threadInfo[num_avail], then fake
2018                // a blank line to ensure that the last address gets parsed.
2019 //
2020 bool valid = false;
2021 for (i = 0; i <= maxIndex; i++) {
2022 if (threadInfo[num_avail][i] != UINT_MAX) {
2023 valid = true;
2024 }
2025 }
2026 if (! valid) {
2027 break;
2028 }
2029 buf[0] = 0;
2030 } else if (!buf[sizeof(buf) - 1]) {
2031 //
2032                // The line is longer than the buffer.  Set a flag; the error is
2033                // reported later only if the line was one we needed to parse anyway.
2034 //
2035 long_line = true;
2036
2037#define CHECK_LINE \
2038 if (long_line) { \
2039 CLEANUP_THREAD_INFO; \
2040 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2041 return -1; \
2042 }
2043 }
2044 (*line)++;
2045
2046 char s1[] = "processor";
2047 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2048 CHECK_LINE;
2049 char *p = strchr(buf + sizeof(s1) - 1, ':');
2050 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002051 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002052 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2053 threadInfo[num_avail][osIdIndex] = val;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002054#if KMP_OS_LINUX && USE_SYSFS_INFO
2055 char path[256];
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002056 KMP_SNPRINTF(path, sizeof(path),
Jim Cownie181b4bb2013-12-23 17:28:57 +00002057 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2058 threadInfo[num_avail][osIdIndex]);
2059 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2060
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002061 KMP_SNPRINTF(path, sizeof(path),
Jim Cownie181b4bb2013-12-23 17:28:57 +00002062 "/sys/devices/system/cpu/cpu%u/topology/core_id",
2063 threadInfo[num_avail][osIdIndex]);
2064 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002065 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002066#else
Jim Cownie5e8470a2013-09-27 10:38:44 +00002067 }
2068 char s2[] = "physical id";
2069 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2070 CHECK_LINE;
2071 char *p = strchr(buf + sizeof(s2) - 1, ':');
2072 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002073 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002074 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2075 threadInfo[num_avail][pkgIdIndex] = val;
2076 continue;
2077 }
2078 char s3[] = "core id";
2079 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2080 CHECK_LINE;
2081 char *p = strchr(buf + sizeof(s3) - 1, ':');
2082 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002083 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002084 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2085 threadInfo[num_avail][coreIdIndex] = val;
2086 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002087#endif // KMP_OS_LINUX && USE_SYSFS_INFO
Jim Cownie5e8470a2013-09-27 10:38:44 +00002088 }
2089 char s4[] = "thread id";
2090 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2091 CHECK_LINE;
2092 char *p = strchr(buf + sizeof(s4) - 1, ':');
2093 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002094 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002095 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2096 threadInfo[num_avail][threadIdIndex] = val;
2097 continue;
2098 }
2099 unsigned level;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002100 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002101 CHECK_LINE;
2102 char *p = strchr(buf + sizeof(s4) - 1, ':');
2103 unsigned val;
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002104 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002105 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2106 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2107 threadInfo[num_avail][nodeIdIndex + level] = val;
2108 continue;
2109 }
2110
2111 //
2112 // We didn't recognize the leading token on the line.
2113 // There are lots of leading tokens that we don't recognize -
2114 // if the line isn't empty, go on to the next line.
2115 //
2116 if ((*buf != 0) && (*buf != '\n')) {
2117 //
2118 // If the line is longer than the buffer, read characters
2119 // until we find a newline.
2120 //
2121 if (long_line) {
2122 int ch;
2123 while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2124 }
2125 continue;
2126 }
2127
2128 //
2129 // A newline has signalled the end of the processor record.
2130 // Check that there aren't too many procs specified.
2131 //
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002132 if ((int)num_avail == __kmp_xproc) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002133 CLEANUP_THREAD_INFO;
2134 *msg_id = kmp_i18n_str_TooManyEntries;
2135 return -1;
2136 }
2137
2138 //
2139 // Check for missing fields. The osId field must be there, and we
2140 // currently require that the physical id field is specified, also.
2141 //
2142 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2143 CLEANUP_THREAD_INFO;
2144 *msg_id = kmp_i18n_str_MissingProcField;
2145 return -1;
2146 }
2147 if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2148 CLEANUP_THREAD_INFO;
2149 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2150 return -1;
2151 }
2152
2153 //
2154 // Skip this proc if it is not included in the machine model.
2155 //
2156 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2157 INIT_PROC_INFO(threadInfo[num_avail]);
2158 continue;
2159 }
2160
2161 //
2162 // We have a successful parse of this proc's info.
2163 // Increment the counter, and prepare for the next proc.
2164 //
2165 num_avail++;
2166 KMP_ASSERT(num_avail <= num_records);
2167 INIT_PROC_INFO(threadInfo[num_avail]);
2168 }
2169 continue;
2170
2171 no_val:
2172 CLEANUP_THREAD_INFO;
2173 *msg_id = kmp_i18n_str_MissingValCpuinfo;
2174 return -1;
2175
2176 dup_field:
2177 CLEANUP_THREAD_INFO;
2178 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2179 return -1;
2180 }
2181 *line = 0;
2182
2183# if KMP_MIC && REDUCE_TEAM_SIZE
2184 unsigned teamSize = 0;
2185# endif // KMP_MIC && REDUCE_TEAM_SIZE
2186
2187 // check for num_records == __kmp_xproc ???
2188
2189 //
2190 // If there's only one thread context to bind to, form an Address object
2191 // with depth 1 and return immediately (or, if affinity is off, set
2192 // address2os to NULL and return).
2193 //
2194 // If it is configured to omit the package level when there is only a
2195 // single package, the logic at the end of this routine won't work if
2196 // there is only a single thread - it would try to form an Address
2197 // object with depth 0.
2198 //
2199 KMP_ASSERT(num_avail > 0);
2200 KMP_ASSERT(num_avail <= num_records);
2201 if (num_avail == 1) {
2202 __kmp_ncores = 1;
2203 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002204 if (__kmp_affinity_verbose) {
2205 if (! KMP_AFFINITY_CAPABLE()) {
2206 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2207 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2208 KMP_INFORM(Uniform, "KMP_AFFINITY");
2209 }
2210 else {
2211 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2212 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2213 fullMask);
2214 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2215 if (__kmp_affinity_respect_mask) {
2216 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2217 } else {
2218 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2219 }
2220 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2221 KMP_INFORM(Uniform, "KMP_AFFINITY");
2222 }
2223 int index;
2224 kmp_str_buf_t buf;
2225 __kmp_str_buf_init(&buf);
2226 __kmp_str_buf_print(&buf, "1");
2227 for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2228 __kmp_str_buf_print(&buf, " x 1");
2229 }
2230 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2231 __kmp_str_buf_free(&buf);
2232 }
2233
2234 if (__kmp_affinity_type == affinity_none) {
2235 CLEANUP_THREAD_INFO;
2236 return 0;
2237 }
2238
2239 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2240 Address addr(1);
2241 addr.labels[0] = threadInfo[0][pkgIdIndex];
2242 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2243
2244 if (__kmp_affinity_gran_levels < 0) {
2245 __kmp_affinity_gran_levels = 0;
2246 }
2247
2248 if (__kmp_affinity_verbose) {
2249 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2250 }
2251
2252 CLEANUP_THREAD_INFO;
2253 return 1;
2254 }
2255
2256 //
2257 // Sort the threadInfo table by physical Id.
2258 //
2259 qsort(threadInfo, num_avail, sizeof(*threadInfo),
2260 __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2261
2262 //
2263 // The table is now sorted by pkgId / coreId / threadId, but we really
2264 // don't know the radix of any of the fields. pkgId's may be sparsely
2265 // assigned among the chips on a system. Although coreId's are usually
2266 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2267 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2268 //
2269 // For that matter, we don't know what coresPerPkg and threadsPerCore
2270 // (or the total # packages) are at this point - we want to determine
2271 // that now. We only have an upper bound on the first two figures.
2272 //
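    //
    // For illustration (hypothetical values): a sorted table of
    //
    //     pkgId = 0, coreId = 0    pkgId = 0, coreId = 1
    //     pkgId = 3, coreId = 0    pkgId = 3, coreId = 1
    //
    // yields nPackages = 2 and nCoresPerPkg = 2, even though the pkgId values
    // (0 and 3) are not contiguous.
    //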
2273 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2274 * sizeof(unsigned));
2275 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2276 * sizeof(unsigned));
2277 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2278 * sizeof(unsigned));
2279 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2280 * sizeof(unsigned));
2281
2282 bool assign_thread_ids = false;
2283 unsigned threadIdCt;
2284 unsigned index;
2285
2286 restart_radix_check:
2287 threadIdCt = 0;
2288
2289 //
2290 // Initialize the counter arrays with data from threadInfo[0].
2291 //
2292 if (assign_thread_ids) {
2293 if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2294 threadInfo[0][threadIdIndex] = threadIdCt++;
2295 }
2296 else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2297 threadIdCt = threadInfo[0][threadIdIndex] + 1;
2298 }
2299 }
2300 for (index = 0; index <= maxIndex; index++) {
2301 counts[index] = 1;
2302 maxCt[index] = 1;
2303 totals[index] = 1;
2304        lastId[index] = threadInfo[0][index];
2305 }
2306
2307 //
2308 // Run through the rest of the OS procs.
2309 //
2310 for (i = 1; i < num_avail; i++) {
2311 //
2312 // Find the most significant index whose id differs
2313 // from the id for the previous OS proc.
2314 //
2315 for (index = maxIndex; index >= threadIdIndex; index--) {
2316 if (assign_thread_ids && (index == threadIdIndex)) {
2317 //
2318 // Auto-assign the thread id field if it wasn't specified.
2319 //
2320 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2321 threadInfo[i][threadIdIndex] = threadIdCt++;
2322 }
2323
2324 //
2325                // Apparently the thread id field was specified for some
2326 // entries and not others. Start the thread id counter
2327 // off at the next higher thread id.
2328 //
2329 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2330 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2331 }
2332 }
2333 if (threadInfo[i][index] != lastId[index]) {
2334 //
2335 // Run through all indices which are less significant,
2336 // and reset the counts to 1.
2337 //
2338 // At all levels up to and including index, we need to
2339 // increment the totals and record the last id.
2340 //
2341 unsigned index2;
2342 for (index2 = threadIdIndex; index2 < index; index2++) {
2343 totals[index2]++;
2344 if (counts[index2] > maxCt[index2]) {
2345 maxCt[index2] = counts[index2];
2346 }
2347 counts[index2] = 1;
2348 lastId[index2] = threadInfo[i][index2];
2349 }
2350 counts[index]++;
2351 totals[index]++;
2352 lastId[index] = threadInfo[i][index];
2353
2354 if (assign_thread_ids && (index > threadIdIndex)) {
2355
2356# if KMP_MIC && REDUCE_TEAM_SIZE
2357 //
2358 // The default team size is the total #threads in the machine
2359 // minus 1 thread for every core that has 3 or more threads.
2360 //
2361 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2362# endif // KMP_MIC && REDUCE_TEAM_SIZE
2363
2364 //
2365 // Restart the thread counter, as we are on a new core.
2366 //
2367 threadIdCt = 0;
2368
2369 //
2370 // Auto-assign the thread id field if it wasn't specified.
2371 //
2372 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2373 threadInfo[i][threadIdIndex] = threadIdCt++;
2374 }
2375
2376 //
2377                    // Apparently the thread id field was specified for some
2378 // entries and not others. Start the thread id counter
2379 // off at the next higher thread id.
2380 //
2381 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2382 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2383 }
2384 }
2385 break;
2386 }
2387 }
2388 if (index < threadIdIndex) {
2389 //
2390 // If thread ids were specified, it is an error if they are not
2391            // unique. Also, check that we haven't already restarted the
2392 // loop (to be safe - shouldn't need to).
2393 //
2394 if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2395 || assign_thread_ids) {
2396 __kmp_free(lastId);
2397 __kmp_free(totals);
2398 __kmp_free(maxCt);
2399 __kmp_free(counts);
2400 CLEANUP_THREAD_INFO;
2401 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2402 return -1;
2403 }
2404
2405 //
2406            // If the thread ids were not specified and we see duplicate
2407            // entries, start the loop over and
2408 // assign the thread ids manually.
2409 //
2410 assign_thread_ids = true;
2411 goto restart_radix_check;
2412 }
2413 }
2414
2415# if KMP_MIC && REDUCE_TEAM_SIZE
2416 //
2417 // The default team size is the total #threads in the machine
2418 // minus 1 thread for every core that has 3 or more threads.
2419 //
2420 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2421# endif // KMP_MIC && REDUCE_TEAM_SIZE
2422
2423 for (index = threadIdIndex; index <= maxIndex; index++) {
2424 if (counts[index] > maxCt[index]) {
2425 maxCt[index] = counts[index];
2426 }
2427 }
2428
2429 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2430 nCoresPerPkg = maxCt[coreIdIndex];
2431 nPackages = totals[pkgIdIndex];
2432
2433 //
2434 // Check to see if the machine topology is uniform
2435 //
2436 unsigned prod = totals[maxIndex];
2437 for (index = threadIdIndex; index < maxIndex; index++) {
2438 prod *= maxCt[index];
2439 }
2440 bool uniform = (prod == totals[threadIdIndex]);
2441
2442 //
2443 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +00002444 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +00002445 // nCoresPerPkg, & nPackages. Make sure all these vars are set
2446 // correctly, and return now if affinity is not enabled.
2447 //
Jim Cownie5e8470a2013-09-27 10:38:44 +00002448 __kmp_ncores = totals[coreIdIndex];
2449
2450 if (__kmp_affinity_verbose) {
2451 if (! KMP_AFFINITY_CAPABLE()) {
2452 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2453 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2454 if (uniform) {
2455 KMP_INFORM(Uniform, "KMP_AFFINITY");
2456 } else {
2457 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2458 }
2459 }
2460 else {
2461 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2462 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2463 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2464 if (__kmp_affinity_respect_mask) {
2465 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2466 } else {
2467 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2468 }
2469 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2470 if (uniform) {
2471 KMP_INFORM(Uniform, "KMP_AFFINITY");
2472 } else {
2473 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2474 }
2475 }
2476 kmp_str_buf_t buf;
2477 __kmp_str_buf_init(&buf);
2478
2479 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2480 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2481 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2482 }
2483 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2484 maxCt[threadIdIndex], __kmp_ncores);
2485
2486 __kmp_str_buf_free(&buf);
2487 }
2488
2489# if KMP_MIC && REDUCE_TEAM_SIZE
2490 //
2491 // Set the default team size.
2492 //
2493 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2494 __kmp_dflt_team_nth = teamSize;
2495 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2496 __kmp_dflt_team_nth));
2497 }
2498# endif // KMP_MIC && REDUCE_TEAM_SIZE
2499
2500 if (__kmp_affinity_type == affinity_none) {
2501 __kmp_free(lastId);
2502 __kmp_free(totals);
2503 __kmp_free(maxCt);
2504 __kmp_free(counts);
2505 CLEANUP_THREAD_INFO;
2506 return 0;
2507 }
2508
2509 //
2510 // Count the number of levels which have more nodes at that level than
2511 // at the parent's level (with there being an implicit root node of
2512    // at the parent's level (with an implicit root node above the
2513    // top level). This is equivalent to saying that there is at least
2514 // map, and the package level is always in the map.
2515 //
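    //
    // For example (hypothetical machine): with 2 packages x 4 cores x 1 HW
    // thread, totals[threadIdIndex] == totals[coreIdIndex] == 8 and
    // totals[pkgIdIndex] == 2, so the thread level is omitted and only the
    // core and package levels appear in the map (depth == 2).
    //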
2516 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2517 int level = 0;
2518 for (index = threadIdIndex; index < maxIndex; index++) {
2519 KMP_ASSERT(totals[index] >= totals[index + 1]);
2520 inMap[index] = (totals[index] > totals[index + 1]);
2521 }
2522 inMap[maxIndex] = (totals[maxIndex] > 1);
2523 inMap[pkgIdIndex] = true;
2524
2525 int depth = 0;
2526 for (index = threadIdIndex; index <= maxIndex; index++) {
2527 if (inMap[index]) {
2528 depth++;
2529 }
2530 }
2531 KMP_ASSERT(depth > 0);
2532
2533 //
2534 // Construct the data structure that is to be returned.
2535 //
2536 *address2os = (AddrUnsPair*)
2537 __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2538 int pkgLevel = -1;
2539 int coreLevel = -1;
2540 int threadLevel = -1;
2541
2542 for (i = 0; i < num_avail; ++i) {
2543 Address addr(depth);
2544 unsigned os = threadInfo[i][osIdIndex];
2545 int src_index;
2546 int dst_index = 0;
2547
2548 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2549 if (! inMap[src_index]) {
2550 continue;
2551 }
2552 addr.labels[dst_index] = threadInfo[i][src_index];
2553 if (src_index == pkgIdIndex) {
2554 pkgLevel = dst_index;
2555 }
2556 else if (src_index == coreIdIndex) {
2557 coreLevel = dst_index;
2558 }
2559 else if (src_index == threadIdIndex) {
2560 threadLevel = dst_index;
2561 }
2562 dst_index++;
2563 }
2564 (*address2os)[i] = AddrUnsPair(addr, os);
2565 }
2566
2567 if (__kmp_affinity_gran_levels < 0) {
2568 //
2569 // Set the granularity level based on what levels are modeled
2570 // in the machine topology map.
2571 //
2572 unsigned src_index;
2573 __kmp_affinity_gran_levels = 0;
2574 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2575 if (! inMap[src_index]) {
2576 continue;
2577 }
2578 switch (src_index) {
2579 case threadIdIndex:
2580 if (__kmp_affinity_gran > affinity_gran_thread) {
2581 __kmp_affinity_gran_levels++;
2582 }
2583
2584 break;
2585 case coreIdIndex:
2586 if (__kmp_affinity_gran > affinity_gran_core) {
2587 __kmp_affinity_gran_levels++;
2588 }
2589 break;
2590
2591 case pkgIdIndex:
2592 if (__kmp_affinity_gran > affinity_gran_package) {
2593 __kmp_affinity_gran_levels++;
2594 }
2595 break;
2596 }
2597 }
2598 }
2599
2600 if (__kmp_affinity_verbose) {
2601 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2602 coreLevel, threadLevel);
2603 }
2604
2605 __kmp_free(inMap);
2606 __kmp_free(lastId);
2607 __kmp_free(totals);
2608 __kmp_free(maxCt);
2609 __kmp_free(counts);
2610 CLEANUP_THREAD_INFO;
2611 return depth;
2612}
2613
2614
2615//
2616// Create and return a table of affinity masks, indexed by OS thread ID.
2617// This routine handles OR'ing together all the affinity masks of threads
2618// that are sufficiently close, if granularity > fine.
2619//
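//
// For example, with a granularity of "core" on a machine with 2 HW threads
// per core, the two OS procs on each core end up with identical masks (both
// bits set), and each such pair counts as one unique entry.
//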
2620static kmp_affin_mask_t *
2621__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2622 AddrUnsPair *address2os, unsigned numAddrs)
2623{
2624 //
2625 // First form a table of affinity masks in order of OS thread id.
2626 //
2627 unsigned depth;
2628 unsigned maxOsId;
2629 unsigned i;
2630
2631 KMP_ASSERT(numAddrs > 0);
2632 depth = address2os[0].first.depth;
2633
2634 maxOsId = 0;
2635 for (i = 0; i < numAddrs; i++) {
2636 unsigned osId = address2os[i].second;
2637 if (osId > maxOsId) {
2638 maxOsId = osId;
2639 }
2640 }
2641 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2642 (maxOsId + 1) * __kmp_affin_mask_size);
2643
2644 //
2645 // Sort the address2os table according to physical order. Doing so
2646 // will put all threads on the same core/package/node in consecutive
2647 // locations.
2648 //
2649 qsort(address2os, numAddrs, sizeof(*address2os),
2650 __kmp_affinity_cmp_Address_labels);
2651
2652 KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2653 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2654 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2655 }
2656 if (__kmp_affinity_gran_levels >= (int)depth) {
2657 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2658 && (__kmp_affinity_type != affinity_none))) {
2659 KMP_WARNING(AffThreadsMayMigrate);
2660 }
2661 }
2662
2663 //
2664 // Run through the table, forming the masks for all threads on each
2665 // core. Threads on the same core will have identical "Address"
2666 // objects, not considering the last level, which must be the thread
2667 // id. All threads on a core will appear consecutively.
2668 //
2669 unsigned unique = 0;
2670 unsigned j = 0; // index of 1st thread on core
2671 unsigned leader = 0;
2672 Address *leaderAddr = &(address2os[0].first);
2673 kmp_affin_mask_t *sum
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002674 = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002675 KMP_CPU_ZERO(sum);
2676 KMP_CPU_SET(address2os[0].second, sum);
2677 for (i = 1; i < numAddrs; i++) {
2678 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00002679 // If this thread is sufficiently close to the leader (within the
Jim Cownie5e8470a2013-09-27 10:38:44 +00002680 // granularity setting), then set the bit for this os thread in the
2681 // affinity mask for this group, and go on to the next thread.
2682 //
2683 if (leaderAddr->isClose(address2os[i].first,
2684 __kmp_affinity_gran_levels)) {
2685 KMP_CPU_SET(address2os[i].second, sum);
2686 continue;
2687 }
2688
2689 //
2690 // For every thread in this group, copy the mask to the thread's
2691 // entry in the osId2Mask table. Mark the first address as a
2692 // leader.
2693 //
2694 for (; j < i; j++) {
2695 unsigned osId = address2os[j].second;
2696 KMP_DEBUG_ASSERT(osId <= maxOsId);
2697 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2698 KMP_CPU_COPY(mask, sum);
2699 address2os[j].first.leader = (j == leader);
2700 }
2701 unique++;
2702
2703 //
2704 // Start a new mask.
2705 //
2706 leader = i;
2707 leaderAddr = &(address2os[i].first);
2708 KMP_CPU_ZERO(sum);
2709 KMP_CPU_SET(address2os[i].second, sum);
2710 }
2711
2712 //
2713 // For every thread in last group, copy the mask to the thread's
2714 // entry in the osId2Mask table.
2715 //
2716 for (; j < i; j++) {
2717 unsigned osId = address2os[j].second;
2718 KMP_DEBUG_ASSERT(osId <= maxOsId);
2719 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2720 KMP_CPU_COPY(mask, sum);
2721 address2os[j].first.leader = (j == leader);
2722 }
2723 unique++;
2724
2725 *maxIndex = maxOsId;
2726 *numUnique = unique;
2727 return osId2Mask;
2728}
2729
2730
2731//
2732// Shared state for the affinity proclist parsers. It's easier to declare these
2733// vars as file-static than to try to pass them through the calling sequence of
2734// the recursive-descent OMP_PLACES parser.
2735//
2736static kmp_affin_mask_t *newMasks;
2737static int numNewMasks;
2738static int nextNewMask;
2739
2740#define ADD_MASK(_mask) \
2741 { \
2742 if (nextNewMask >= numNewMasks) { \
2743 numNewMasks *= 2; \
2744 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2745 numNewMasks * __kmp_affin_mask_size); \
2746 } \
2747 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2748 nextNewMask++; \
2749 }
2750
2751#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2752 { \
2753 if (((_osId) > _maxOsId) || \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002754 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
Jim Cownie5e8470a2013-09-27 10:38:44 +00002755 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2756 && (__kmp_affinity_type != affinity_none))) { \
2757 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2758 } \
2759 } \
2760 else { \
2761 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2762 } \
2763 }
2764
2765
2766//
2767// Re-parse the proclist (for the explicit affinity type), and form the list
2768// of affinity newMasks indexed by gtid.
2769//
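//
// For illustration, a proclist of "0,3-5,{6,7},8-16:4" would produce the
// masks {0}, {3}, {4}, {5}, {6,7}, {8}, {12}, {16}, in that order.
//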
2770static void
2771__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2772 unsigned int *out_numMasks, const char *proclist,
2773 kmp_affin_mask_t *osId2Mask, int maxOsId)
2774{
2775 const char *scan = proclist;
2776 const char *next = proclist;
2777
2778 //
2779 // We use malloc() for the temporary mask vector,
2780 // so that we can use realloc() to extend it.
2781 //
2782 numNewMasks = 2;
2783 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2784 * __kmp_affin_mask_size);
2785 nextNewMask = 0;
2786 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2787 __kmp_affin_mask_size);
2788 int setSize = 0;
2789
2790 for (;;) {
2791 int start, end, stride;
2792
2793 SKIP_WS(scan);
2794 next = scan;
2795 if (*next == '\0') {
2796 break;
2797 }
2798
2799 if (*next == '{') {
2800 int num;
2801 setSize = 0;
2802 next++; // skip '{'
2803 SKIP_WS(next);
2804 scan = next;
2805
2806 //
2807 // Read the first integer in the set.
2808 //
2809 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2810 "bad proclist");
2811 SKIP_DIGITS(next);
2812 num = __kmp_str_to_int(scan, *next);
2813 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2814
2815 //
2816 // Copy the mask for that osId to the sum (union) mask.
2817 //
2818 if ((num > maxOsId) ||
2819 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2820 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2821 && (__kmp_affinity_type != affinity_none))) {
2822 KMP_WARNING(AffIgnoreInvalidProcID, num);
2823 }
2824 KMP_CPU_ZERO(sumMask);
2825 }
2826 else {
2827 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2828 setSize = 1;
2829 }
2830
2831 for (;;) {
2832 //
2833 // Check for end of set.
2834 //
2835 SKIP_WS(next);
2836 if (*next == '}') {
2837 next++; // skip '}'
2838 break;
2839 }
2840
2841 //
2842 // Skip optional comma.
2843 //
2844 if (*next == ',') {
2845 next++;
2846 }
2847 SKIP_WS(next);
2848
2849 //
2850 // Read the next integer in the set.
2851 //
2852 scan = next;
2853 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2854 "bad explicit proc list");
2855
2856 SKIP_DIGITS(next);
2857 num = __kmp_str_to_int(scan, *next);
2858 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2859
2860 //
2861 // Add the mask for that osId to the sum mask.
2862 //
2863 if ((num > maxOsId) ||
2864 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2865 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2866 && (__kmp_affinity_type != affinity_none))) {
2867 KMP_WARNING(AffIgnoreInvalidProcID, num);
2868 }
2869 }
2870 else {
2871 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2872 setSize++;
2873 }
2874 }
2875 if (setSize > 0) {
2876 ADD_MASK(sumMask);
2877 }
2878
2879 SKIP_WS(next);
2880 if (*next == ',') {
2881 next++;
2882 }
2883 scan = next;
2884 continue;
2885 }
2886
2887 //
2888 // Read the first integer.
2889 //
2890 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2891 SKIP_DIGITS(next);
2892 start = __kmp_str_to_int(scan, *next);
2893 KMP_ASSERT2(start >= 0, "bad explicit proc list");
2894 SKIP_WS(next);
2895
2896 //
2897 // If this isn't a range, then add a mask to the list and go on.
2898 //
2899 if (*next != '-') {
2900 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2901
2902 //
2903 // Skip optional comma.
2904 //
2905 if (*next == ',') {
2906 next++;
2907 }
2908 scan = next;
2909 continue;
2910 }
2911
2912 //
2913 // This is a range. Skip over the '-' and read in the 2nd int.
2914 //
2915 next++; // skip '-'
2916 SKIP_WS(next);
2917 scan = next;
2918 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2919 SKIP_DIGITS(next);
2920 end = __kmp_str_to_int(scan, *next);
2921 KMP_ASSERT2(end >= 0, "bad explicit proc list");
2922
2923 //
2924 // Check for a stride parameter
2925 //
2926 stride = 1;
2927 SKIP_WS(next);
2928 if (*next == ':') {
2929 //
2930            // A stride is specified.  Skip over the ':' and read the 3rd int.
2931 //
2932 int sign = +1;
2933 next++; // skip ':'
2934 SKIP_WS(next);
2935 scan = next;
2936 if (*next == '-') {
2937 sign = -1;
2938 next++;
2939 SKIP_WS(next);
2940 scan = next;
2941 }
2942 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2943 "bad explicit proc list");
2944 SKIP_DIGITS(next);
2945 stride = __kmp_str_to_int(scan, *next);
2946 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2947 stride *= sign;
2948 }
2949
2950 //
2951 // Do some range checks.
2952 //
2953 KMP_ASSERT2(stride != 0, "bad explicit proc list");
2954 if (stride > 0) {
2955 KMP_ASSERT2(start <= end, "bad explicit proc list");
2956 }
2957 else {
2958 KMP_ASSERT2(start >= end, "bad explicit proc list");
2959 }
2960 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2961
2962 //
2963 // Add the mask for each OS proc # to the list.
2964 //
2965 if (stride > 0) {
2966 do {
2967 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2968 start += stride;
2969 } while (start <= end);
2970 }
2971 else {
2972 do {
2973 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2974 start += stride;
2975 } while (start >= end);
2976 }
2977
2978 //
2979 // Skip optional comma.
2980 //
2981 SKIP_WS(next);
2982 if (*next == ',') {
2983 next++;
2984 }
2985 scan = next;
2986 }
2987
2988 *out_numMasks = nextNewMask;
2989 if (nextNewMask == 0) {
2990 *out_masks = NULL;
2991 KMP_INTERNAL_FREE(newMasks);
2992 return;
2993 }
2994 *out_masks
2995 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00002996 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002997 __kmp_free(sumMask);
2998 KMP_INTERNAL_FREE(newMasks);
2999}
3000
3001
3002# if OMP_40_ENABLED
3003
3004/*-----------------------------------------------------------------------------
3005
3006Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3007places. Again, here is the grammar:
3008
3009place_list := place
3010place_list := place , place_list
3011place := num
3012place := place : num
3013place := place : num : signed
3014place := { subplacelist }
3015place := { subplace_list }
3016subplace_list := subplace
3017subplace_list := subplace , subplace_list
3018subplace := num
3019subplace := num : num
3020subplace := num : num : signed
3021signed := num
3022signed := + signed
3023signed := - signed
3024
3025-----------------------------------------------------------------------------*/
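//
// For illustration, the place list "{0:4},{4:4}" describes two places, OS
// procs {0,1,2,3} and {4,5,6,7}; the equivalent form "{0:4}:2:4" uses the
// count and stride syntax to derive the second place from the first.
//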
3026
3027static void
3028__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3029 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3030{
3031 const char *next;
3032
3033 for (;;) {
3034 int start, count, stride, i;
3035
3036 //
3037 // Read in the starting proc id
3038 //
3039 SKIP_WS(*scan);
3040 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3041 "bad explicit places list");
3042 next = *scan;
3043 SKIP_DIGITS(next);
3044 start = __kmp_str_to_int(*scan, *next);
3045 KMP_ASSERT(start >= 0);
3046 *scan = next;
3047
3048 //
3049 // valid follow sets are ',' ':' and '}'
3050 //
3051 SKIP_WS(*scan);
3052 if (**scan == '}' || **scan == ',') {
3053 if ((start > maxOsId) ||
3054 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3055 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3056 && (__kmp_affinity_type != affinity_none))) {
3057 KMP_WARNING(AffIgnoreInvalidProcID, start);
3058 }
3059 }
3060 else {
3061 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3062 (*setSize)++;
3063 }
3064 if (**scan == '}') {
3065 break;
3066 }
3067 (*scan)++; // skip ','
3068 continue;
3069 }
3070 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3071 (*scan)++; // skip ':'
3072
3073 //
3074 // Read count parameter
3075 //
3076 SKIP_WS(*scan);
3077 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3078 "bad explicit places list");
3079 next = *scan;
3080 SKIP_DIGITS(next);
3081 count = __kmp_str_to_int(*scan, *next);
3082 KMP_ASSERT(count >= 0);
3083 *scan = next;
3084
3085 //
3086 // valid follow sets are ',' ':' and '}'
3087 //
3088 SKIP_WS(*scan);
3089 if (**scan == '}' || **scan == ',') {
3090 for (i = 0; i < count; i++) {
3091 if ((start > maxOsId) ||
3092 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3093 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3094 && (__kmp_affinity_type != affinity_none))) {
3095 KMP_WARNING(AffIgnoreInvalidProcID, start);
3096 }
3097 break; // don't proliferate warnings for large count
3098 }
3099 else {
3100 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3101 start++;
3102 (*setSize)++;
3103 }
3104 }
3105 if (**scan == '}') {
3106 break;
3107 }
3108 (*scan)++; // skip ','
3109 continue;
3110 }
3111 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3112 (*scan)++; // skip ':'
3113
3114 //
3115 // Read stride parameter
3116 //
3117 int sign = +1;
3118 for (;;) {
3119 SKIP_WS(*scan);
3120 if (**scan == '+') {
3121 (*scan)++; // skip '+'
3122 continue;
3123 }
3124 if (**scan == '-') {
3125 sign *= -1;
3126 (*scan)++; // skip '-'
3127 continue;
3128 }
3129 break;
3130 }
3131 SKIP_WS(*scan);
3132 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3133 "bad explicit places list");
3134 next = *scan;
3135 SKIP_DIGITS(next);
3136 stride = __kmp_str_to_int(*scan, *next);
3137 KMP_ASSERT(stride >= 0);
3138 *scan = next;
3139 stride *= sign;
3140
3141 //
3142 // valid follow sets are ',' and '}'
3143 //
3144 SKIP_WS(*scan);
3145 if (**scan == '}' || **scan == ',') {
3146 for (i = 0; i < count; i++) {
3147 if ((start > maxOsId) ||
3148 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3149 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3150 && (__kmp_affinity_type != affinity_none))) {
3151 KMP_WARNING(AffIgnoreInvalidProcID, start);
3152 }
3153 break; // don't proliferate warnings for large count
3154 }
3155 else {
3156 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3157 start += stride;
3158 (*setSize)++;
3159 }
3160 }
3161 if (**scan == '}') {
3162 break;
3163 }
3164 (*scan)++; // skip ','
3165 continue;
3166 }
3167
3168 KMP_ASSERT2(0, "bad explicit places list");
3169 }
3170}
3171
3172
3173static void
3174__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3175 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3176{
3177 const char *next;
3178
3179 //
3180 // valid follow sets are '{' '!' and num
3181 //
3182 SKIP_WS(*scan);
3183 if (**scan == '{') {
3184 (*scan)++; // skip '{'
3185 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3186 setSize);
3187 KMP_ASSERT2(**scan == '}', "bad explicit places list");
3188 (*scan)++; // skip '}'
3189 }
3190 else if (**scan == '!') {
3191 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3192 KMP_CPU_COMPLEMENT(tempMask);
3193 (*scan)++; // skip '!'
3194 }
3195 else if ((**scan >= '0') && (**scan <= '9')) {
3196 next = *scan;
3197 SKIP_DIGITS(next);
3198 int num = __kmp_str_to_int(*scan, *next);
3199 KMP_ASSERT(num >= 0);
3200 if ((num > maxOsId) ||
3201 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3202 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3203 && (__kmp_affinity_type != affinity_none))) {
3204 KMP_WARNING(AffIgnoreInvalidProcID, num);
3205 }
3206 }
3207 else {
3208 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3209 (*setSize)++;
3210 }
3211 *scan = next; // skip num
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003212 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003213 else {
3214 KMP_ASSERT2(0, "bad explicit places list");
3215 }
3216}
3217
3218
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003219//static void
3220void
Jim Cownie5e8470a2013-09-27 10:38:44 +00003221__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3222 unsigned int *out_numMasks, const char *placelist,
3223 kmp_affin_mask_t *osId2Mask, int maxOsId)
3224{
3225 const char *scan = placelist;
3226 const char *next = placelist;
3227
3228 numNewMasks = 2;
3229 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3230 * __kmp_affin_mask_size);
3231 nextNewMask = 0;
3232
3233 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3234 __kmp_affin_mask_size);
3235 KMP_CPU_ZERO(tempMask);
3236 int setSize = 0;
3237
3238 for (;;) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003239 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3240
3241 //
3242 // valid follow sets are ',' ':' and EOL
3243 //
3244 SKIP_WS(scan);
3245 if (*scan == '\0' || *scan == ',') {
3246 if (setSize > 0) {
3247 ADD_MASK(tempMask);
3248 }
3249 KMP_CPU_ZERO(tempMask);
3250 setSize = 0;
3251 if (*scan == '\0') {
3252 break;
3253 }
3254 scan++; // skip ','
3255 continue;
3256 }
3257
3258 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3259 scan++; // skip ':'
3260
3261 //
3262 // Read count parameter
3263 //
3264 SKIP_WS(scan);
3265 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3266 "bad explicit places list");
3267 next = scan;
3268 SKIP_DIGITS(next);
Jim Cownie181b4bb2013-12-23 17:28:57 +00003269 int count = __kmp_str_to_int(scan, *next);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003270 KMP_ASSERT(count >= 0);
3271 scan = next;
3272
3273 //
3274 // valid follow sets are ',' ':' and EOL
3275 //
3276 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003277 int stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003278 if (*scan == '\0' || *scan == ',') {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003279 stride = +1;
3280 }
3281 else {
3282 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3283 scan++; // skip ':'
Jim Cownie5e8470a2013-09-27 10:38:44 +00003284
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003285 //
3286 // Read stride parameter
3287 //
3288 int sign = +1;
3289 for (;;) {
3290 SKIP_WS(scan);
3291 if (*scan == '+') {
3292 scan++; // skip '+'
3293 continue;
3294 }
3295 if (*scan == '-') {
3296 sign *= -1;
3297 scan++; // skip '-'
3298 continue;
3299 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003300 break;
3301 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003302 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003303 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3304 "bad explicit places list");
3305 next = scan;
3306 SKIP_DIGITS(next);
3307 stride = __kmp_str_to_int(scan, *next);
3308 KMP_DEBUG_ASSERT(stride >= 0);
3309 scan = next;
3310 stride *= sign;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003311 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003312
3313 if (stride > 0) {
3314 int i;
3315 for (i = 0; i < count; i++) {
3316 int j;
3317 if (setSize == 0) {
3318 break;
3319 }
3320 ADD_MASK(tempMask);
3321 setSize = 0;
3322 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003323 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3324 KMP_CPU_CLR(j, tempMask);
3325 }
3326 else if ((j > maxOsId) ||
3327 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
Andrey Churbanov16a14322015-03-10 09:34:38 +00003328 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3329 && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003330 KMP_WARNING(AffIgnoreInvalidProcID, j);
3331 }
3332 KMP_CPU_CLR(j, tempMask);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003333 }
3334 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003335 KMP_CPU_SET(j, tempMask);
3336 setSize++;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003337 }
3338 }
3339 for (; j >= 0; j--) {
3340 KMP_CPU_CLR(j, tempMask);
3341 }
3342 }
3343 }
3344 else {
3345 int i;
3346 for (i = 0; i < count; i++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003347 int j;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003348 if (setSize == 0) {
3349 break;
3350 }
3351 ADD_MASK(tempMask);
3352 setSize = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003353 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003354 j++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003355 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3356 KMP_CPU_CLR(j, tempMask);
3357 }
3358 else if ((j > maxOsId) ||
3359 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
Andrey Churbanov16a14322015-03-10 09:34:38 +00003360 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3361 && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003362 KMP_WARNING(AffIgnoreInvalidProcID, j);
3363 }
3364 KMP_CPU_CLR(j, tempMask);
3365 }
3366 else {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003367 KMP_CPU_SET(j, tempMask);
3368 setSize++;
3369 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003370 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003371 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003372 KMP_CPU_CLR(j, tempMask);
3373 }
3374 }
3375 }
3376 KMP_CPU_ZERO(tempMask);
3377 setSize = 0;
3378
3379 //
3380 // valid follow sets are ',' and EOL
3381 //
3382 SKIP_WS(scan);
3383 if (*scan == '\0') {
3384 break;
3385 }
3386 if (*scan == ',') {
3387 scan++; // skip ','
3388 continue;
3389 }
3390
3391 KMP_ASSERT2(0, "bad explicit places list");
3392 }
3393
3394 *out_numMasks = nextNewMask;
3395 if (nextNewMask == 0) {
3396 *out_masks = NULL;
3397 KMP_INTERNAL_FREE(newMasks);
3398 return;
3399 }
3400 *out_masks
3401 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00003402 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003403 __kmp_free(tempMask);
3404 KMP_INTERNAL_FREE(newMasks);
3405}
3406
3407# endif /* OMP_40_ENABLED */
3408
3409#undef ADD_MASK
3410#undef ADD_MASK_OSID
3411
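//
// Trim the topology map according to the requested thread-placement limits:
// keep only __kmp_place_num_cores cores per package, starting at
// __kmp_place_core_offset, and __kmp_place_num_threads_per_core HW thread
// contexts per core. Only uniform, 3-level (package/core/thread) topologies
// are supported; otherwise a warning is issued and the map is left unchanged.
//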
Jim Cownie5e8470a2013-09-27 10:38:44 +00003412static void
3413__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3414{
3415 if ( __kmp_place_num_cores == 0 ) {
3416 if ( __kmp_place_num_threads_per_core == 0 ) {
3417 return; // no cores limiting actions requested, exit
3418 }
3419 __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3420 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003421 if ( !__kmp_affinity_uniform_topology() ) {
3422 KMP_WARNING( AffThrPlaceNonUniform );
3423 return; // don't support non-uniform topology
3424 }
3425 if ( depth != 3 ) {
3426 KMP_WARNING( AffThrPlaceNonThreeLevel );
3427 return; // don't support not-3-level topology
Jim Cownie5e8470a2013-09-27 10:38:44 +00003428 }
3429 if ( __kmp_place_num_threads_per_core == 0 ) {
3430 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3431 }
Andrey Churbanov12875572015-03-10 09:00:36 +00003432 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003433 KMP_WARNING( AffThrPlaceManyCores );
3434 return;
3435 }
3436
3437 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3438 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3439 int i, j, k, n_old = 0, n_new = 0;
3440 for ( i = 0; i < nPackages; ++i ) {
3441 for ( j = 0; j < nCoresPerPkg; ++j ) {
Andrey Churbanov12875572015-03-10 09:00:36 +00003442 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003443 n_old += __kmp_nThreadsPerCore; // skip not-requested core
3444 } else {
3445 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
Andrey Churbanov12875572015-03-10 09:00:36 +00003446 if ( k < __kmp_place_num_threads_per_core ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003447 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location
3448 n_new++;
3449 }
3450 n_old++;
3451 }
3452 }
3453 }
3454 }
3455 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3456 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3457 __kmp_avail_proc = n_new; // correct avail_proc
3458 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3459
3460 __kmp_free( *pAddr );
3461 *pAddr = newAddr; // replace old topology with new one
3462}
3463
Jim Cownie5e8470a2013-09-27 10:38:44 +00003464
3465static AddrUnsPair *address2os = NULL;
3466static int * procarr = NULL;
3467static int __kmp_aff_depth = 0;
3468
3469static void
3470__kmp_aux_affinity_initialize(void)
3471{
3472 if (__kmp_affinity_masks != NULL) {
3473 KMP_ASSERT(fullMask != NULL);
3474 return;
3475 }
3476
3477 //
3478 // Create the "full" mask - this defines all of the processors that we
3479 // consider to be in the machine model. If respect is set, then it is
3480 // the initialization thread's affinity mask. Otherwise, it is all
3481 // processors that we know about on the machine.
3482 //
3483 if (fullMask == NULL) {
3484 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3485 }
3486 if (KMP_AFFINITY_CAPABLE()) {
3487 if (__kmp_affinity_respect_mask) {
3488 __kmp_get_system_affinity(fullMask, TRUE);
3489
3490 //
3491 // Count the number of available processors.
3492 //
3493 unsigned i;
3494 __kmp_avail_proc = 0;
3495 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3496 if (! KMP_CPU_ISSET(i, fullMask)) {
3497 continue;
3498 }
3499 __kmp_avail_proc++;
3500 }
3501 if (__kmp_avail_proc > __kmp_xproc) {
3502 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3503 && (__kmp_affinity_type != affinity_none))) {
3504 KMP_WARNING(ErrorInitializeAffinity);
3505 }
3506 __kmp_affinity_type = affinity_none;
Andrey Churbanov1f037e42015-03-10 09:15:26 +00003507 KMP_AFFINITY_DISABLE();
Jim Cownie5e8470a2013-09-27 10:38:44 +00003508 return;
3509 }
3510 }
3511 else {
3512 __kmp_affinity_entire_machine_mask(fullMask);
3513 __kmp_avail_proc = __kmp_xproc;
3514 }
3515 }
3516
3517 int depth = -1;
3518 kmp_i18n_id_t msg_id = kmp_i18n_null;
3519
3520 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00003521 // For backward compatibility, setting KMP_CPUINFO_FILE =>
Jim Cownie5e8470a2013-09-27 10:38:44 +00003522 // KMP_TOPOLOGY_METHOD=cpuinfo
3523 //
3524 if ((__kmp_cpuinfo_file != NULL) &&
3525 (__kmp_affinity_top_method == affinity_top_method_all)) {
3526 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3527 }
3528
3529 if (__kmp_affinity_top_method == affinity_top_method_all) {
3530 //
3531 // In the default code path, errors are not fatal - we just try using
3532 // another method. We only emit a warning message if affinity is on,
3533        // or the verbose flag is set, and the nowarnings flag was not set.
3534 //
3535 const char *file_name = NULL;
3536 int line = 0;
3537
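        //
        // Discovery methods are tried in order until one produces a topology:
        // x2APIC ids, then legacy APIC ids (x86 only), then /proc/cpuinfo
        // (Linux only), then Windows processor groups (when more than one
        // group exists), and finally the flat OS-proc-id map as a last resort.
        //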
3538# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3539
3540 if (__kmp_affinity_verbose) {
3541 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3542 }
3543
3544 file_name = NULL;
3545 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3546 if (depth == 0) {
3547 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3548 KMP_ASSERT(address2os == NULL);
3549 return;
3550 }
3551
3552 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003553 if (__kmp_affinity_verbose) {
3554 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003555 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3556 KMP_I18N_STR(DecodingLegacyAPIC));
3557 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003558 else {
3559 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3560 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003561 }
3562
3563 file_name = NULL;
3564 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3565 if (depth == 0) {
3566 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3567 KMP_ASSERT(address2os == NULL);
3568 return;
3569 }
3570 }
3571
3572# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3573
3574# if KMP_OS_LINUX
3575
3576 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003577 if (__kmp_affinity_verbose) {
3578 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003579 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3580 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003581 else {
3582 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3583 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003584 }
3585
3586 FILE *f = fopen("/proc/cpuinfo", "r");
3587 if (f == NULL) {
3588 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3589 }
3590 else {
3591 file_name = "/proc/cpuinfo";
3592 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3593 fclose(f);
3594 if (depth == 0) {
3595 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3596 KMP_ASSERT(address2os == NULL);
3597 return;
3598 }
3599 }
3600 }
3601
3602# endif /* KMP_OS_LINUX */
3603
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003604# if KMP_GROUP_AFFINITY
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003605
3606 if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3607 if (__kmp_affinity_verbose) {
3608 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3609 }
3610
3611 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3612 KMP_ASSERT(depth != 0);
3613 }
3614
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003615# endif /* KMP_GROUP_AFFINITY */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003616
Jim Cownie5e8470a2013-09-27 10:38:44 +00003617 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003618 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003619 if (file_name == NULL) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003620 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003621 }
3622 else if (line == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003623 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003624 }
3625 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003626 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003627 }
3628 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003629 // FIXME - print msg if msg_id = kmp_i18n_null ???
Jim Cownie5e8470a2013-09-27 10:38:44 +00003630
3631 file_name = "";
3632 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3633 if (depth == 0) {
3634 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3635 KMP_ASSERT(address2os == NULL);
3636 return;
3637 }
3638 KMP_ASSERT(depth > 0);
3639 KMP_ASSERT(address2os != NULL);
3640 }
3641 }
3642
3643 //
3644    // If the user has specified that a particular topology discovery method
3645 // is to be used, then we abort if that method fails. The exception is
3646 // group affinity, which might have been implicitly set.
3647 //
3648
3649# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3650
3651 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3652 if (__kmp_affinity_verbose) {
3653 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3654 KMP_I18N_STR(Decodingx2APIC));
3655 }
3656
3657 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3658 if (depth == 0) {
3659 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3660 KMP_ASSERT(address2os == NULL);
3661 return;
3662 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003663 if (depth < 0) {
3664 KMP_ASSERT(msg_id != kmp_i18n_null);
3665 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3666 }
3667 }
3668 else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3669 if (__kmp_affinity_verbose) {
3670 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3671 KMP_I18N_STR(DecodingLegacyAPIC));
3672 }
3673
3674 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3675 if (depth == 0) {
3676 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3677 KMP_ASSERT(address2os == NULL);
3678 return;
3679 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003680 if (depth < 0) {
3681 KMP_ASSERT(msg_id != kmp_i18n_null);
3682 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3683 }
3684 }
3685
3686# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3687
3688 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3689 const char *filename;
3690 if (__kmp_cpuinfo_file != NULL) {
3691 filename = __kmp_cpuinfo_file;
3692 }
3693 else {
3694 filename = "/proc/cpuinfo";
3695 }
3696
3697 if (__kmp_affinity_verbose) {
3698 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3699 }
3700
3701 FILE *f = fopen(filename, "r");
3702 if (f == NULL) {
3703 int code = errno;
3704 if (__kmp_cpuinfo_file != NULL) {
3705 __kmp_msg(
3706 kmp_ms_fatal,
3707 KMP_MSG(CantOpenFileForReading, filename),
3708 KMP_ERR(code),
3709 KMP_HNT(NameComesFrom_CPUINFO_FILE),
3710 __kmp_msg_null
3711 );
3712 }
3713 else {
3714 __kmp_msg(
3715 kmp_ms_fatal,
3716 KMP_MSG(CantOpenFileForReading, filename),
3717 KMP_ERR(code),
3718 __kmp_msg_null
3719 );
3720 }
3721 }
3722 int line = 0;
3723 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3724 fclose(f);
3725 if (depth < 0) {
3726 KMP_ASSERT(msg_id != kmp_i18n_null);
3727 if (line > 0) {
3728 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3729 }
3730 else {
3731 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3732 }
3733 }
3734 if (__kmp_affinity_type == affinity_none) {
3735 KMP_ASSERT(depth == 0);
3736 KMP_ASSERT(address2os == NULL);
3737 return;
3738 }
3739 }
3740
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003741# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00003742
3743 else if (__kmp_affinity_top_method == affinity_top_method_group) {
3744 if (__kmp_affinity_verbose) {
3745 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3746 }
3747
3748 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3749 KMP_ASSERT(depth != 0);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003750 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003751 KMP_ASSERT(msg_id != kmp_i18n_null);
3752 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003753 }
3754 }
3755
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003756# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +00003757
3758 else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3759 if (__kmp_affinity_verbose) {
3760 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3761 }
3762
3763 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3764 if (depth == 0) {
3765 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3766 KMP_ASSERT(address2os == NULL);
3767 return;
3768 }
3769 // should not fail
3770 KMP_ASSERT(depth > 0);
3771 KMP_ASSERT(address2os != NULL);
3772 }
3773
3774 if (address2os == NULL) {
3775 if (KMP_AFFINITY_CAPABLE()
3776 && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3777 && (__kmp_affinity_type != affinity_none)))) {
3778 KMP_WARNING(ErrorInitializeAffinity);
3779 }
3780 __kmp_affinity_type = affinity_none;
Andrey Churbanov1f037e42015-03-10 09:15:26 +00003781 KMP_AFFINITY_DISABLE();
Jim Cownie5e8470a2013-09-27 10:38:44 +00003782 return;
3783 }
3784
Jim Cownie5e8470a2013-09-27 10:38:44 +00003785 __kmp_apply_thread_places(&address2os, depth);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003786
3787 //
3788 // Create the table of masks, indexed by thread Id.
3789 //
3790 unsigned maxIndex;
3791 unsigned numUnique;
3792 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3793 address2os, __kmp_avail_proc);
3794 if (__kmp_affinity_gran_levels == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003795 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003796 }
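    //
    // osId2Mask is indexed by OS proc id; each entry is the mask of all procs
    // sharing that proc's granularity unit. numUnique is the number of
    // distinct masks, which equals __kmp_avail_proc when the granularity is
    // the finest level (__kmp_affinity_gran_levels == 0).
    //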
3797
3798 //
3799 // Set the childNums vector in all Address objects. This must be done
3800 // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3801 // which takes into account the setting of __kmp_affinity_compact.
3802 //
3803 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3804
3805 switch (__kmp_affinity_type) {
3806
3807 case affinity_explicit:
3808 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3809# if OMP_40_ENABLED
3810 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3811# endif
3812 {
3813 __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3814 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3815 maxIndex);
3816 }
3817# if OMP_40_ENABLED
3818 else {
3819 __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3820 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3821 maxIndex);
3822 }
3823# endif
3824 if (__kmp_affinity_num_masks == 0) {
3825 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3826 && (__kmp_affinity_type != affinity_none))) {
3827 KMP_WARNING(AffNoValidProcID);
3828 }
3829 __kmp_affinity_type = affinity_none;
3830 return;
3831 }
3832 break;
3833
3834 //
3835 // The other affinity types rely on sorting the Addresses according
3836 // to some permutation of the machine topology tree. Set
3837 // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3838 // then jump to a common code fragment to do the sort and create
3839 // the array of affinity masks.
3840 //
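    // For example, affinity_scatter reverses the compact value relative to
    // the topology depth (compact = depth - 1 - compact), while
    // affinity_compact only clamps it to depth - 1.
    //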
3841
3842 case affinity_logical:
3843 __kmp_affinity_compact = 0;
3844 if (__kmp_affinity_offset) {
3845 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3846 % __kmp_avail_proc;
3847 }
3848 goto sortAddresses;
3849
3850 case affinity_physical:
3851 if (__kmp_nThreadsPerCore > 1) {
3852 __kmp_affinity_compact = 1;
3853 if (__kmp_affinity_compact >= depth) {
3854 __kmp_affinity_compact = 0;
3855 }
3856 } else {
3857 __kmp_affinity_compact = 0;
3858 }
3859 if (__kmp_affinity_offset) {
3860 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3861 % __kmp_avail_proc;
3862 }
3863 goto sortAddresses;
3864
3865 case affinity_scatter:
3866 if (__kmp_affinity_compact >= depth) {
3867 __kmp_affinity_compact = 0;
3868 }
3869 else {
3870 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3871 }
3872 goto sortAddresses;
3873
3874 case affinity_compact:
3875 if (__kmp_affinity_compact >= depth) {
3876 __kmp_affinity_compact = depth - 1;
3877 }
3878 goto sortAddresses;
3879
Jim Cownie5e8470a2013-09-27 10:38:44 +00003880 case affinity_balanced:
Jonathan Peytoncaf09fe2015-05-27 23:27:33 +00003881 // Balanced works only for the case of a single package
Jim Cownie5e8470a2013-09-27 10:38:44 +00003882 if( nPackages > 1 ) {
3883 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3884 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3885 }
3886 __kmp_affinity_type = affinity_none;
3887 return;
3888 } else if( __kmp_affinity_uniform_topology() ) {
3889 break;
3890 } else { // Non-uniform topology
3891
3892 // Save the depth for further usage
3893 __kmp_aff_depth = depth;
3894
3895 // Number of hyper threads per core in HT machine
3896 int nth_per_core = __kmp_nThreadsPerCore;
3897
3898 int core_level;
3899 if( nth_per_core > 1 ) {
3900 core_level = depth - 2;
3901 } else {
3902 core_level = depth - 1;
3903 }
3904 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3905 int nproc = nth_per_core * ncores;
3906
3907 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3908 for( int i = 0; i < nproc; i++ ) {
3909 procarr[ i ] = -1;
3910 }
3911
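            // procarr[] is a dense (core, thread context) grid: slot
            // core * nth_per_core + thread holds the OS proc id placed there,
            // and stays -1 for contexts that do not exist on this non-uniform
            // machine.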
3912 for( int i = 0; i < __kmp_avail_proc; i++ ) {
3913 int proc = address2os[ i ].second;
3914 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3915 // If there is only one thread per core then depth == 2: level 0 - package,
3916 // level 1 - core.
3917 int level = depth - 1;
3918
3919                // Defaults for the nth_per_core == 1 case
3920 int thread = 0;
3921 int core = address2os[ i ].first.labels[ level ];
3922 // If the thread level exists, that is we have more than one thread context per core
3923 if( nth_per_core > 1 ) {
3924 thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3925 core = address2os[ i ].first.labels[ level - 1 ];
3926 }
3927 procarr[ core * nth_per_core + thread ] = proc;
3928 }
3929
3930 break;
3931 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003932
3933 sortAddresses:
3934 //
3935 // Allocate the gtid->affinity mask table.
3936 //
3937 if (__kmp_affinity_dups) {
3938 __kmp_affinity_num_masks = __kmp_avail_proc;
3939 }
3940 else {
3941 __kmp_affinity_num_masks = numUnique;
3942 }
3943
3944# if OMP_40_ENABLED
3945 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3946 && ( __kmp_affinity_num_places > 0 )
3947 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3948 __kmp_affinity_num_masks = __kmp_affinity_num_places;
3949 }
3950# endif
3951
3952 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3953 __kmp_affinity_num_masks * __kmp_affin_mask_size);
3954
3955 //
3956 // Sort the address2os table according to the current setting of
3957 // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3958 //
3959 qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3960 __kmp_affinity_cmp_Address_child_num);
3961 {
3962 int i;
3963 unsigned j;
3964 for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3965 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3966 continue;
3967 }
3968 unsigned osId = address2os[i].second;
3969 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3970 kmp_affin_mask_t *dest
3971 = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3972 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3973 KMP_CPU_COPY(dest, src);
3974 if (++j >= __kmp_affinity_num_masks) {
3975 break;
3976 }
3977 }
3978 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3979 }
3980 break;
3981
3982 default:
3983 KMP_ASSERT2(0, "Unexpected affinity setting");
3984 }
3985
3986 __kmp_free(osId2Mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003987 machine_hierarchy.init(address2os, __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003988}
3989
3990
3991void
3992__kmp_affinity_initialize(void)
3993{
3994 //
3995    // Much of the code above was written assuming that if a machine was not
3996 // affinity capable, then __kmp_affinity_type == affinity_none. We now
3997 // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3998 //
3999 // There are too many checks for __kmp_affinity_type == affinity_none
4000 // in this code. Instead of trying to change them all, check if
4001 // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4002 // affinity_none, call the real initialization routine, then restore
4003 // __kmp_affinity_type to affinity_disabled.
4004 //
4005 int disabled = (__kmp_affinity_type == affinity_disabled);
4006 if (! KMP_AFFINITY_CAPABLE()) {
4007 KMP_ASSERT(disabled);
4008 }
4009 if (disabled) {
4010 __kmp_affinity_type = affinity_none;
4011 }
4012 __kmp_aux_affinity_initialize();
4013 if (disabled) {
4014 __kmp_affinity_type = affinity_disabled;
4015 }
4016}
4017
4018
4019void
4020__kmp_affinity_uninitialize(void)
4021{
4022 if (__kmp_affinity_masks != NULL) {
4023 __kmp_free(__kmp_affinity_masks);
4024 __kmp_affinity_masks = NULL;
4025 }
4026 if (fullMask != NULL) {
4027 KMP_CPU_FREE(fullMask);
4028 fullMask = NULL;
4029 }
4030 __kmp_affinity_num_masks = 0;
4031# if OMP_40_ENABLED
4032 __kmp_affinity_num_places = 0;
4033# endif
4034 if (__kmp_affinity_proclist != NULL) {
4035 __kmp_free(__kmp_affinity_proclist);
4036 __kmp_affinity_proclist = NULL;
4037 }
4038 if( address2os != NULL ) {
4039 __kmp_free( address2os );
4040 address2os = NULL;
4041 }
4042 if( procarr != NULL ) {
4043 __kmp_free( procarr );
4044 procarr = NULL;
4045 }
4046}
4047
4048
4049void
4050__kmp_affinity_set_init_mask(int gtid, int isa_root)
4051{
4052 if (! KMP_AFFINITY_CAPABLE()) {
4053 return;
4054 }
4055
4056 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4057 if (th->th.th_affin_mask == NULL) {
4058 KMP_CPU_ALLOC(th->th.th_affin_mask);
4059 }
4060 else {
4061 KMP_CPU_ZERO(th->th.th_affin_mask);
4062 }
4063
4064 //
4065    // Copy the thread mask to the kmp_info_t structure.
4066 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4067 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
4068 // is set, then the full mask is the same as the mask of the initialization
4069 // thread.
4070 //
4071 kmp_affin_mask_t *mask;
4072 int i;
4073
4074# if OMP_40_ENABLED
4075 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4076# endif
4077 {
Andrey Churbanovf28f6132015-01-13 14:54:00 +00004078 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004079 ) {
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004080# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004081 if (__kmp_num_proc_groups > 1) {
4082 return;
4083 }
4084# endif
4085 KMP_ASSERT(fullMask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004086 i = KMP_PLACE_ALL;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004087 mask = fullMask;
4088 }
4089 else {
4090 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
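            // The mask index is chosen round-robin by gtid, rotated by the
            // affinity offset: e.g. with 4 masks and offset 1, gtids 0..3
            // would get masks 1, 2, 3, 0.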
4091 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4092 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4093 }
4094 }
4095# if OMP_40_ENABLED
4096 else {
4097 if ((! isa_root)
4098 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004099# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004100 if (__kmp_num_proc_groups > 1) {
4101 return;
4102 }
4103# endif
4104 KMP_ASSERT(fullMask != NULL);
4105 i = KMP_PLACE_ALL;
4106 mask = fullMask;
4107 }
4108 else {
4109 //
4110 // int i = some hash function or just a counter that doesn't
4111 // always start at 0. Use gtid for now.
4112 //
4113 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4114 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4115 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4116 }
4117 }
4118# endif
4119
4120# if OMP_40_ENABLED
4121 th->th.th_current_place = i;
4122 if (isa_root) {
4123 th->th.th_new_place = i;
4124 th->th.th_first_place = 0;
4125 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4126 }
4127
4128 if (i == KMP_PLACE_ALL) {
4129 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4130 gtid));
4131 }
4132 else {
4133 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4134 gtid, i));
4135 }
4136# else
4137 if (i == -1) {
4138 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4139 gtid));
4140 }
4141 else {
4142 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4143 gtid, i));
4144 }
4145# endif /* OMP_40_ENABLED */
4146
4147 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4148
4149 if (__kmp_affinity_verbose) {
4150 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4151 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4152 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004153 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4154 buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004155 }
4156
4157# if KMP_OS_WINDOWS
4158 //
4159 // On Windows* OS, the process affinity mask might have changed.
4160 // If the user didn't request affinity and this call fails,
4161 // just continue silently. See CQ171393.
4162 //
4163 if ( __kmp_affinity_type == affinity_none ) {
4164 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4165 }
4166 else
4167# endif
4168 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4169}
4170
4171
4172# if OMP_40_ENABLED
4173
4174void
4175__kmp_affinity_set_place(int gtid)
4176{
4177 int retval;
4178
4179 if (! KMP_AFFINITY_CAPABLE()) {
4180 return;
4181 }
4182
4183 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4184
4185 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4186 gtid, th->th.th_new_place, th->th.th_current_place));
4187
4188 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00004189 // Check that the new place is within this thread's partition.
Jim Cownie5e8470a2013-09-27 10:38:44 +00004190 //
4191 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004192 KMP_ASSERT(th->th.th_new_place >= 0);
4193 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004194 if (th->th.th_first_place <= th->th.th_last_place) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004195 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004196 && (th->th.th_new_place <= th->th.th_last_place));
4197 }
4198 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004199 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004200 || (th->th.th_new_place >= th->th.th_last_place));
4201 }
4202
4203 //
4204    // Copy the thread mask to the kmp_info_t structure,
4205 // and set this thread's affinity.
4206 //
4207 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4208 th->th.th_new_place);
4209 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4210 th->th.th_current_place = th->th.th_new_place;
4211
4212 if (__kmp_affinity_verbose) {
4213 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4214 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4215 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004216 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4217 gtid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004218 }
4219 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4220}
4221
4222# endif /* OMP_40_ENABLED */
4223
4224
4225int
4226__kmp_aux_set_affinity(void **mask)
4227{
4228 int gtid;
4229 kmp_info_t *th;
4230 int retval;
4231
4232 if (! KMP_AFFINITY_CAPABLE()) {
4233 return -1;
4234 }
4235
4236 gtid = __kmp_entry_gtid();
4237 KA_TRACE(1000, ;{
4238 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4239 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4240 (kmp_affin_mask_t *)(*mask));
4241 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4242 gtid, buf);
4243 });
4244
4245 if (__kmp_env_consistency_check) {
4246 if ((mask == NULL) || (*mask == NULL)) {
4247 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4248 }
4249 else {
4250 unsigned proc;
4251 int num_procs = 0;
4252
4253 for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4254 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4255 continue;
4256 }
4257 num_procs++;
4258 if (! KMP_CPU_ISSET(proc, fullMask)) {
4259 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4260 break;
4261 }
4262 }
4263 if (num_procs == 0) {
4264 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4265 }
4266
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004267# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004268 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4269 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4270 }
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004271# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +00004272
4273 }
4274 }
4275
4276 th = __kmp_threads[gtid];
4277 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4278 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4279 if (retval == 0) {
4280 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4281 }
4282
4283# if OMP_40_ENABLED
4284 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4285 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4286 th->th.th_first_place = 0;
4287 th->th.th_last_place = __kmp_affinity_num_masks - 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004288
4289 //
4290    // Turn off 4.0 affinity for the current thread at this parallel level.
4291 //
4292 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004293# endif
4294
4295 return retval;
4296}
4297
4298
4299int
4300__kmp_aux_get_affinity(void **mask)
4301{
4302 int gtid;
4303 int retval;
4304 kmp_info_t *th;
4305
4306 if (! KMP_AFFINITY_CAPABLE()) {
4307 return -1;
4308 }
4309
4310 gtid = __kmp_entry_gtid();
4311 th = __kmp_threads[gtid];
4312 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4313
4314 KA_TRACE(1000, ;{
4315 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4316 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4317 th->th.th_affin_mask);
4318 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4319 });
4320
4321 if (__kmp_env_consistency_check) {
4322 if ((mask == NULL) || (*mask == NULL)) {
4323 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4324 }
4325 }
4326
4327# if !KMP_OS_WINDOWS
4328
4329 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4330 KA_TRACE(1000, ;{
4331 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4332 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4333 (kmp_affin_mask_t *)(*mask));
4334 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4335 });
4336 return retval;
4337
4338# else
4339
4340 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4341 return 0;
4342
4343# endif /* KMP_OS_WINDOWS */
4344
4345}
4346
Jim Cownie5e8470a2013-09-27 10:38:44 +00004347int
4348__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4349{
4350 int retval;
4351
4352 if (! KMP_AFFINITY_CAPABLE()) {
4353 return -1;
4354 }
4355
4356 KA_TRACE(1000, ;{
4357 int gtid = __kmp_entry_gtid();
4358 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4359 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4360 (kmp_affin_mask_t *)(*mask));
4361 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4362 proc, gtid, buf);
4363 });
4364
4365 if (__kmp_env_consistency_check) {
4366 if ((mask == NULL) || (*mask == NULL)) {
4367 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4368 }
4369 }
4370
4371 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4372 return -1;
4373 }
4374 if (! KMP_CPU_ISSET(proc, fullMask)) {
4375 return -2;
4376 }
4377
4378 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4379 return 0;
4380}
4381
4382
4383int
4384__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4385{
4386 int retval;
4387
4388 if (! KMP_AFFINITY_CAPABLE()) {
4389 return -1;
4390 }
4391
4392 KA_TRACE(1000, ;{
4393 int gtid = __kmp_entry_gtid();
4394 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4395 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4396 (kmp_affin_mask_t *)(*mask));
4397 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4398 proc, gtid, buf);
4399 });
4400
4401 if (__kmp_env_consistency_check) {
4402 if ((mask == NULL) || (*mask == NULL)) {
4403 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4404 }
4405 }
4406
4407 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4408 return -1;
4409 }
4410 if (! KMP_CPU_ISSET(proc, fullMask)) {
4411 return -2;
4412 }
4413
4414 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4415 return 0;
4416}
4417
4418
4419int
4420__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4421{
4422 int retval;
4423
4424 if (! KMP_AFFINITY_CAPABLE()) {
4425 return -1;
4426 }
4427
4428 KA_TRACE(1000, ;{
4429 int gtid = __kmp_entry_gtid();
4430 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4431 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4432 (kmp_affin_mask_t *)(*mask));
4433 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4434 proc, gtid, buf);
4435 });
4436
4437 if (__kmp_env_consistency_check) {
4438 if ((mask == NULL) || (*mask == NULL)) {
Andrey Churbanov4b2f17a2015-01-29 15:49:22 +00004439 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
Jim Cownie5e8470a2013-09-27 10:38:44 +00004440 }
4441 }
4442
4443 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4444 return 0;
4445 }
4446 if (! KMP_CPU_ISSET(proc, fullMask)) {
4447 return 0;
4448 }
4449
4450 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4451}
4452
Jim Cownie5e8470a2013-09-27 10:38:44 +00004453
4454// Dynamic affinity settings - Affinity balanced
4455void __kmp_balanced_affinity( int tid, int nthreads )
4456{
4457 if( __kmp_affinity_uniform_topology() ) {
4458 int coreID;
4459 int threadID;
4460 // Number of hyper threads per core in HT machine
4461 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4462 // Number of cores
4463 int ncores = __kmp_ncores;
4464 // How many threads will be bound to each core
4465 int chunk = nthreads / ncores;
4466        // How many cores will have an additional thread bound to them ("big cores")
4467 int big_cores = nthreads % ncores;
4468 // Number of threads on the big cores
4469 int big_nth = ( chunk + 1 ) * big_cores;
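        // Illustrative example (hypothetical numbers): nthreads = 10 on 4 cores
        // gives chunk = 2, big_cores = 2, big_nth = 6, so tids 0-5 land on
        // cores 0-1 (3 per core) and tids 6-9 land on cores 2-3 (2 per core).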
4470 if( tid < big_nth ) {
4471 coreID = tid / (chunk + 1 );
4472 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4473 } else { //tid >= big_nth
4474 coreID = ( tid - big_cores ) / chunk;
4475 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4476 }
4477
4478 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4479 "Illegal set affinity operation when not capable");
4480
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00004481 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004482 KMP_CPU_ZERO(mask);
4483
4484 // Granularity == thread
4485 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4486 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4487 KMP_CPU_SET( osID, mask);
4488 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4489 for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4490 int osID;
4491 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4492 KMP_CPU_SET( osID, mask);
4493 }
4494 }
4495 if (__kmp_affinity_verbose) {
4496 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4497 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004498 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4499 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004500 }
4501 __kmp_set_system_affinity( mask, TRUE );
4502 } else { // Non-uniform topology
4503
Andrey Churbanov74bf17b2015-04-02 13:27:08 +00004504 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004505 KMP_CPU_ZERO(mask);
4506
4507 // Number of hyper threads per core in HT machine
4508 int nth_per_core = __kmp_nThreadsPerCore;
4509 int core_level;
4510 if( nth_per_core > 1 ) {
4511 core_level = __kmp_aff_depth - 2;
4512 } else {
4513 core_level = __kmp_aff_depth - 1;
4514 }
4515
4516        // Number of cores - maximum value; it does not count trailing cores with 0 processors
4517 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4518
4519 // For performance gain consider the special case nthreads == __kmp_avail_proc
4520 if( nthreads == __kmp_avail_proc ) {
4521 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4522 int osID = address2os[ tid ].second;
4523 KMP_CPU_SET( osID, mask);
4524 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4525 int coreID = address2os[ tid ].first.labels[ core_level ];
4526                // We'll count the osIDs found for the current core; there can be no more than nth_per_core of them;
4527                // since address2os is sorted, we can break when cnt == nth_per_core
4528 int cnt = 0;
4529 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4530 int osID = address2os[ i ].second;
4531 int core = address2os[ i ].first.labels[ core_level ];
4532 if( core == coreID ) {
4533 KMP_CPU_SET( osID, mask);
4534 cnt++;
4535 if( cnt == nth_per_core ) {
4536 break;
4537 }
4538 }
4539 }
4540 }
4541 } else if( nthreads <= __kmp_ncores ) {
4542
4543 int core = 0;
4544 for( int i = 0; i < ncores; i++ ) {
4545 // Check if this core from procarr[] is in the mask
4546 int in_mask = 0;
4547 for( int j = 0; j < nth_per_core; j++ ) {
4548 if( procarr[ i * nth_per_core + j ] != - 1 ) {
4549 in_mask = 1;
4550 break;
4551 }
4552 }
4553 if( in_mask ) {
4554 if( tid == core ) {
4555 for( int j = 0; j < nth_per_core; j++ ) {
4556 int osID = procarr[ i * nth_per_core + j ];
4557 if( osID != -1 ) {
4558 KMP_CPU_SET( osID, mask );
4559 // For granularity=thread it is enough to set the first available osID for this core
4560 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4561 break;
4562 }
4563 }
4564 }
4565 break;
4566 } else {
4567 core++;
4568 }
4569 }
4570 }
4571
4572 } else { // nthreads > __kmp_ncores
4573
4574 // Array to save the number of processors at each core
Jonathan Peyton7be075332015-06-22 15:53:50 +00004575 int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004576 // Array to save the number of cores with "x" available processors;
Jonathan Peyton7be075332015-06-22 15:53:50 +00004577 int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
Jim Cownie5e8470a2013-09-27 10:38:44 +00004578 // Array to save the number of cores with # procs from x to nth_per_core
Jonathan Peyton7be075332015-06-22 15:53:50 +00004579 int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
Jim Cownie5e8470a2013-09-27 10:38:44 +00004580
4581 for( int i = 0; i <= nth_per_core; i++ ) {
4582 ncores_with_x_procs[ i ] = 0;
4583 ncores_with_x_to_max_procs[ i ] = 0;
4584 }
4585
4586 for( int i = 0; i < ncores; i++ ) {
4587 int cnt = 0;
4588 for( int j = 0; j < nth_per_core; j++ ) {
4589 if( procarr[ i * nth_per_core + j ] != -1 ) {
4590 cnt++;
4591 }
4592 }
4593 nproc_at_core[ i ] = cnt;
4594 ncores_with_x_procs[ cnt ]++;
4595 }
4596
4597 for( int i = 0; i <= nth_per_core; i++ ) {
4598 for( int j = i; j <= nth_per_core; j++ ) {
4599 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4600 }
4601 }
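            // Now ncores_with_x_to_max_procs[ i ] is the number of cores that
            // have at least i usable thread contexts; it bounds each pass of
            // the distribution loop below.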
4602
4603 // Max number of processors
4604 int nproc = nth_per_core * ncores;
4605            // An array to keep the number of threads per context
4606 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4607 for( int i = 0; i < nproc; i++ ) {
4608 newarr[ i ] = 0;
4609 }
4610
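            // Distribute nthreads threads over the available contexts: the
            // first pass ( flag == 0 ) gives each free context at most one
            // thread; later passes ( flag == 1 ) stack extra threads onto
            // already used contexts when the team is oversubscribed.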
4611 int nth = nthreads;
4612 int flag = 0;
4613 while( nth > 0 ) {
4614 for( int j = 1; j <= nth_per_core; j++ ) {
4615 int cnt = ncores_with_x_to_max_procs[ j ];
4616 for( int i = 0; i < ncores; i++ ) {
4617 // Skip the core with 0 processors
4618 if( nproc_at_core[ i ] == 0 ) {
4619 continue;
4620 }
4621 for( int k = 0; k < nth_per_core; k++ ) {
4622 if( procarr[ i * nth_per_core + k ] != -1 ) {
4623 if( newarr[ i * nth_per_core + k ] == 0 ) {
4624 newarr[ i * nth_per_core + k ] = 1;
4625 cnt--;
4626 nth--;
4627 break;
4628 } else {
4629 if( flag != 0 ) {
4630 newarr[ i * nth_per_core + k ] ++;
4631 cnt--;
4632 nth--;
4633 break;
4634 }
4635 }
4636 }
4637 }
4638 if( cnt == 0 || nth == 0 ) {
4639 break;
4640 }
4641 }
4642 if( nth == 0 ) {
4643 break;
4644 }
4645 }
4646 flag = 1;
4647 }
4648 int sum = 0;
4649 for( int i = 0; i < nproc; i++ ) {
4650 sum += newarr[ i ];
4651 if( sum > tid ) {
4652 // Granularity == thread
4653 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4654 int osID = procarr[ i ];
4655 KMP_CPU_SET( osID, mask);
4656 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4657 int coreID = i / nth_per_core;
4658 for( int ii = 0; ii < nth_per_core; ii++ ) {
4659 int osID = procarr[ coreID * nth_per_core + ii ];
4660 if( osID != -1 ) {
4661 KMP_CPU_SET( osID, mask);
4662 }
4663 }
4664 }
4665 break;
4666 }
4667 }
4668 __kmp_free( newarr );
4669 }
4670
4671 if (__kmp_affinity_verbose) {
4672 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4673 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004674 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4675 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004676 }
4677 __kmp_set_system_affinity( mask, TRUE );
4678 }
4679}
4680
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004681#else
4682 // affinity not supported
4683
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +00004684static const kmp_uint32 noaff_maxLevels=7;
4685kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
4686kmp_uint32 noaff_depth;
4687kmp_uint8 noaff_leaf_kids;
4688kmp_int8 noaff_uninitialized=1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004689
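// With affinity support compiled out there is no real topology, so noaff_init()
// fabricates a plausible machine hierarchy for the hierarchical barrier: leaf
// nodes hold up to 4 threads, the upper levels are sized from nprocs, and the
// result is capped at noaff_maxLevels levels.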
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +00004690void noaff_init(int nprocs)
4691{
4692 kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
4693 if (result == 0) return; // Already initialized
4694 else if (result == 2) { // Someone else is initializing
4695 while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
4696 return;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004697 }
Andrey Churbanovaa1f2b62015-04-13 18:51:59 +00004698 KMP_DEBUG_ASSERT(result==1);
4699
4700 kmp_uint32 numPerLevel[noaff_maxLevels];
4701 noaff_depth = 1;
4702 for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4703 numPerLevel[i] = 1;
4704 noaff_skipPerLevel[i] = 1;
4705 }
4706
4707 numPerLevel[0] = 4;
4708 numPerLevel[1] = nprocs/4;
4709 if (nprocs%4) numPerLevel[1]++;
4710
4711 for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
4712 if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
4713 noaff_depth++;
4714
4715 kmp_uint32 branch = 4;
4716 if (numPerLevel[0] == 1) branch = nprocs/4;
4717 if (branch<4) branch=4;
4718 for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
4719 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4720 if (numPerLevel[d] & 1) numPerLevel[d]++;
4721 numPerLevel[d] = numPerLevel[d] >> 1;
4722 if (numPerLevel[d+1] == 1) noaff_depth++;
4723 numPerLevel[d+1] = numPerLevel[d+1] << 1;
4724 }
4725 if(numPerLevel[0] == 1) {
4726 branch = branch >> 1;
4727 if (branch<4) branch = 4;
4728 }
4729 }
4730
4731 for (kmp_uint32 i=1; i<noaff_depth; ++i)
4732 noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
4733 // Fill in hierarchy in the case of oversubscription
4734 for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
4735 noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
4736 noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4737 noaff_uninitialized = 0; // One writer
4738
4739}
4740
4741void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4742 if (noaff_uninitialized)
4743 noaff_init(nproc);
4744
4745 thr_bar->depth = noaff_depth;
4746 thr_bar->base_leaf_kids = noaff_leaf_kids;
4747 thr_bar->skip_per_level = noaff_skipPerLevel;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004748}
4749
Alp Toker763b9392014-02-28 09:42:41 +00004750#endif // KMP_AFFINITY_SUPPORTED