  1/*
  2 * kmp_affinity.cpp -- affinity management
  3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_i18n.h"
18#include "kmp_io.h"
19#include "kmp_str.h"
 20#include "kmp_wrapper_getpid.h"
 21
 22#if KMP_AFFINITY_SUPPORTED
 23
24//
25// Print the affinity mask to the character array in a pretty format.
26//
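// For example (illustrative), a mask containing procs 0, 1, 2 and 5 prints as
// "{0,1,2,5}", an empty mask prints as "{<empty>}", and a set too large for
// the buffer is truncated so that it ends with ",...}".
//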
27char *
28__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29{
30 KMP_ASSERT(buf_len >= 40);
31 char *scan = buf;
32 char *end = buf + buf_len - 1;
33
34 //
35 // Find first element / check for empty set.
36 //
37 size_t i;
38 for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39 if (KMP_CPU_ISSET(i, mask)) {
40 break;
41 }
42 }
43 if (i == KMP_CPU_SETSIZE) {
44 sprintf(scan, "{<empty>}");
45 while (*scan != '\0') scan++;
46 KMP_ASSERT(scan <= end);
47 return buf;
48 }
49
 50    sprintf(scan, "{%ld", (long)i);
 51    while (*scan != '\0') scan++;
52 i++;
53 for (; i < KMP_CPU_SETSIZE; i++) {
54 if (! KMP_CPU_ISSET(i, mask)) {
55 continue;
56 }
57
58 //
59 // Check for buffer overflow. A string of the form ",<n>" will have
60 // at most 10 characters, plus we want to leave room to print ",...}"
61 // if the set is too large to print for a total of 15 characters.
62 // We already left room for '\0' in setting end.
63 //
64 if (end - scan < 15) {
65 break;
66 }
 67        sprintf(scan, ",%-ld", (long)i);
 68        while (*scan != '\0') scan++;
69 }
70 if (i < KMP_CPU_SETSIZE) {
71 sprintf(scan, ",...");
72 while (*scan != '\0') scan++;
73 }
74 sprintf(scan, "}");
75 while (*scan != '\0') scan++;
76 KMP_ASSERT(scan <= end);
77 return buf;
78}
79
80
81void
82__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
83{
84 KMP_CPU_ZERO(mask);
85
 86# if KMP_GROUP_AFFINITY
 87
88 if (__kmp_num_proc_groups > 1) {
89 int group;
 90        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
91 for (group = 0; group < __kmp_num_proc_groups; group++) {
92 int i;
93 int num = __kmp_GetActiveProcessorCount(group);
94 for (i = 0; i < num; i++) {
95 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
96 }
97 }
98 }
99 else
100
101# endif /* KMP_GROUP_AFFINITY */
102
103 {
104 int proc;
105 for (proc = 0; proc < __kmp_xproc; proc++) {
106 KMP_CPU_SET(proc, mask);
107 }
108 }
109}
110
111
112//
113// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114// functions.
115//
116// The icc codegen emits sections with extremely long names, of the form
117// ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
118// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119// some sort of memory corruption or table overflow that is triggered by
120// these long strings. I checked the latest version of the linker -
121// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122// fixed.
123//
124// Unfortunately, my attempts to reproduce it in a smaller example have
125// failed - I'm not sure what the prospects are of getting it fixed
126// properly - but we need a reproducer smaller than all of libiomp.
127//
128// Work around the problem by avoiding inline constructors in such builds.
129// We do this for all platforms, not just Linux* OS - non-inline functions are
130// more debuggable and provide better coverage than inline functions.
131// Use inline functions in shipping libs, for performance.
132//
133
134# if !defined(KMP_DEBUG) && !defined(COVER)
135
136class Address {
137public:
138 static const unsigned maxDepth = 32;
139 unsigned labels[maxDepth];
140 unsigned childNums[maxDepth];
141 unsigned depth;
142 unsigned leader;
143 Address(unsigned _depth)
144 : depth(_depth), leader(FALSE) {
145 }
146 Address &operator=(const Address &b) {
147 depth = b.depth;
148 for (unsigned i = 0; i < depth; i++) {
149 labels[i] = b.labels[i];
150 childNums[i] = b.childNums[i];
151 }
152 leader = FALSE;
153 return *this;
154 }
155 bool operator==(const Address &b) const {
156 if (depth != b.depth)
157 return false;
158 for (unsigned i = 0; i < depth; i++)
159 if(labels[i] != b.labels[i])
160 return false;
161 return true;
162 }
163 bool isClose(const Address &b, int level) const {
164 if (depth != b.depth)
165 return false;
166 if ((unsigned)level >= depth)
167 return true;
168 for (unsigned i = 0; i < (depth - level); i++)
169 if(labels[i] != b.labels[i])
170 return false;
171 return true;
172 }
173 bool operator!=(const Address &b) const {
174 return !operator==(b);
175 }
176};
177
178class AddrUnsPair {
179public:
180 Address first;
181 unsigned second;
182 AddrUnsPair(Address _first, unsigned _second)
183 : first(_first), second(_second) {
184 }
185 AddrUnsPair &operator=(const AddrUnsPair &b)
186 {
187 first = b.first;
188 second = b.second;
189 return *this;
190 }
191};
192
193# else
194
195class Address {
196public:
197 static const unsigned maxDepth = 32;
198 unsigned labels[maxDepth];
199 unsigned childNums[maxDepth];
200 unsigned depth;
201 unsigned leader;
202 Address(unsigned _depth);
203 Address &operator=(const Address &b);
204 bool operator==(const Address &b) const;
205 bool isClose(const Address &b, int level) const;
206 bool operator!=(const Address &b) const;
207};
208
209Address::Address(unsigned _depth)
210{
211 depth = _depth;
212 leader = FALSE;
213}
214
215Address &Address::operator=(const Address &b) {
216 depth = b.depth;
217 for (unsigned i = 0; i < depth; i++) {
218 labels[i] = b.labels[i];
219 childNums[i] = b.childNums[i];
220 }
221 leader = FALSE;
222 return *this;
223}
224
225bool Address::operator==(const Address &b) const {
226 if (depth != b.depth)
227 return false;
228 for (unsigned i = 0; i < depth; i++)
229 if(labels[i] != b.labels[i])
230 return false;
231 return true;
232}
233
234bool Address::isClose(const Address &b, int level) const {
235 if (depth != b.depth)
236 return false;
237 if ((unsigned)level >= depth)
238 return true;
239 for (unsigned i = 0; i < (depth - level); i++)
240 if(labels[i] != b.labels[i])
241 return false;
242 return true;
243}
244
245bool Address::operator!=(const Address &b) const {
246 return !operator==(b);
247}
248
249class AddrUnsPair {
250public:
251 Address first;
252 unsigned second;
253 AddrUnsPair(Address _first, unsigned _second);
254 AddrUnsPair &operator=(const AddrUnsPair &b);
255};
256
257AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
258 : first(_first), second(_second)
259{
260}
261
262AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263{
264 first = b.first;
265 second = b.second;
266 return *this;
267}
268
269# endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270
271
272static int
273__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274{
275 const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276 ->first);
277 const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278 ->first);
279 unsigned depth = aa->depth;
280 unsigned i;
281 KMP_DEBUG_ASSERT(depth == bb->depth);
282 for (i = 0; i < depth; i++) {
283 if (aa->labels[i] < bb->labels[i]) return -1;
284 if (aa->labels[i] > bb->labels[i]) return 1;
285 }
286 return 0;
287}
288
289
290static int
291__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
292{
293 const Address *aa = (const Address *)&(((AddrUnsPair *)a)
294 ->first);
295 const Address *bb = (const Address *)&(((AddrUnsPair *)b)
296 ->first);
297 unsigned depth = aa->depth;
298 unsigned i;
299 KMP_DEBUG_ASSERT(depth == bb->depth);
300 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
301 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
302 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
303 int j = depth - i - 1;
304 if (aa->childNums[j] < bb->childNums[j]) return -1;
305 if (aa->childNums[j] > bb->childNums[j]) return 1;
306 }
307 for (; i < depth; i++) {
308 int j = i - __kmp_affinity_compact;
309 if (aa->childNums[j] < bb->childNums[j]) return -1;
310 if (aa->childNums[j] > bb->childNums[j]) return 1;
311 }
312 return 0;
313}
314
315/** A structure for holding machine-specific hierarchy info to be computed once at init. */
316class hierarchy_info {
317public:
318 /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
319 etc. We don't want to get specific with nomenclature */
320 static const kmp_uint32 maxLevels=7;
321
322 /** This is specifically the depth of the machine configuration hierarchy, in terms of the
323 number of levels along the longest path from root to any leaf. It corresponds to the
324 number of entries in numPerLevel if we exclude all but one trailing 1. */
325 kmp_uint32 depth;
326 kmp_uint32 base_depth;
327 kmp_uint32 base_num_threads;
328 bool uninitialized;
329
330 /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
331 node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
332 and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
333 kmp_uint32 numPerLevel[maxLevels];
334 kmp_uint32 skipPerLevel[maxLevels];
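    /** Note: skipPerLevel[i] is the number of leaves spanned by a subtree rooted at level i,
        i.e. the running product of numPerLevel[0..i-1] (computed in init() below). For the
        example above with numPerLevel = {2, 4, 4, 1, 1}, skipPerLevel = {1, 2, 8, 32, 32}. */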
335
336 void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
337 int hier_depth = adr2os[0].first.depth;
338 int level = 0;
339 for (int i=hier_depth-1; i>=0; --i) {
340 int max = -1;
341 for (int j=0; j<num_addrs; ++j) {
342 int next = adr2os[j].first.childNums[i];
343 if (next > max) max = next;
344 }
345 numPerLevel[level] = max+1;
346 ++level;
347 }
348 }
349
350 hierarchy_info() : depth(1), uninitialized(true) {}
351 void init(AddrUnsPair *adr2os, int num_addrs)
352 {
353 uninitialized = false;
354 for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
355 numPerLevel[i] = 1;
356 skipPerLevel[i] = 1;
357 }
358
359 // Sort table by physical ID
360 if (adr2os) {
361 qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
362 deriveLevels(adr2os, num_addrs);
363 }
364 else {
365 numPerLevel[0] = 4;
366 numPerLevel[1] = num_addrs/4;
367 if (num_addrs%4) numPerLevel[1]++;
368 }
369
370 base_num_threads = num_addrs;
371 for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
372 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
373 depth++;
374
375 kmp_uint32 branch = 4;
376 if (numPerLevel[0] == 1) branch = num_addrs/4;
377 if (branch<4) branch=4;
378 for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
379 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
380 if (numPerLevel[d] & 1) numPerLevel[d]++;
381 numPerLevel[d] = numPerLevel[d] >> 1;
382 if (numPerLevel[d+1] == 1) depth++;
383 numPerLevel[d+1] = numPerLevel[d+1] << 1;
384 }
385 if(numPerLevel[0] == 1) {
386 branch = branch >> 1;
387 if (branch<4) branch = 4;
388 }
389 }
390
391 for (kmp_uint32 i=1; i<depth; ++i)
392 skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
393
394 base_depth = depth;
395 }
396};
397
398static hierarchy_info machine_hierarchy;
399
400void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
401 if (machine_hierarchy.uninitialized)
402 machine_hierarchy.init(NULL, nproc);
403
404 if (nproc <= machine_hierarchy.base_num_threads)
405 machine_hierarchy.depth = machine_hierarchy.base_depth;
406 KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
407 while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
408 machine_hierarchy.depth++;
409 machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
410 }
411 thr_bar->depth = machine_hierarchy.depth;
412 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
413 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
414}
415
416//
417// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
418// called to renumber the labels from [0..n] and place them into the child_num
419// vector of the address object. This is done in case the labels used for
420// the children at one node of the hierarchy differ from those used for
421// another node at the same level.  Example: suppose the machine has 2 nodes
422// with 2 packages each. The first node contains packages 601 and 602, and
423 // the second node contains packages 603 and 604.  If we try to sort the table
424// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
425// because we are paying attention to the labels themselves, not the ordinal
426// child numbers. By using the child numbers in the sort, the result is
427// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
428//
429static void
430__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
431 int numAddrs)
432{
433 KMP_DEBUG_ASSERT(numAddrs > 0);
434 int depth = address2os->first.depth;
435 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
436 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
437 * sizeof(unsigned));
438 int labCt;
439 for (labCt = 0; labCt < depth; labCt++) {
440 address2os[0].first.childNums[labCt] = counts[labCt] = 0;
441 lastLabel[labCt] = address2os[0].first.labels[labCt];
442 }
443 int i;
444 for (i = 1; i < numAddrs; i++) {
445 for (labCt = 0; labCt < depth; labCt++) {
446 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
447 int labCt2;
448 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
449 counts[labCt2] = 0;
450 lastLabel[labCt2] = address2os[i].first.labels[labCt2];
451 }
452 counts[labCt]++;
453 lastLabel[labCt] = address2os[i].first.labels[labCt];
454 break;
455 }
456 }
457 for (labCt = 0; labCt < depth; labCt++) {
458 address2os[i].first.childNums[labCt] = counts[labCt];
459 }
460 for (; labCt < (int)Address::maxDepth; labCt++) {
461 address2os[i].first.childNums[labCt] = 0;
462 }
463 }
464}
465
466
467//
468// All of the __kmp_affinity_create_*_map() routines should set
469// __kmp_affinity_masks to a vector of affinity mask objects of length
470// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
471// return the number of levels in the machine topology tree (zero if
472// __kmp_affinity_type == affinity_none).
473//
474// All of the __kmp_affinity_create_*_map() routines should set *fullMask
475// to the affinity mask for the initialization thread. They need to save and
476// restore the mask, and it could be needed later, so saving it is just an
477// optimization to avoid calling kmp_get_system_affinity() again.
478//
479static kmp_affin_mask_t *fullMask = NULL;
480
481kmp_affin_mask_t *
482__kmp_affinity_get_fullMask() { return fullMask; }
483
484
485static int nCoresPerPkg, nPackages;
486static int __kmp_nThreadsPerCore;
487#ifndef KMP_DFLT_NTH_CORES
488static int __kmp_ncores;
489#endif
490
491//
492// __kmp_affinity_uniform_topology() doesn't work when called from
493// places which support arbitrarily many levels in the machine topology
494// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
495 // and __kmp_affinity_create_x2apicid_map().
496//
497inline static bool
498__kmp_affinity_uniform_topology()
499{
500 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
501}
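// For example (illustrative numbers): with 2 packages x 4 cores/package x 2
// threads/core, the topology is reported as uniform when __kmp_avail_proc == 16;
// if some procs are excluded from the machine model, the product no longer
// matches and this returns false.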
502
503
504//
505// Print out the detailed machine topology map, i.e. the physical locations
506// of each OS proc.
507//
508static void
509__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
510 int pkgLevel, int coreLevel, int threadLevel)
511{
512 int proc;
513
514 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
515 for (proc = 0; proc < len; proc++) {
516 int level;
517 kmp_str_buf_t buf;
518 __kmp_str_buf_init(&buf);
519 for (level = 0; level < depth; level++) {
520 if (level == threadLevel) {
521 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
522 }
523 else if (level == coreLevel) {
524 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
525 }
526 else if (level == pkgLevel) {
527 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
528 }
529 else if (level > pkgLevel) {
530 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
531 level - pkgLevel - 1);
532 }
533 else {
534 __kmp_str_buf_print(&buf, "L%d ", level);
535 }
536 __kmp_str_buf_print(&buf, "%d ",
537 address2os[proc].first.labels[level]);
538 }
539 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
540 buf.str);
541 __kmp_str_buf_free(&buf);
542 }
543}
544
545
546//
547// If we don't know how to retrieve the machine's processor topology, or
548// encounter an error in doing so, this routine is called to form a "flat"
549// mapping of os thread id's <-> processor id's.
550//
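// Each OS proc then becomes its own depth-1 Address; e.g. if procs {0,1,2,3} are
// in the machine model, the map is simply labels[0]=0 -> 0, labels[0]=1 -> 1, etc.
//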
551static int
552__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
553 kmp_i18n_id_t *const msg_id)
554{
555 *address2os = NULL;
556 *msg_id = kmp_i18n_null;
557
558 //
559    // Even if __kmp_affinity_type == affinity_none, this routine might still
560    // be called to set __kmp_ncores, as well as
561    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
562 //
563 if (! KMP_AFFINITY_CAPABLE()) {
564 KMP_ASSERT(__kmp_affinity_type == affinity_none);
565 __kmp_ncores = nPackages = __kmp_xproc;
566 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
567    if (__kmp_affinity_verbose) {
568 KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
569 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
570 KMP_INFORM(Uniform, "KMP_AFFINITY");
571 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
572 __kmp_nThreadsPerCore, __kmp_ncores);
573 }
574 return 0;
575 }
576
577 //
578 // When affinity is off, this routine will still be called to set
579    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
580    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
581 // correctly, and return now if affinity is not enabled.
582 //
583 __kmp_ncores = nPackages = __kmp_avail_proc;
584 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
585    if (__kmp_affinity_verbose) {
586 char buf[KMP_AFFIN_MASK_PRINT_LEN];
587 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
588
589 KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
590 if (__kmp_affinity_respect_mask) {
591 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
592 } else {
593 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
594 }
595 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
596 KMP_INFORM(Uniform, "KMP_AFFINITY");
597 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
598 __kmp_nThreadsPerCore, __kmp_ncores);
599 }
600 if (__kmp_affinity_type == affinity_none) {
601 return 0;
602 }
603
604 //
605    // Construct the data structure to be returned.
606 //
607 *address2os = (AddrUnsPair*)
608 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
609 int avail_ct = 0;
610 unsigned int i;
611 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
612 //
613 // Skip this proc if it is not included in the machine model.
614 //
615 if (! KMP_CPU_ISSET(i, fullMask)) {
616 continue;
617 }
618
619 Address addr(1);
620 addr.labels[0] = i;
621 (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
622 }
623 if (__kmp_affinity_verbose) {
624 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
625 }
626
627 if (__kmp_affinity_gran_levels < 0) {
628 //
629 // Only the package level is modeled in the machine topology map,
630 // so the #levels of granularity is either 0 or 1.
631 //
632 if (__kmp_affinity_gran > affinity_gran_package) {
633 __kmp_affinity_gran_levels = 1;
634 }
635 else {
636 __kmp_affinity_gran_levels = 0;
637 }
638 }
639 return 1;
640}
641
642
643# if KMP_GROUP_AFFINITY
644
645//
646// If multiple Windows* OS processor groups exist, we can create a 2-level
647// topology map with the groups at level 0 and the individual procs at
648// level 1.
649//
650// This facilitates letting the threads float among all procs in a group,
651// if granularity=group (the default when there are multiple groups).
652//
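// For example, on a 64-bit Windows* OS system (64 procs per group), OS proc 70
// would get the depth-2 address {group 1, proc-in-group 6}.
//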
653static int
654__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
655 kmp_i18n_id_t *const msg_id)
656{
657 *address2os = NULL;
658 *msg_id = kmp_i18n_null;
659
660 //
661 // If we don't have multiple processor groups, return now.
662 // The flat mapping will be used.
663 //
664 if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
665 // FIXME set *msg_id
666 return -1;
667 }
668
669 //
670    // Construct the data structure to be returned.
671 //
672 *address2os = (AddrUnsPair*)
673 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
674 int avail_ct = 0;
675 int i;
676 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
677 //
678 // Skip this proc if it is not included in the machine model.
679 //
680 if (! KMP_CPU_ISSET(i, fullMask)) {
681 continue;
682 }
683
684 Address addr(2);
685 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
686 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
687 (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
688
689 if (__kmp_affinity_verbose) {
690 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
691 addr.labels[1]);
692 }
693 }
694
695 if (__kmp_affinity_gran_levels < 0) {
696 if (__kmp_affinity_gran == affinity_gran_group) {
697 __kmp_affinity_gran_levels = 1;
698 }
699 else if ((__kmp_affinity_gran == affinity_gran_fine)
700 || (__kmp_affinity_gran == affinity_gran_thread)) {
701 __kmp_affinity_gran_levels = 0;
702 }
703 else {
704 const char *gran_str = NULL;
705 if (__kmp_affinity_gran == affinity_gran_core) {
706 gran_str = "core";
707 }
708 else if (__kmp_affinity_gran == affinity_gran_package) {
709 gran_str = "package";
710 }
711 else if (__kmp_affinity_gran == affinity_gran_node) {
712 gran_str = "node";
713 }
714 else {
715 KMP_ASSERT(0);
716 }
717
718 // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
719 __kmp_affinity_gran_levels = 0;
720 }
721 }
722 return 2;
723}
724
725# endif /* KMP_GROUP_AFFINITY */
726
727
728# if KMP_ARCH_X86 || KMP_ARCH_X86_64
729
730static int
731__kmp_cpuid_mask_width(int count) {
732 int r = 0;
733
734 while((1<<r) < count)
735 ++r;
736 return r;
737}
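// __kmp_cpuid_mask_width(count) returns the smallest r such that (1 << r) >= count,
// i.e. the number of bits needed to encode "count" distinct ids; e.g. 1 -> 0,
// 2 -> 1, 3 -> 2, 8 -> 3.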
738
739
740class apicThreadInfo {
741public:
742 unsigned osId; // param to __kmp_affinity_bind_thread
743 unsigned apicId; // from cpuid after binding
744 unsigned maxCoresPerPkg; // ""
745 unsigned maxThreadsPerPkg; // ""
746 unsigned pkgId; // inferred from above values
747 unsigned coreId; // ""
748 unsigned threadId; // ""
749};
750
751
752static int
753__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
754{
755 const apicThreadInfo *aa = (const apicThreadInfo *)a;
756 const apicThreadInfo *bb = (const apicThreadInfo *)b;
757 if (aa->osId < bb->osId) return -1;
758 if (aa->osId > bb->osId) return 1;
759 return 0;
760}
761
762
763static int
764__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
765{
766 const apicThreadInfo *aa = (const apicThreadInfo *)a;
767 const apicThreadInfo *bb = (const apicThreadInfo *)b;
768 if (aa->pkgId < bb->pkgId) return -1;
769 if (aa->pkgId > bb->pkgId) return 1;
770 if (aa->coreId < bb->coreId) return -1;
771 if (aa->coreId > bb->coreId) return 1;
772 if (aa->threadId < bb->threadId) return -1;
773 if (aa->threadId > bb->threadId) return 1;
774 return 0;
775}
776
777
778//
779// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
780// an algorithm which cycles through the available os threads, setting
781// the current thread's affinity mask to that thread, and then retrieves
782// the Apic Id for each thread context using the cpuid instruction.
783//
784static int
785__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
786 kmp_i18n_id_t *const msg_id)
787{
788    kmp_cpuid buf;
789    int rc;
790 *address2os = NULL;
791 *msg_id = kmp_i18n_null;
792
793    //
794    // Check if cpuid leaf 4 is supported.
795    //
796    __kmp_x86_cpuid(0, 0, &buf);
797 if (buf.eax < 4) {
798 *msg_id = kmp_i18n_str_NoLeaf4Support;
799 return -1;
800 }
801
802 //
803    // The algorithm used starts by setting the affinity to each available
804    // thread and retrieving info from the cpuid instruction, so if we are
805    // not capable of calling __kmp_get_system_affinity() and
806    // __kmp_set_system_affinity(), then we need to do something else - use
807 // the defaults that we calculated from issuing cpuid without binding
808 // to each proc.
809    //
810 if (! KMP_AFFINITY_CAPABLE()) {
811 //
812 // Hack to try and infer the machine topology using only the data
813 // available from cpuid on the current thread, and __kmp_xproc.
814 //
815 KMP_ASSERT(__kmp_affinity_type == affinity_none);
816
817 //
818 // Get an upper bound on the number of threads per package using
819 // cpuid(1).
820 //
821        // On some OS/chip combinations where HT is supported by the chip
822 // but is disabled, this value will be 2 on a single core chip.
823 // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
824 //
825        __kmp_x86_cpuid(1, 0, &buf);
826 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
827 if (maxThreadsPerPkg == 0) {
828 maxThreadsPerPkg = 1;
829 }
830
831 //
832 // The num cores per pkg comes from cpuid(4).
833 // 1 must be added to the encoded value.
834 //
835        // The author of cpu_count.cpp treated this as only an upper bound
836 // on the number of cores, but I haven't seen any cases where it
837 // was greater than the actual number of cores, so we will treat
838 // it as exact in this block of code.
839 //
840 // First, we need to check if cpuid(4) is supported on this chip.
841 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
842 // has the value n or greater.
843 //
844 __kmp_x86_cpuid(0, 0, &buf);
845 if (buf.eax >= 4) {
846 __kmp_x86_cpuid(4, 0, &buf);
847 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
848 }
849 else {
850 nCoresPerPkg = 1;
851 }
852
853 //
854 // There is no way to reliably tell if HT is enabled without issuing
855        // the cpuid instruction from every thread, and correlating the cpuid
856 // info, so if the machine is not affinity capable, we assume that HT
857 // is off. We have seen quite a few machines where maxThreadsPerPkg
858 // is 2, yet the machine does not support HT.
859 //
860 // - Older OSes are usually found on machines with older chips, which
861 // do not support HT.
862 //
863 // - The performance penalty for mistakenly identifying a machine as
864        //   HT when it isn't (which results in blocktime being incorrectly set
865        //   to 0) is greater than the penalty for mistakenly identifying
866 // a machine as being 1 thread/core when it is really HT enabled
867 // (which results in blocktime being incorrectly set to a positive
868 // value).
869 //
870 __kmp_ncores = __kmp_xproc;
871 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
872 __kmp_nThreadsPerCore = 1;
873        if (__kmp_affinity_verbose) {
874 KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
875 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
876 if (__kmp_affinity_uniform_topology()) {
877 KMP_INFORM(Uniform, "KMP_AFFINITY");
878 } else {
879 KMP_INFORM(NonUniform, "KMP_AFFINITY");
880 }
881 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
882 __kmp_nThreadsPerCore, __kmp_ncores);
883 }
884 return 0;
885 }
886
887 //
888 //
889 // From here on, we can assume that it is safe to call
890 // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
891 // even if __kmp_affinity_type = affinity_none.
892 //
893
894 //
895 // Save the affinity mask for the current thread.
896 //
897 kmp_affin_mask_t *oldMask;
898 KMP_CPU_ALLOC(oldMask);
899 KMP_ASSERT(oldMask != NULL);
900 __kmp_get_system_affinity(oldMask, TRUE);
901
902 //
903 // Run through each of the available contexts, binding the current thread
904 // to it, and obtaining the pertinent information using the cpuid instr.
905 //
906 // The relevant information is:
907 //
908 // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
909    //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
910 //
911 // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
912 // value of this field determines the width of the core# + thread#
913 // fields in the Apic Id. It is also an upper bound on the number
914 // of threads per package, but it has been verified that situations
915    //   happen where it is not exact.  In particular, on certain OS/chip
916 // combinations where Intel(R) Hyper-Threading Technology is supported
917 // by the chip but has
918 // been disabled, the value of this field will be 2 (for a single core
919 // chip). On other OS/chip combinations supporting
920 // Intel(R) Hyper-Threading Technology, the value of
921 // this field will be 1 when Intel(R) Hyper-Threading Technology is
922 // disabled and 2 when it is enabled.
923 //
924 // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
925 // value of this field (+1) determines the width of the core# field in
926 // the Apic Id. The comments in "cpucount.cpp" say that this value is
927 // an upper bound, but the IA-32 architecture manual says that it is
928 // exactly the number of cores per package, and I haven't seen any
929 // case where it wasn't.
930 //
931 // From this information, deduce the package Id, core Id, and thread Id,
932 // and set the corresponding fields in the apicThreadInfo struct.
933 //
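    // Illustrative example (hypothetical values): with maxThreadsPerPkg = 16
    // (widthCT = 4) and maxCoresPerPkg = 8 (widthC = 3, so widthT = 1), an
    // Apic Id of 0x35 decodes to pkgId = 0x35 >> 4 = 3,
    // coreId = (0x35 >> 1) & 0x7 = 2, and threadId = 0x35 & 0x1 = 1.
    //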
934 unsigned i;
935 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
936 __kmp_avail_proc * sizeof(apicThreadInfo));
937 unsigned nApics = 0;
938 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
939 //
940 // Skip this proc if it is not included in the machine model.
941 //
942 if (! KMP_CPU_ISSET(i, fullMask)) {
943 continue;
944 }
945 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
946
947 __kmp_affinity_bind_thread(i);
948 threadInfo[nApics].osId = i;
949
950 //
951 // The apic id and max threads per pkg come from cpuid(1).
952 //
953        __kmp_x86_cpuid(1, 0, &buf);
954        if (((buf.edx >> 9) & 1) == 0) {   // the APIC feature flag (edx bit 9) must be set
955 __kmp_set_system_affinity(oldMask, TRUE);
956 __kmp_free(threadInfo);
957 KMP_CPU_FREE(oldMask);
958 *msg_id = kmp_i18n_str_ApicNotPresent;
959 return -1;
960 }
961 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
962 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
963 if (threadInfo[nApics].maxThreadsPerPkg == 0) {
964 threadInfo[nApics].maxThreadsPerPkg = 1;
965 }
966
967 //
968 // Max cores per pkg comes from cpuid(4).
969 // 1 must be added to the encoded value.
970 //
971 // First, we need to check if cpuid(4) is supported on this chip.
972 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
973 // has the value n or greater.
974 //
975 __kmp_x86_cpuid(0, 0, &buf);
976 if (buf.eax >= 4) {
977 __kmp_x86_cpuid(4, 0, &buf);
978 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
979 }
980 else {
981 threadInfo[nApics].maxCoresPerPkg = 1;
982 }
983
984 //
985 // Infer the pkgId / coreId / threadId using only the info
986 // obtained locally.
987 //
988 int widthCT = __kmp_cpuid_mask_width(
989 threadInfo[nApics].maxThreadsPerPkg);
990 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
991
992 int widthC = __kmp_cpuid_mask_width(
993 threadInfo[nApics].maxCoresPerPkg);
994 int widthT = widthCT - widthC;
995 if (widthT < 0) {
996 //
997 // I've never seen this one happen, but I suppose it could, if
998 // the cpuid instruction on a chip was really screwed up.
999 // Make sure to restore the affinity mask before the tail call.
1000 //
1001 __kmp_set_system_affinity(oldMask, TRUE);
1002 __kmp_free(threadInfo);
1003 KMP_CPU_FREE(oldMask);
1004 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1005 return -1;
1006 }
1007
1008 int maskC = (1 << widthC) - 1;
1009 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1010 &maskC;
1011
1012 int maskT = (1 << widthT) - 1;
1013 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1014
1015 nApics++;
1016 }
1017
1018 //
1019 // We've collected all the info we need.
1020 // Restore the old affinity mask for this thread.
1021 //
1022 __kmp_set_system_affinity(oldMask, TRUE);
1023
1024 //
1025 // If there's only one thread context to bind to, form an Address object
1026 // with depth 1 and return immediately (or, if affinity is off, set
1027 // address2os to NULL and return).
1028 //
1029 // If it is configured to omit the package level when there is only a
1030 // single package, the logic at the end of this routine won't work if
1031 // there is only a single thread - it would try to form an Address
1032 // object with depth 0.
1033 //
1034 KMP_ASSERT(nApics > 0);
1035 if (nApics == 1) {
1036 __kmp_ncores = nPackages = 1;
1037 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1038        if (__kmp_affinity_verbose) {
1039 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1040 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1041
1042 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1043 if (__kmp_affinity_respect_mask) {
1044 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1045 } else {
1046 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1047 }
1048 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1049 KMP_INFORM(Uniform, "KMP_AFFINITY");
1050 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1051 __kmp_nThreadsPerCore, __kmp_ncores);
1052 }
1053
1054 if (__kmp_affinity_type == affinity_none) {
1055 __kmp_free(threadInfo);
1056 KMP_CPU_FREE(oldMask);
1057 return 0;
1058 }
1059
1060 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1061 Address addr(1);
1062 addr.labels[0] = threadInfo[0].pkgId;
1063 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1064
1065 if (__kmp_affinity_gran_levels < 0) {
1066 __kmp_affinity_gran_levels = 0;
1067 }
1068
1069 if (__kmp_affinity_verbose) {
1070 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1071 }
1072
1073 __kmp_free(threadInfo);
1074 KMP_CPU_FREE(oldMask);
1075 return 1;
1076 }
1077
1078 //
1079 // Sort the threadInfo table by physical Id.
1080 //
1081 qsort(threadInfo, nApics, sizeof(*threadInfo),
1082 __kmp_affinity_cmp_apicThreadInfo_phys_id);
1083
1084 //
1085 // The table is now sorted by pkgId / coreId / threadId, but we really
1086 // don't know the radix of any of the fields. pkgId's may be sparsely
1087 // assigned among the chips on a system. Although coreId's are usually
1088 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1089 // [0..threadsPerCore-1], we don't want to make any such assumptions.
1090 //
1091 // For that matter, we don't know what coresPerPkg and threadsPerCore
1092 // (or the total # packages) are at this point - we want to determine
1093 // that now. We only have an upper bound on the first two figures.
1094 //
1095 // We also perform a consistency check at this point: the values returned
1096 // by the cpuid instruction for any thread bound to a given package had
1097 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1098 //
1099 nPackages = 1;
1100 nCoresPerPkg = 1;
1101 __kmp_nThreadsPerCore = 1;
1102 unsigned nCores = 1;
1103
1104 unsigned pkgCt = 1; // to determine radii
1105 unsigned lastPkgId = threadInfo[0].pkgId;
1106 unsigned coreCt = 1;
1107 unsigned lastCoreId = threadInfo[0].coreId;
1108 unsigned threadCt = 1;
1109 unsigned lastThreadId = threadInfo[0].threadId;
1110
1111    // intra-pkg consistency checks
1112 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1113 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1114
1115 for (i = 1; i < nApics; i++) {
1116 if (threadInfo[i].pkgId != lastPkgId) {
1117 nCores++;
1118 pkgCt++;
1119 lastPkgId = threadInfo[i].pkgId;
1120 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1121 coreCt = 1;
1122 lastCoreId = threadInfo[i].coreId;
1123 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1124 threadCt = 1;
1125 lastThreadId = threadInfo[i].threadId;
1126
1127 //
1128 // This is a different package, so go on to the next iteration
1129 // without doing any consistency checks. Reset the consistency
1130 // check vars, though.
1131 //
1132 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1133 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1134 continue;
1135 }
1136
1137 if (threadInfo[i].coreId != lastCoreId) {
1138 nCores++;
1139 coreCt++;
1140 lastCoreId = threadInfo[i].coreId;
1141 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1142 threadCt = 1;
1143 lastThreadId = threadInfo[i].threadId;
1144 }
1145 else if (threadInfo[i].threadId != lastThreadId) {
1146 threadCt++;
1147 lastThreadId = threadInfo[i].threadId;
1148 }
1149 else {
1150 __kmp_free(threadInfo);
1151 KMP_CPU_FREE(oldMask);
1152 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1153 return -1;
1154 }
1155
1156 //
1157 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1158        // fields agree between all the threads bound to a given package.
1159 //
1160 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1161 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1162 __kmp_free(threadInfo);
1163 KMP_CPU_FREE(oldMask);
1164 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1165 return -1;
1166 }
1167 }
1168 nPackages = pkgCt;
1169 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1170 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1171
1172 //
1173 // When affinity is off, this routine will still be called to set
1174    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1175    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1176 // correctly, and return now if affinity is not enabled.
1177 //
1178    __kmp_ncores = nCores;
1179 if (__kmp_affinity_verbose) {
1180 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1181 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1182
1183 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1184 if (__kmp_affinity_respect_mask) {
1185 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1186 } else {
1187 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1188 }
1189 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1190 if (__kmp_affinity_uniform_topology()) {
1191 KMP_INFORM(Uniform, "KMP_AFFINITY");
1192 } else {
1193 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1194 }
1195 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1196 __kmp_nThreadsPerCore, __kmp_ncores);
1197
1198 }
1199
1200 if (__kmp_affinity_type == affinity_none) {
1201 __kmp_free(threadInfo);
1202 KMP_CPU_FREE(oldMask);
1203 return 0;
1204 }
1205
1206 //
1207 // Now that we've determined the number of packages, the number of cores
1208 // per package, and the number of threads per core, we can construct the
1209 // data structure that is to be returned.
1210 //
1211 int pkgLevel = 0;
1212 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1213 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1214 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1215
1216 KMP_ASSERT(depth > 0);
1217 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1218
1219 for (i = 0; i < nApics; ++i) {
1220 Address addr(depth);
1221 unsigned os = threadInfo[i].osId;
1222 int d = 0;
1223
1224 if (pkgLevel >= 0) {
1225 addr.labels[d++] = threadInfo[i].pkgId;
1226 }
1227 if (coreLevel >= 0) {
1228 addr.labels[d++] = threadInfo[i].coreId;
1229 }
1230 if (threadLevel >= 0) {
1231 addr.labels[d++] = threadInfo[i].threadId;
1232 }
1233 (*address2os)[i] = AddrUnsPair(addr, os);
1234 }
1235
1236 if (__kmp_affinity_gran_levels < 0) {
1237 //
1238 // Set the granularity level based on what levels are modeled
1239 // in the machine topology map.
1240 //
1241 __kmp_affinity_gran_levels = 0;
1242 if ((threadLevel >= 0)
1243 && (__kmp_affinity_gran > affinity_gran_thread)) {
1244 __kmp_affinity_gran_levels++;
1245 }
1246 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1247 __kmp_affinity_gran_levels++;
1248 }
1249 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1250 __kmp_affinity_gran_levels++;
1251 }
1252 }
1253
1254 if (__kmp_affinity_verbose) {
1255 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1256 coreLevel, threadLevel);
1257 }
1258
1259 __kmp_free(threadInfo);
1260 KMP_CPU_FREE(oldMask);
1261 return depth;
1262}
1263
1264
1265//
1266// Intel(R) microarchitecture code name Nehalem, Dunnington and later
1267// architectures support a newer interface for specifying the x2APIC Ids,
1268// based on cpuid leaf 11.
1269//
1270static int
1271__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1272 kmp_i18n_id_t *const msg_id)
1273{
1274 kmp_cpuid buf;
1275
1276 *address2os = NULL;
1277 *msg_id = kmp_i18n_null;
1278
1279 //
1280 // Check to see if cpuid leaf 11 is supported.
1281 //
1282 __kmp_x86_cpuid(0, 0, &buf);
1283 if (buf.eax < 11) {
1284 *msg_id = kmp_i18n_str_NoLeaf11Support;
1285 return -1;
1286 }
1287 __kmp_x86_cpuid(11, 0, &buf);
1288 if (buf.ebx == 0) {
1289 *msg_id = kmp_i18n_str_NoLeaf11Support;
1290 return -1;
1291 }
1292
1293 //
1294 // Find the number of levels in the machine topology. While we're at it,
1295 // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1296 // try to get more accurate values later by explicitly counting them,
1297 // but get reasonable defaults now, in case we return early.
1298 //
1299 int level;
1300 int threadLevel = -1;
1301 int coreLevel = -1;
1302 int pkgLevel = -1;
1303 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1304
1305 for (level = 0;; level++) {
1306 if (level > 31) {
1307 //
1308 // FIXME: Hack for DPD200163180
1309 //
1310 // If level is big then something went wrong -> exiting
1311 //
1312 // There could actually be 32 valid levels in the machine topology,
1313 // but so far, the only machine we have seen which does not exit
1314 // this loop before iteration 32 has fubar x2APIC settings.
1315 //
1316 // For now, just reject this case based upon loop trip count.
1317 //
1318 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1319 return -1;
1320 }
1321 __kmp_x86_cpuid(11, level, &buf);
1322 if (buf.ebx == 0) {
1323 if (pkgLevel < 0) {
1324 //
1325 // Will infer nPackages from __kmp_xproc
1326 //
1327 pkgLevel = level;
1328 level++;
1329 }
1330 break;
1331 }
1332 int kind = (buf.ecx >> 8) & 0xff;
1333 if (kind == 1) {
1334 //
1335 // SMT level
1336 //
1337 threadLevel = level;
1338 coreLevel = -1;
1339 pkgLevel = -1;
1340 __kmp_nThreadsPerCore = buf.ebx & 0xff;
1341 if (__kmp_nThreadsPerCore == 0) {
1342 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1343 return -1;
1344 }
1345 }
1346 else if (kind == 2) {
1347 //
1348 // core level
1349 //
1350 coreLevel = level;
1351 pkgLevel = -1;
1352 nCoresPerPkg = buf.ebx & 0xff;
1353 if (nCoresPerPkg == 0) {
1354 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1355 return -1;
1356 }
1357 }
1358 else {
1359 if (level <= 0) {
1360 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1361 return -1;
1362 }
1363 if (pkgLevel >= 0) {
1364 continue;
1365 }
1366 pkgLevel = level;
1367 nPackages = buf.ebx & 0xff;
1368 if (nPackages == 0) {
1369 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1370 return -1;
1371 }
1372 }
1373 }
1374 int depth = level;
1375
1376 //
1377 // In the above loop, "level" was counted from the finest level (usually
1378 // thread) to the coarsest. The caller expects that we will place the
1379 // labels in (*address2os)[].first.labels[] in the inverse order, so
1380 // we need to invert the vars saying which level means what.
1381 //
1382 if (threadLevel >= 0) {
1383 threadLevel = depth - threadLevel - 1;
1384 }
1385 if (coreLevel >= 0) {
1386 coreLevel = depth - coreLevel - 1;
1387 }
1388 KMP_DEBUG_ASSERT(pkgLevel >= 0);
1389 pkgLevel = depth - pkgLevel - 1;
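    // For example, if leaf 11 enumerated the levels as thread = 0, core = 1,
    // package = 2 (depth = 3), then after the inversion above pkgLevel = 0,
    // coreLevel = 1, and threadLevel = 2, matching the order of the labels
    // stored below.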
1390
1391 //
1392 // The algorithm used starts by setting the affinity to each available
1393    // thread and retrieving info from the cpuid instruction, so if we are
1394    // not capable of calling __kmp_get_system_affinity() and
1395    // __kmp_set_system_affinity(), then we need to do something else - use
1396 // the defaults that we calculated from issuing cpuid without binding
1397 // to each proc.
1398    //
1399 if (! KMP_AFFINITY_CAPABLE())
1400 {
1401 //
1402 // Hack to try and infer the machine topology using only the data
1403 // available from cpuid on the current thread, and __kmp_xproc.
1404 //
1405 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1406
1407 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1408 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1409        if (__kmp_affinity_verbose) {
1410 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1411 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1412 if (__kmp_affinity_uniform_topology()) {
1413 KMP_INFORM(Uniform, "KMP_AFFINITY");
1414 } else {
1415 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1416 }
1417 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1418 __kmp_nThreadsPerCore, __kmp_ncores);
1419 }
1420 return 0;
1421 }
1422
1423 //
1424 //
1425 // From here on, we can assume that it is safe to call
1426 // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1427 // even if __kmp_affinity_type = affinity_none.
1428 //
1429
1430 //
1431 // Save the affinity mask for the current thread.
1432 //
1433 kmp_affin_mask_t *oldMask;
1434 KMP_CPU_ALLOC(oldMask);
1435 __kmp_get_system_affinity(oldMask, TRUE);
1436
1437 //
1438 // Allocate the data structure to be returned.
1439 //
1440 AddrUnsPair *retval = (AddrUnsPair *)
1441 __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1442
1443 //
1444 // Run through each of the available contexts, binding the current thread
1445 // to it, and obtaining the pertinent information using the cpuid instr.
1446 //
1447 unsigned int proc;
1448 int nApics = 0;
1449 for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1450 //
1451 // Skip this proc if it is not included in the machine model.
1452 //
1453 if (! KMP_CPU_ISSET(proc, fullMask)) {
1454 continue;
1455 }
1456 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1457
1458 __kmp_affinity_bind_thread(proc);
1459
1460 //
1461        // Extract the labels for each level in the machine topology map
1462 // from the Apic ID.
1463 //
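        // Illustrative example (hypothetical shift widths): if level 0 (SMT)
        // reports a shift of 1 and level 1 (core) a shift of 5, then for an
        // x2APIC id of 0x53 the labels come out as thread = 0x53 & 0x1 = 1,
        // core = (0x53 & 0x1f) >> 1 = 9, and package = 0x53 >> 5 = 2.
        //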
1464 Address addr(depth);
1465 int prev_shift = 0;
1466
1467 for (level = 0; level < depth; level++) {
1468 __kmp_x86_cpuid(11, level, &buf);
1469 unsigned apicId = buf.edx;
1470 if (buf.ebx == 0) {
1471 if (level != depth - 1) {
1472 KMP_CPU_FREE(oldMask);
1473 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1474 return -1;
1475 }
1476 addr.labels[depth - level - 1] = apicId >> prev_shift;
1477 level++;
1478 break;
1479 }
1480 int shift = buf.eax & 0x1f;
1481 int mask = (1 << shift) - 1;
1482 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1483 prev_shift = shift;
1484 }
1485 if (level != depth) {
1486 KMP_CPU_FREE(oldMask);
1487 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1488 return -1;
1489 }
1490
1491 retval[nApics] = AddrUnsPair(addr, proc);
1492 nApics++;
1493 }
1494
1495 //
1496 // We've collected all the info we need.
1497 // Restore the old affinity mask for this thread.
1498 //
1499 __kmp_set_system_affinity(oldMask, TRUE);
1500
1501 //
1502 // If there's only one thread context to bind to, return now.
1503 //
1504 KMP_ASSERT(nApics > 0);
1505 if (nApics == 1) {
1506 __kmp_ncores = nPackages = 1;
1507 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1508        if (__kmp_affinity_verbose) {
1509 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1510 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1511
1512 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1513 if (__kmp_affinity_respect_mask) {
1514 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1515 } else {
1516 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1517 }
1518 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1519 KMP_INFORM(Uniform, "KMP_AFFINITY");
1520 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1521 __kmp_nThreadsPerCore, __kmp_ncores);
1522 }
1523
1524 if (__kmp_affinity_type == affinity_none) {
1525 __kmp_free(retval);
1526 KMP_CPU_FREE(oldMask);
1527 return 0;
1528 }
1529
1530 //
1531 // Form an Address object which only includes the package level.
1532 //
1533 Address addr(1);
1534 addr.labels[0] = retval[0].first.labels[pkgLevel];
1535 retval[0].first = addr;
1536
1537 if (__kmp_affinity_gran_levels < 0) {
1538 __kmp_affinity_gran_levels = 0;
1539 }
1540
1541 if (__kmp_affinity_verbose) {
1542 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1543 }
1544
1545 *address2os = retval;
1546 KMP_CPU_FREE(oldMask);
1547 return 1;
1548 }
1549
1550 //
1551 // Sort the table by physical Id.
1552 //
1553 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1554
1555 //
1556 // Find the radix at each of the levels.
1557 //
1558 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1559 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1560 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1561 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1562 for (level = 0; level < depth; level++) {
1563 totals[level] = 1;
1564 maxCt[level] = 1;
1565 counts[level] = 1;
1566 last[level] = retval[0].first.labels[level];
1567 }
1568
1569 //
1570 // From here on, the iteration variable "level" runs from the finest
1571 // level to the coarsest, i.e. we iterate forward through
1572 // (*address2os)[].first.labels[] - in the previous loops, we iterated
1573 // backwards.
1574 //
1575 for (proc = 1; (int)proc < nApics; proc++) {
1576 int level;
1577 for (level = 0; level < depth; level++) {
1578 if (retval[proc].first.labels[level] != last[level]) {
1579 int j;
1580 for (j = level + 1; j < depth; j++) {
1581 totals[j]++;
1582 counts[j] = 1;
1583                // The line below would cause incorrect topology information to be
1584                // printed when the maximum value for some level (maxCt[level]) is
1585                // encountered earlier in the array than a smaller value.
1586                // For example, if pkg0 has 4 cores and pkg1 has 2 cores, then maxCt[1]
1587                // would end up as 2, whereas it must be 4.
1588 // TODO!!! Check if it can be commented safely
1589 //maxCt[j] = 1;
1590 last[j] = retval[proc].first.labels[j];
1591 }
1592 totals[level]++;
1593 counts[level]++;
1594 if (counts[level] > maxCt[level]) {
1595 maxCt[level] = counts[level];
1596 }
1597 last[level] = retval[proc].first.labels[level];
1598 break;
1599 }
1600 else if (level == depth - 1) {
1601 __kmp_free(last);
1602 __kmp_free(maxCt);
1603 __kmp_free(counts);
1604 __kmp_free(totals);
1605 __kmp_free(retval);
1606 KMP_CPU_FREE(oldMask);
1607 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1608 return -1;
1609 }
1610 }
1611 }
1612
1613 //
1614 // When affinity is off, this routine will still be called to set
1615    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1616    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1617 // correctly, and return if affinity is not enabled.
1618 //
1619 if (threadLevel >= 0) {
1620 __kmp_nThreadsPerCore = maxCt[threadLevel];
1621 }
1622 else {
1623 __kmp_nThreadsPerCore = 1;
1624 }
1625    nPackages = totals[pkgLevel];
1626
1627 if (coreLevel >= 0) {
1628 __kmp_ncores = totals[coreLevel];
1629 nCoresPerPkg = maxCt[coreLevel];
1630 }
1631 else {
1632 __kmp_ncores = nPackages;
1633 nCoresPerPkg = 1;
1634 }
1635
1636 //
1637 // Check to see if the machine topology is uniform
1638 //
1639 unsigned prod = maxCt[0];
1640 for (level = 1; level < depth; level++) {
1641 prod *= maxCt[level];
1642 }
1643 bool uniform = (prod == totals[level - 1]);
1644
1645 //
1646 // Print the machine topology summary.
1647 //
1648 if (__kmp_affinity_verbose) {
1649 char mask[KMP_AFFIN_MASK_PRINT_LEN];
1650 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1651
1652 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1653 if (__kmp_affinity_respect_mask) {
1654 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1655 } else {
1656 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1657 }
1658 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1659 if (uniform) {
1660 KMP_INFORM(Uniform, "KMP_AFFINITY");
1661 } else {
1662 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1663 }
1664
1665 kmp_str_buf_t buf;
1666 __kmp_str_buf_init(&buf);
1667
1668 __kmp_str_buf_print(&buf, "%d", totals[0]);
1669 for (level = 1; level <= pkgLevel; level++) {
1670 __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1671 }
1672 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1673 __kmp_nThreadsPerCore, __kmp_ncores);
1674
1675 __kmp_str_buf_free(&buf);
1676 }
1677
1678 if (__kmp_affinity_type == affinity_none) {
1679 __kmp_free(last);
1680 __kmp_free(maxCt);
1681 __kmp_free(counts);
1682 __kmp_free(totals);
1683 __kmp_free(retval);
1684 KMP_CPU_FREE(oldMask);
1685 return 0;
1686 }
1687
1688 //
1689    // Find any levels with radix 1, and remove them from the map
1690 // (except for the package level).
1691 //
1692 int new_depth = 0;
1693 for (level = 0; level < depth; level++) {
1694 if ((maxCt[level] == 1) && (level != pkgLevel)) {
1695 continue;
1696 }
1697 new_depth++;
1698 }
1699
1700 //
1701 // If we are removing any levels, allocate a new vector to return,
1702 // and copy the relevant information to it.
1703 //
1704 if (new_depth != depth) {
1705 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1706 sizeof(AddrUnsPair) * nApics);
1707 for (proc = 0; (int)proc < nApics; proc++) {
1708 Address addr(new_depth);
1709 new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1710 }
1711 int new_level = 0;
1712 for (level = 0; level < depth; level++) {
1713 if ((maxCt[level] == 1) && (level != pkgLevel)) {
1714 if (level == threadLevel) {
1715 threadLevel = -1;
1716 }
1717 else if ((threadLevel >= 0) && (level < threadLevel)) {
1718 threadLevel--;
1719 }
1720 if (level == coreLevel) {
1721 coreLevel = -1;
1722 }
1723 else if ((coreLevel >= 0) && (level < coreLevel)) {
1724 coreLevel--;
1725 }
1726 if (level < pkgLevel) {
1727 pkgLevel--;
1728 }
1729 continue;
1730 }
1731 for (proc = 0; (int)proc < nApics; proc++) {
1732 new_retval[proc].first.labels[new_level]
1733 = retval[proc].first.labels[level];
1734 }
1735 new_level++;
1736 }
1737
1738 __kmp_free(retval);
1739 retval = new_retval;
1740 depth = new_depth;
1741 }
1742
1743 if (__kmp_affinity_gran_levels < 0) {
1744 //
1745 // Set the granularity level based on what levels are modeled
1746 // in the machine topology map.
1747 //
1748 __kmp_affinity_gran_levels = 0;
1749 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1750 __kmp_affinity_gran_levels++;
1751 }
1752 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1753 __kmp_affinity_gran_levels++;
1754 }
1755 if (__kmp_affinity_gran > affinity_gran_package) {
1756 __kmp_affinity_gran_levels++;
1757 }
1758 }
1759
1760 if (__kmp_affinity_verbose) {
1761 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1762 coreLevel, threadLevel);
1763 }
1764
1765 __kmp_free(last);
1766 __kmp_free(maxCt);
1767 __kmp_free(counts);
1768 __kmp_free(totals);
1769 KMP_CPU_FREE(oldMask);
1770 *address2os = retval;
1771 return depth;
1772}
1773
1774
1775# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1776
1777
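
//
// Field indices into each /proc/cpuinfo record (one array of unsigneds per
// OS proc): the OS proc id, hw thread id, core id, package id, and zero or
// more node_<n> ids starting at nodeIdIndex.  maxIndex tracks the highest
// index actually in use.
//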
1778#define osIdIndex 0
1779#define threadIdIndex 1
1780#define coreIdIndex 2
1781#define pkgIdIndex 3
1782#define nodeIdIndex 4
1783
1784typedef unsigned *ProcCpuInfo;
1785static unsigned maxIndex = pkgIdIndex;
1786
1787
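//
// Comparator (qsort-style): order cpuinfo records by OS proc id alone.
//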
1788static int
1789__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1790{
1791 const unsigned *aa = (const unsigned *)a;
1792 const unsigned *bb = (const unsigned *)b;
1793 if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1794 if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1795 return 0;
1796};
1797
1798
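//
// Comparator (qsort-style): order cpuinfo records by their full physical
// location, comparing from the most significant field (highest node level,
// then package, core, thread) down to the OS proc id.
//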
1799static int
1800__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1801{
1802 unsigned i;
1803 const unsigned *aa = *((const unsigned **)a);
1804 const unsigned *bb = *((const unsigned **)b);
1805 for (i = maxIndex; ; i--) {
1806 if (aa[i] < bb[i]) return -1;
1807 if (aa[i] > bb[i]) return 1;
1808 if (i == osIdIndex) break;
1809 }
1810 return 0;
1811}
1812
1813
1814//
1815// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1816// affinity map.
1817//
1818static int
1819__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1820 kmp_i18n_id_t *const msg_id, FILE *f)
1821{
1822 *address2os = NULL;
1823 *msg_id = kmp_i18n_null;
1824
1825 //
1826    // Scan the file once, counting the number of "processor" (osId) fields,
Alp Toker8f2d3f02014-02-24 10:40:15 +00001827    // and finding the highest value of <n> for a node_<n> field.
Jim Cownie5e8470a2013-09-27 10:38:44 +00001828 //
1829 char buf[256];
1830 unsigned num_records = 0;
1831 while (! feof(f)) {
1832 buf[sizeof(buf) - 1] = 1;
1833 if (! fgets(buf, sizeof(buf), f)) {
1834 //
1835 // Read errors presumably because of EOF
1836 //
1837 break;
1838 }
1839
1840 char s1[] = "processor";
1841 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1842 num_records++;
1843 continue;
1844 }
1845
1846 //
1847 // FIXME - this will match "node_<n> <garbage>"
1848 //
1849 unsigned level;
1850 if (sscanf(buf, "node_%d id", &level) == 1) {
1851 if (nodeIdIndex + level >= maxIndex) {
1852 maxIndex = nodeIdIndex + level;
1853 }
1854 continue;
1855 }
1856 }
1857
1858 //
1859 // Check for empty file / no valid processor records, or too many.
1860 // The number of records can't exceed the number of valid bits in the
1861 // affinity mask.
1862 //
1863 if (num_records == 0) {
1864 *line = 0;
1865 *msg_id = kmp_i18n_str_NoProcRecords;
1866 return -1;
1867 }
1868 if (num_records > (unsigned)__kmp_xproc) {
1869 *line = 0;
1870 *msg_id = kmp_i18n_str_TooManyProcRecords;
1871 return -1;
1872 }
1873
1874 //
1875    // Set the file pointer back to the beginning, so that we can scan the
1876 // file again, this time performing a full parse of the data.
1877    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1878 // Adding an extra element at the end allows us to remove a lot of extra
1879 // checks for termination conditions.
1880 //
1881 if (fseek(f, 0, SEEK_SET) != 0) {
1882 *line = 0;
1883 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1884 return -1;
1885 }
1886
1887 //
1888 // Allocate the array of records to store the proc info in. The dummy
1889 // element at the end makes the logic in filling them out easier to code.
1890 //
1891 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1892 * sizeof(unsigned *));
1893 unsigned i;
1894 for (i = 0; i <= num_records; i++) {
1895 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1896 * sizeof(unsigned));
1897 }
1898
1899#define CLEANUP_THREAD_INFO \
1900 for (i = 0; i <= num_records; i++) { \
1901 __kmp_free(threadInfo[i]); \
1902 } \
1903 __kmp_free(threadInfo);
1904
1905 //
1906 // A value of UINT_MAX means that we didn't find the field
1907 //
1908 unsigned __index;
1909
1910#define INIT_PROC_INFO(p) \
1911 for (__index = 0; __index <= maxIndex; __index++) { \
1912 (p)[__index] = UINT_MAX; \
1913 }
1914
1915 for (i = 0; i <= num_records; i++) {
1916 INIT_PROC_INFO(threadInfo[i]);
1917 }
1918
1919 unsigned num_avail = 0;
1920 *line = 0;
1921 while (! feof(f)) {
1922 //
1923 // Create an inner scoping level, so that all the goto targets at the
1924 // end of the loop appear in an outer scoping level. This avoids
1925 // warnings about jumping past an initialization to a target in the
1926 // same block.
1927 //
1928 {
1929 buf[sizeof(buf) - 1] = 1;
1930 bool long_line = false;
1931 if (! fgets(buf, sizeof(buf), f)) {
1932 //
1933 // Read errors presumably because of EOF
1934 //
1935 // If there is valid data in threadInfo[num_avail], then fake
1936                // a blank line to ensure that the last address gets parsed.
1937 //
1938 bool valid = false;
1939 for (i = 0; i <= maxIndex; i++) {
1940 if (threadInfo[num_avail][i] != UINT_MAX) {
1941 valid = true;
1942 }
1943 }
1944 if (! valid) {
1945 break;
1946 }
1947 buf[0] = 0;
1948 } else if (!buf[sizeof(buf) - 1]) {
1949 //
1950 // The line is longer than the buffer. Set a flag and don't
1951 // emit an error if we were going to ignore the line, anyway.
1952 //
1953 long_line = true;
1954
1955#define CHECK_LINE \
1956 if (long_line) { \
1957 CLEANUP_THREAD_INFO; \
1958 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1959 return -1; \
1960 }
1961 }
1962 (*line)++;
1963
1964 char s1[] = "processor";
1965 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1966 CHECK_LINE;
1967 char *p = strchr(buf + sizeof(s1) - 1, ':');
1968 unsigned val;
1969 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1970 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1971 threadInfo[num_avail][osIdIndex] = val;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001972#if KMP_OS_LINUX && USE_SYSFS_INFO
1973 char path[256];
1974 snprintf(path, sizeof(path),
1975 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1976 threadInfo[num_avail][osIdIndex]);
1977 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1978
1979 snprintf(path, sizeof(path),
1980 "/sys/devices/system/cpu/cpu%u/topology/core_id",
1981 threadInfo[num_avail][osIdIndex]);
1982 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001983 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001984#else
Jim Cownie5e8470a2013-09-27 10:38:44 +00001985 }
1986 char s2[] = "physical id";
1987 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1988 CHECK_LINE;
1989 char *p = strchr(buf + sizeof(s2) - 1, ':');
1990 unsigned val;
1991 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1992 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1993 threadInfo[num_avail][pkgIdIndex] = val;
1994 continue;
1995 }
1996 char s3[] = "core id";
1997 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1998 CHECK_LINE;
1999 char *p = strchr(buf + sizeof(s3) - 1, ':');
2000 unsigned val;
2001 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2002 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2003 threadInfo[num_avail][coreIdIndex] = val;
2004 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002005#endif // KMP_OS_LINUX && USE_SYSFS_INFO
Jim Cownie5e8470a2013-09-27 10:38:44 +00002006 }
2007 char s4[] = "thread id";
2008 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2009 CHECK_LINE;
2010 char *p = strchr(buf + sizeof(s4) - 1, ':');
2011 unsigned val;
2012 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2013 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2014 threadInfo[num_avail][threadIdIndex] = val;
2015 continue;
2016 }
2017 unsigned level;
2018 if (sscanf(buf, "node_%d id", &level) == 1) {
2019 CHECK_LINE;
2020 char *p = strchr(buf + sizeof(s4) - 1, ':');
2021 unsigned val;
2022 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2023 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2024 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2025 threadInfo[num_avail][nodeIdIndex + level] = val;
2026 continue;
2027 }
2028
2029 //
2030 // We didn't recognize the leading token on the line.
2031 // There are lots of leading tokens that we don't recognize -
2032 // if the line isn't empty, go on to the next line.
2033 //
2034 if ((*buf != 0) && (*buf != '\n')) {
2035 //
2036 // If the line is longer than the buffer, read characters
2037 // until we find a newline.
2038 //
2039 if (long_line) {
2040 int ch;
2041 while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2042 }
2043 continue;
2044 }
2045
2046 //
2047 // A newline has signalled the end of the processor record.
2048 // Check that there aren't too many procs specified.
2049 //
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002050 if ((int)num_avail == __kmp_xproc) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002051 CLEANUP_THREAD_INFO;
2052 *msg_id = kmp_i18n_str_TooManyEntries;
2053 return -1;
2054 }
2055
2056 //
2057 // Check for missing fields. The osId field must be there, and we
2058 // currently require that the physical id field is specified, also.
2059 //
2060 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2061 CLEANUP_THREAD_INFO;
2062 *msg_id = kmp_i18n_str_MissingProcField;
2063 return -1;
2064 }
2065 if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2066 CLEANUP_THREAD_INFO;
2067 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2068 return -1;
2069 }
2070
2071 //
2072 // Skip this proc if it is not included in the machine model.
2073 //
2074 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2075 INIT_PROC_INFO(threadInfo[num_avail]);
2076 continue;
2077 }
2078
2079 //
2080 // We have a successful parse of this proc's info.
2081 // Increment the counter, and prepare for the next proc.
2082 //
2083 num_avail++;
2084 KMP_ASSERT(num_avail <= num_records);
2085 INIT_PROC_INFO(threadInfo[num_avail]);
2086 }
2087 continue;
2088
2089 no_val:
2090 CLEANUP_THREAD_INFO;
2091 *msg_id = kmp_i18n_str_MissingValCpuinfo;
2092 return -1;
2093
2094 dup_field:
2095 CLEANUP_THREAD_INFO;
2096 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2097 return -1;
2098 }
2099 *line = 0;
2100
2101# if KMP_MIC && REDUCE_TEAM_SIZE
2102 unsigned teamSize = 0;
2103# endif // KMP_MIC && REDUCE_TEAM_SIZE
2104
2105 // check for num_records == __kmp_xproc ???
2106
2107 //
2108 // If there's only one thread context to bind to, form an Address object
2109 // with depth 1 and return immediately (or, if affinity is off, set
2110 // address2os to NULL and return).
2111 //
2112 // If it is configured to omit the package level when there is only a
2113 // single package, the logic at the end of this routine won't work if
2114 // there is only a single thread - it would try to form an Address
2115 // object with depth 0.
2116 //
2117 KMP_ASSERT(num_avail > 0);
2118 KMP_ASSERT(num_avail <= num_records);
2119 if (num_avail == 1) {
2120 __kmp_ncores = 1;
2121 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002122 if (__kmp_affinity_verbose) {
2123 if (! KMP_AFFINITY_CAPABLE()) {
2124 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2125 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2126 KMP_INFORM(Uniform, "KMP_AFFINITY");
2127 }
2128 else {
2129 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2130 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2131 fullMask);
2132 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2133 if (__kmp_affinity_respect_mask) {
2134 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2135 } else {
2136 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2137 }
2138 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2139 KMP_INFORM(Uniform, "KMP_AFFINITY");
2140 }
2141 int index;
2142 kmp_str_buf_t buf;
2143 __kmp_str_buf_init(&buf);
2144 __kmp_str_buf_print(&buf, "1");
2145 for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2146 __kmp_str_buf_print(&buf, " x 1");
2147 }
2148 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2149 __kmp_str_buf_free(&buf);
2150 }
2151
2152 if (__kmp_affinity_type == affinity_none) {
2153 CLEANUP_THREAD_INFO;
2154 return 0;
2155 }
2156
2157 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2158 Address addr(1);
2159 addr.labels[0] = threadInfo[0][pkgIdIndex];
2160 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2161
2162 if (__kmp_affinity_gran_levels < 0) {
2163 __kmp_affinity_gran_levels = 0;
2164 }
2165
2166 if (__kmp_affinity_verbose) {
2167 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2168 }
2169
2170 CLEANUP_THREAD_INFO;
2171 return 1;
2172 }
2173
2174 //
2175 // Sort the threadInfo table by physical Id.
2176 //
2177 qsort(threadInfo, num_avail, sizeof(*threadInfo),
2178 __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2179
2180 //
2181 // The table is now sorted by pkgId / coreId / threadId, but we really
2182 // don't know the radix of any of the fields. pkgId's may be sparsely
2183 // assigned among the chips on a system. Although coreId's are usually
2184 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2185 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2186 //
2187 // For that matter, we don't know what coresPerPkg and threadsPerCore
2188 // (or the total # packages) are at this point - we want to determine
2189 // that now. We only have an upper bound on the first two figures.
2190 //
2191 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2192 * sizeof(unsigned));
2193 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2194 * sizeof(unsigned));
2195 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2196 * sizeof(unsigned));
2197 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2198 * sizeof(unsigned));
2199
2200 bool assign_thread_ids = false;
2201 unsigned threadIdCt;
2202 unsigned index;
2203
2204 restart_radix_check:
2205 threadIdCt = 0;
2206
2207 //
2208 // Initialize the counter arrays with data from threadInfo[0].
2209 //
2210 if (assign_thread_ids) {
2211 if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2212 threadInfo[0][threadIdIndex] = threadIdCt++;
2213 }
2214 else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2215 threadIdCt = threadInfo[0][threadIdIndex] + 1;
2216 }
2217 }
2218 for (index = 0; index <= maxIndex; index++) {
2219 counts[index] = 1;
2220 maxCt[index] = 1;
2221 totals[index] = 1;
2222        lastId[index] = threadInfo[0][index];
2223 }
2224
2225 //
2226 // Run through the rest of the OS procs.
2227 //
2228 for (i = 1; i < num_avail; i++) {
2229 //
2230 // Find the most significant index whose id differs
2231 // from the id for the previous OS proc.
2232 //
2233 for (index = maxIndex; index >= threadIdIndex; index--) {
2234 if (assign_thread_ids && (index == threadIdIndex)) {
2235 //
2236 // Auto-assign the thread id field if it wasn't specified.
2237 //
2238 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2239 threadInfo[i][threadIdIndex] = threadIdCt++;
2240 }
2241
2242 //
2243                // Apparently the thread id field was specified for some
2244 // entries and not others. Start the thread id counter
2245 // off at the next higher thread id.
2246 //
2247 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2248 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2249 }
2250 }
2251 if (threadInfo[i][index] != lastId[index]) {
2252 //
2253 // Run through all indices which are less significant,
2254 // and reset the counts to 1.
2255 //
2256 // At all levels up to and including index, we need to
2257 // increment the totals and record the last id.
2258 //
2259 unsigned index2;
2260 for (index2 = threadIdIndex; index2 < index; index2++) {
2261 totals[index2]++;
2262 if (counts[index2] > maxCt[index2]) {
2263 maxCt[index2] = counts[index2];
2264 }
2265 counts[index2] = 1;
2266 lastId[index2] = threadInfo[i][index2];
2267 }
2268 counts[index]++;
2269 totals[index]++;
2270 lastId[index] = threadInfo[i][index];
2271
2272 if (assign_thread_ids && (index > threadIdIndex)) {
2273
2274# if KMP_MIC && REDUCE_TEAM_SIZE
2275 //
2276 // The default team size is the total #threads in the machine
2277 // minus 1 thread for every core that has 3 or more threads.
2278 //
2279 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2280# endif // KMP_MIC && REDUCE_TEAM_SIZE
2281
2282 //
2283 // Restart the thread counter, as we are on a new core.
2284 //
2285 threadIdCt = 0;
2286
2287 //
2288 // Auto-assign the thread id field if it wasn't specified.
2289 //
2290 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2291 threadInfo[i][threadIdIndex] = threadIdCt++;
2292 }
2293
2294 //
2295                    // Apparently the thread id field was specified for some
2296 // entries and not others. Start the thread id counter
2297 // off at the next higher thread id.
2298 //
2299 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2300 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2301 }
2302 }
2303 break;
2304 }
2305 }
2306 if (index < threadIdIndex) {
2307 //
2308 // If thread ids were specified, it is an error if they are not
2309            // unique. Also, check that we haven't already restarted the
2310 // loop (to be safe - shouldn't need to).
2311 //
2312 if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2313 || assign_thread_ids) {
2314 __kmp_free(lastId);
2315 __kmp_free(totals);
2316 __kmp_free(maxCt);
2317 __kmp_free(counts);
2318 CLEANUP_THREAD_INFO;
2319 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2320 return -1;
2321 }
2322
2323 //
2324            // If the thread ids were not specified and we see entries
2325            // that are duplicates, start the loop over and
2326 // assign the thread ids manually.
2327 //
2328 assign_thread_ids = true;
2329 goto restart_radix_check;
2330 }
2331 }
2332
2333# if KMP_MIC && REDUCE_TEAM_SIZE
2334 //
2335 // The default team size is the total #threads in the machine
2336 // minus 1 thread for every core that has 3 or more threads.
2337 //
2338 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2339# endif // KMP_MIC && REDUCE_TEAM_SIZE
2340
2341 for (index = threadIdIndex; index <= maxIndex; index++) {
2342 if (counts[index] > maxCt[index]) {
2343 maxCt[index] = counts[index];
2344 }
2345 }
2346
2347 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2348 nCoresPerPkg = maxCt[coreIdIndex];
2349 nPackages = totals[pkgIdIndex];
2350
2351 //
2352 // Check to see if the machine topology is uniform
2353 //
2354 unsigned prod = totals[maxIndex];
2355 for (index = threadIdIndex; index < maxIndex; index++) {
2356 prod *= maxCt[index];
2357 }
2358 bool uniform = (prod == totals[threadIdIndex]);
2359
2360 //
2361 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +00002362 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +00002363 // nCoresPerPkg, & nPackages. Make sure all these vars are set
2364 // correctly, and return now if affinity is not enabled.
2365 //
Jim Cownie5e8470a2013-09-27 10:38:44 +00002366 __kmp_ncores = totals[coreIdIndex];
2367
2368 if (__kmp_affinity_verbose) {
2369 if (! KMP_AFFINITY_CAPABLE()) {
2370 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2371 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2372 if (uniform) {
2373 KMP_INFORM(Uniform, "KMP_AFFINITY");
2374 } else {
2375 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2376 }
2377 }
2378 else {
2379 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2380 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2381 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2382 if (__kmp_affinity_respect_mask) {
2383 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2384 } else {
2385 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2386 }
2387 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2388 if (uniform) {
2389 KMP_INFORM(Uniform, "KMP_AFFINITY");
2390 } else {
2391 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2392 }
2393 }
2394 kmp_str_buf_t buf;
2395 __kmp_str_buf_init(&buf);
2396
2397 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2398 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2399 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2400 }
2401 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2402 maxCt[threadIdIndex], __kmp_ncores);
2403
2404 __kmp_str_buf_free(&buf);
2405 }
2406
2407# if KMP_MIC && REDUCE_TEAM_SIZE
2408 //
2409 // Set the default team size.
2410 //
2411 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2412 __kmp_dflt_team_nth = teamSize;
2413 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2414 __kmp_dflt_team_nth));
2415 }
2416# endif // KMP_MIC && REDUCE_TEAM_SIZE
2417
2418 if (__kmp_affinity_type == affinity_none) {
2419 __kmp_free(lastId);
2420 __kmp_free(totals);
2421 __kmp_free(maxCt);
2422 __kmp_free(counts);
2423 CLEANUP_THREAD_INFO;
2424 return 0;
2425 }
2426
2427 //
2428 // Count the number of levels which have more nodes at that level than
2429 // at the parent's level (with there being an implicit root node of
2430    // at the parent's level (with an implicit root node above the top
2431    // level). This is equivalent to saying that there is at least
2432 // map, and the package level is always in the map.
2433 //
2434 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2435 int level = 0;
2436 for (index = threadIdIndex; index < maxIndex; index++) {
2437 KMP_ASSERT(totals[index] >= totals[index + 1]);
2438 inMap[index] = (totals[index] > totals[index + 1]);
2439 }
2440 inMap[maxIndex] = (totals[maxIndex] > 1);
2441 inMap[pkgIdIndex] = true;
2442
2443 int depth = 0;
2444 for (index = threadIdIndex; index <= maxIndex; index++) {
2445 if (inMap[index]) {
2446 depth++;
2447 }
2448 }
2449 KMP_ASSERT(depth > 0);
2450
2451 //
2452 // Construct the data structure that is to be returned.
2453 //
2454 *address2os = (AddrUnsPair*)
2455 __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2456 int pkgLevel = -1;
2457 int coreLevel = -1;
2458 int threadLevel = -1;
2459
2460 for (i = 0; i < num_avail; ++i) {
2461 Address addr(depth);
2462 unsigned os = threadInfo[i][osIdIndex];
2463 int src_index;
2464 int dst_index = 0;
2465
2466 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2467 if (! inMap[src_index]) {
2468 continue;
2469 }
2470 addr.labels[dst_index] = threadInfo[i][src_index];
2471 if (src_index == pkgIdIndex) {
2472 pkgLevel = dst_index;
2473 }
2474 else if (src_index == coreIdIndex) {
2475 coreLevel = dst_index;
2476 }
2477 else if (src_index == threadIdIndex) {
2478 threadLevel = dst_index;
2479 }
2480 dst_index++;
2481 }
2482 (*address2os)[i] = AddrUnsPair(addr, os);
2483 }
2484
2485 if (__kmp_affinity_gran_levels < 0) {
2486 //
2487 // Set the granularity level based on what levels are modeled
2488 // in the machine topology map.
2489 //
2490 unsigned src_index;
2491 __kmp_affinity_gran_levels = 0;
2492 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2493 if (! inMap[src_index]) {
2494 continue;
2495 }
2496 switch (src_index) {
2497 case threadIdIndex:
2498 if (__kmp_affinity_gran > affinity_gran_thread) {
2499 __kmp_affinity_gran_levels++;
2500 }
2501
2502 break;
2503 case coreIdIndex:
2504 if (__kmp_affinity_gran > affinity_gran_core) {
2505 __kmp_affinity_gran_levels++;
2506 }
2507 break;
2508
2509 case pkgIdIndex:
2510 if (__kmp_affinity_gran > affinity_gran_package) {
2511 __kmp_affinity_gran_levels++;
2512 }
2513 break;
2514 }
2515 }
2516 }
2517
2518 if (__kmp_affinity_verbose) {
2519 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2520 coreLevel, threadLevel);
2521 }
2522
2523 __kmp_free(inMap);
2524 __kmp_free(lastId);
2525 __kmp_free(totals);
2526 __kmp_free(maxCt);
2527 __kmp_free(counts);
2528 CLEANUP_THREAD_INFO;
2529 return depth;
2530}
2531
2532
2533//
2534// Create and return a table of affinity masks, indexed by OS thread ID.
2535// This routine handles OR'ing together all the affinity masks of threads
2536// that are sufficiently close, if granularity > fine.
2537//
2538static kmp_affin_mask_t *
2539__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2540 AddrUnsPair *address2os, unsigned numAddrs)
2541{
2542 //
2543 // First form a table of affinity masks in order of OS thread id.
2544 //
2545 unsigned depth;
2546 unsigned maxOsId;
2547 unsigned i;
2548
2549 KMP_ASSERT(numAddrs > 0);
2550 depth = address2os[0].first.depth;
2551
2552 maxOsId = 0;
2553 for (i = 0; i < numAddrs; i++) {
2554 unsigned osId = address2os[i].second;
2555 if (osId > maxOsId) {
2556 maxOsId = osId;
2557 }
2558 }
2559 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2560 (maxOsId + 1) * __kmp_affin_mask_size);
2561
2562 //
2563 // Sort the address2os table according to physical order. Doing so
2564 // will put all threads on the same core/package/node in consecutive
2565 // locations.
2566 //
2567 qsort(address2os, numAddrs, sizeof(*address2os),
2568 __kmp_affinity_cmp_Address_labels);
2569
2570 KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2571 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2572 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2573 }
2574 if (__kmp_affinity_gran_levels >= (int)depth) {
2575 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2576 && (__kmp_affinity_type != affinity_none))) {
2577 KMP_WARNING(AffThreadsMayMigrate);
2578 }
2579 }
2580
2581 //
2582 // Run through the table, forming the masks for all threads on each
2583 // core. Threads on the same core will have identical "Address"
2584 // objects, not considering the last level, which must be the thread
2585 // id. All threads on a core will appear consecutively.
2586 //
2587 unsigned unique = 0;
2588 unsigned j = 0; // index of 1st thread on core
2589 unsigned leader = 0;
2590 Address *leaderAddr = &(address2os[0].first);
2591 kmp_affin_mask_t *sum
2592 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2593 KMP_CPU_ZERO(sum);
2594 KMP_CPU_SET(address2os[0].second, sum);
2595 for (i = 1; i < numAddrs; i++) {
2596 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00002597 // If this thread is sufficiently close to the leader (within the
Jim Cownie5e8470a2013-09-27 10:38:44 +00002598 // granularity setting), then set the bit for this os thread in the
2599 // affinity mask for this group, and go on to the next thread.
2600 //
2601 if (leaderAddr->isClose(address2os[i].first,
2602 __kmp_affinity_gran_levels)) {
2603 KMP_CPU_SET(address2os[i].second, sum);
2604 continue;
2605 }
2606
2607 //
2608 // For every thread in this group, copy the mask to the thread's
2609 // entry in the osId2Mask table. Mark the first address as a
2610 // leader.
2611 //
2612 for (; j < i; j++) {
2613 unsigned osId = address2os[j].second;
2614 KMP_DEBUG_ASSERT(osId <= maxOsId);
2615 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2616 KMP_CPU_COPY(mask, sum);
2617 address2os[j].first.leader = (j == leader);
2618 }
2619 unique++;
2620
2621 //
2622 // Start a new mask.
2623 //
2624 leader = i;
2625 leaderAddr = &(address2os[i].first);
2626 KMP_CPU_ZERO(sum);
2627 KMP_CPU_SET(address2os[i].second, sum);
2628 }
2629
2630 //
2631 // For every thread in last group, copy the mask to the thread's
2632 // entry in the osId2Mask table.
2633 //
2634 for (; j < i; j++) {
2635 unsigned osId = address2os[j].second;
2636 KMP_DEBUG_ASSERT(osId <= maxOsId);
2637 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2638 KMP_CPU_COPY(mask, sum);
2639 address2os[j].first.leader = (j == leader);
2640 }
2641 unique++;
2642
2643 *maxIndex = maxOsId;
2644 *numUnique = unique;
2645 return osId2Mask;
2646}
2647
2648
2649//
2650// Stuff for the affinity proclist parsers. It's easier to declare these vars
2651// as file-static than to try and pass them through the calling sequence of
2652// the recursive-descent OMP_PLACES parser.
2653//
2654static kmp_affin_mask_t *newMasks;
2655static int numNewMasks;
2656static int nextNewMask;
2657
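//
// ADD_MASK appends a copy of _mask to the newMasks vector, doubling the
// vector's capacity whenever it fills up.  ADD_MASK_OSID first checks that
// the OS proc id is valid (within range and present in the osId2Mask
// table); invalid ids produce a warning and are otherwise ignored.
//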
2658#define ADD_MASK(_mask) \
2659 { \
2660 if (nextNewMask >= numNewMasks) { \
2661 numNewMasks *= 2; \
2662 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2663 numNewMasks * __kmp_affin_mask_size); \
2664 } \
2665 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2666 nextNewMask++; \
2667 }
2668
2669#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2670 { \
2671 if (((_osId) > _maxOsId) || \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002672 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
Jim Cownie5e8470a2013-09-27 10:38:44 +00002673 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2674 && (__kmp_affinity_type != affinity_none))) { \
2675 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2676 } \
2677 } \
2678 else { \
2679 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2680 } \
2681 }
2682
2683
2684//
2685// Re-parse the proclist (for the explicit affinity type), and form the list
2686// of affinity newMasks indexed by gtid.
2687//
2688static void
2689__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2690 unsigned int *out_numMasks, const char *proclist,
2691 kmp_affin_mask_t *osId2Mask, int maxOsId)
2692{
2693 const char *scan = proclist;
2694 const char *next = proclist;
2695
2696 //
2697 // We use malloc() for the temporary mask vector,
2698 // so that we can use realloc() to extend it.
2699 //
2700 numNewMasks = 2;
2701 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2702 * __kmp_affin_mask_size);
2703 nextNewMask = 0;
2704 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2705 __kmp_affin_mask_size);
2706 int setSize = 0;
2707
2708 for (;;) {
2709 int start, end, stride;
2710
2711 SKIP_WS(scan);
2712 next = scan;
2713 if (*next == '\0') {
2714 break;
2715 }
2716
2717 if (*next == '{') {
2718 int num;
2719 setSize = 0;
2720 next++; // skip '{'
2721 SKIP_WS(next);
2722 scan = next;
2723
2724 //
2725 // Read the first integer in the set.
2726 //
2727 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2728 "bad proclist");
2729 SKIP_DIGITS(next);
2730 num = __kmp_str_to_int(scan, *next);
2731 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2732
2733 //
2734 // Copy the mask for that osId to the sum (union) mask.
2735 //
2736 if ((num > maxOsId) ||
2737 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2738 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2739 && (__kmp_affinity_type != affinity_none))) {
2740 KMP_WARNING(AffIgnoreInvalidProcID, num);
2741 }
2742 KMP_CPU_ZERO(sumMask);
2743 }
2744 else {
2745 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2746 setSize = 1;
2747 }
2748
2749 for (;;) {
2750 //
2751 // Check for end of set.
2752 //
2753 SKIP_WS(next);
2754 if (*next == '}') {
2755 next++; // skip '}'
2756 break;
2757 }
2758
2759 //
2760 // Skip optional comma.
2761 //
2762 if (*next == ',') {
2763 next++;
2764 }
2765 SKIP_WS(next);
2766
2767 //
2768 // Read the next integer in the set.
2769 //
2770 scan = next;
2771 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2772 "bad explicit proc list");
2773
2774 SKIP_DIGITS(next);
2775 num = __kmp_str_to_int(scan, *next);
2776 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2777
2778 //
2779 // Add the mask for that osId to the sum mask.
2780 //
2781 if ((num > maxOsId) ||
2782 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2783 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2784 && (__kmp_affinity_type != affinity_none))) {
2785 KMP_WARNING(AffIgnoreInvalidProcID, num);
2786 }
2787 }
2788 else {
2789 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2790 setSize++;
2791 }
2792 }
2793 if (setSize > 0) {
2794 ADD_MASK(sumMask);
2795 }
2796
2797 SKIP_WS(next);
2798 if (*next == ',') {
2799 next++;
2800 }
2801 scan = next;
2802 continue;
2803 }
2804
2805 //
2806 // Read the first integer.
2807 //
2808 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2809 SKIP_DIGITS(next);
2810 start = __kmp_str_to_int(scan, *next);
2811 KMP_ASSERT2(start >= 0, "bad explicit proc list");
2812 SKIP_WS(next);
2813
2814 //
2815 // If this isn't a range, then add a mask to the list and go on.
2816 //
2817 if (*next != '-') {
2818 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2819
2820 //
2821 // Skip optional comma.
2822 //
2823 if (*next == ',') {
2824 next++;
2825 }
2826 scan = next;
2827 continue;
2828 }
2829
2830 //
2831 // This is a range. Skip over the '-' and read in the 2nd int.
2832 //
2833 next++; // skip '-'
2834 SKIP_WS(next);
2835 scan = next;
2836 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2837 SKIP_DIGITS(next);
2838 end = __kmp_str_to_int(scan, *next);
2839 KMP_ASSERT2(end >= 0, "bad explicit proc list");
2840
2841 //
2842 // Check for a stride parameter
2843 //
2844 stride = 1;
2845 SKIP_WS(next);
2846 if (*next == ':') {
2847 //
2848            // A stride is specified.  Skip over the ':' and read the 3rd int.
2849 //
2850 int sign = +1;
2851 next++; // skip ':'
2852 SKIP_WS(next);
2853 scan = next;
2854 if (*next == '-') {
2855 sign = -1;
2856 next++;
2857 SKIP_WS(next);
2858 scan = next;
2859 }
2860 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2861 "bad explicit proc list");
2862 SKIP_DIGITS(next);
2863 stride = __kmp_str_to_int(scan, *next);
2864 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2865 stride *= sign;
2866 }
2867
2868 //
2869 // Do some range checks.
2870 //
2871 KMP_ASSERT2(stride != 0, "bad explicit proc list");
2872 if (stride > 0) {
2873 KMP_ASSERT2(start <= end, "bad explicit proc list");
2874 }
2875 else {
2876 KMP_ASSERT2(start >= end, "bad explicit proc list");
2877 }
2878 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2879
2880 //
2881 // Add the mask for each OS proc # to the list.
2882 //
2883 if (stride > 0) {
2884 do {
2885 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2886 start += stride;
2887 } while (start <= end);
2888 }
2889 else {
2890 do {
2891 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2892 start += stride;
2893 } while (start >= end);
2894 }
2895
2896 //
2897 // Skip optional comma.
2898 //
2899 SKIP_WS(next);
2900 if (*next == ',') {
2901 next++;
2902 }
2903 scan = next;
2904 }
2905
2906 *out_numMasks = nextNewMask;
2907 if (nextNewMask == 0) {
2908 *out_masks = NULL;
2909 KMP_INTERNAL_FREE(newMasks);
2910 return;
2911 }
2912 *out_masks
2913 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2914 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2915 __kmp_free(sumMask);
2916 KMP_INTERNAL_FREE(newMasks);
2917}
2918
2919
2920# if OMP_40_ENABLED
2921
2922/*-----------------------------------------------------------------------------
2923
2924Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2925places. Again, here is the grammar:
2926
2927place_list := place
2928place_list := place , place_list
2929place := num
2930place := place : num
2931place := place : num : signed
2932place := { subplacelist }
2933place := ! place // (lowest priority)
2934subplace_list := subplace
2935subplace_list := subplace , subplace_list
2936subplace := num
2937subplace := num : num
2938subplace := num : num : signed
2939signed := num
2940signed := + signed
2941signed := - signed
2942
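For example, "{0,1,2,3}:4:4" denotes the place {0,1,2,3} replicated 4 times
with a stride of 4 OS procs: {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15}.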
2943-----------------------------------------------------------------------------*/
2944
2945static void
2946__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2947 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2948{
2949 const char *next;
2950
2951 for (;;) {
2952 int start, count, stride, i;
2953
2954 //
2955 // Read in the starting proc id
2956 //
2957 SKIP_WS(*scan);
2958 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2959 "bad explicit places list");
2960 next = *scan;
2961 SKIP_DIGITS(next);
2962 start = __kmp_str_to_int(*scan, *next);
2963 KMP_ASSERT(start >= 0);
2964 *scan = next;
2965
2966 //
2967 // valid follow sets are ',' ':' and '}'
2968 //
2969 SKIP_WS(*scan);
2970 if (**scan == '}' || **scan == ',') {
2971 if ((start > maxOsId) ||
2972 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2973 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2974 && (__kmp_affinity_type != affinity_none))) {
2975 KMP_WARNING(AffIgnoreInvalidProcID, start);
2976 }
2977 }
2978 else {
2979 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2980 (*setSize)++;
2981 }
2982 if (**scan == '}') {
2983 break;
2984 }
2985 (*scan)++; // skip ','
2986 continue;
2987 }
2988 KMP_ASSERT2(**scan == ':', "bad explicit places list");
2989 (*scan)++; // skip ':'
2990
2991 //
2992 // Read count parameter
2993 //
2994 SKIP_WS(*scan);
2995 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2996 "bad explicit places list");
2997 next = *scan;
2998 SKIP_DIGITS(next);
2999 count = __kmp_str_to_int(*scan, *next);
3000 KMP_ASSERT(count >= 0);
3001 *scan = next;
3002
3003 //
3004 // valid follow sets are ',' ':' and '}'
3005 //
3006 SKIP_WS(*scan);
3007 if (**scan == '}' || **scan == ',') {
3008 for (i = 0; i < count; i++) {
3009 if ((start > maxOsId) ||
3010 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3011 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3012 && (__kmp_affinity_type != affinity_none))) {
3013 KMP_WARNING(AffIgnoreInvalidProcID, start);
3014 }
3015 break; // don't proliferate warnings for large count
3016 }
3017 else {
3018 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3019 start++;
3020 (*setSize)++;
3021 }
3022 }
3023 if (**scan == '}') {
3024 break;
3025 }
3026 (*scan)++; // skip ','
3027 continue;
3028 }
3029 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3030 (*scan)++; // skip ':'
3031
3032 //
3033 // Read stride parameter
3034 //
3035 int sign = +1;
3036 for (;;) {
3037 SKIP_WS(*scan);
3038 if (**scan == '+') {
3039 (*scan)++; // skip '+'
3040 continue;
3041 }
3042 if (**scan == '-') {
3043 sign *= -1;
3044 (*scan)++; // skip '-'
3045 continue;
3046 }
3047 break;
3048 }
3049 SKIP_WS(*scan);
3050 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3051 "bad explicit places list");
3052 next = *scan;
3053 SKIP_DIGITS(next);
3054 stride = __kmp_str_to_int(*scan, *next);
3055 KMP_ASSERT(stride >= 0);
3056 *scan = next;
3057 stride *= sign;
3058
3059 //
3060 // valid follow sets are ',' and '}'
3061 //
3062 SKIP_WS(*scan);
3063 if (**scan == '}' || **scan == ',') {
3064 for (i = 0; i < count; i++) {
3065 if ((start > maxOsId) ||
3066 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3067 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3068 && (__kmp_affinity_type != affinity_none))) {
3069 KMP_WARNING(AffIgnoreInvalidProcID, start);
3070 }
3071 break; // don't proliferate warnings for large count
3072 }
3073 else {
3074 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3075 start += stride;
3076 (*setSize)++;
3077 }
3078 }
3079 if (**scan == '}') {
3080 break;
3081 }
3082 (*scan)++; // skip ','
3083 continue;
3084 }
3085
3086 KMP_ASSERT2(0, "bad explicit places list");
3087 }
3088}
3089
3090
3091static void
3092__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3093 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3094{
3095 const char *next;
3096
3097 //
3098 // valid follow sets are '{' '!' and num
3099 //
3100 SKIP_WS(*scan);
3101 if (**scan == '{') {
3102 (*scan)++; // skip '{'
3103 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3104 setSize);
3105 KMP_ASSERT2(**scan == '}', "bad explicit places list");
3106 (*scan)++; // skip '}'
3107 }
3108 else if (**scan == '!') {
3109        (*scan)++; // skip '!'
3110        __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3111        KMP_CPU_COMPLEMENT(tempMask);
3112 }
3113 else if ((**scan >= '0') && (**scan <= '9')) {
3114 next = *scan;
3115 SKIP_DIGITS(next);
3116 int num = __kmp_str_to_int(*scan, *next);
3117 KMP_ASSERT(num >= 0);
3118 if ((num > maxOsId) ||
3119 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3120 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3121 && (__kmp_affinity_type != affinity_none))) {
3122 KMP_WARNING(AffIgnoreInvalidProcID, num);
3123 }
3124 }
3125 else {
3126 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3127 (*setSize)++;
3128 }
3129 *scan = next; // skip num
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003130 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003131 else {
3132 KMP_ASSERT2(0, "bad explicit places list");
3133 }
3134}
3135
3136
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003137//static void
3138void
Jim Cownie5e8470a2013-09-27 10:38:44 +00003139__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3140 unsigned int *out_numMasks, const char *placelist,
3141 kmp_affin_mask_t *osId2Mask, int maxOsId)
3142{
3143 const char *scan = placelist;
3144 const char *next = placelist;
3145
3146 numNewMasks = 2;
3147 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3148 * __kmp_affin_mask_size);
3149 nextNewMask = 0;
3150
3151 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3152 __kmp_affin_mask_size);
3153 KMP_CPU_ZERO(tempMask);
3154 int setSize = 0;
3155
3156 for (;;) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003157 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3158
3159 //
3160 // valid follow sets are ',' ':' and EOL
3161 //
3162 SKIP_WS(scan);
3163 if (*scan == '\0' || *scan == ',') {
3164 if (setSize > 0) {
3165 ADD_MASK(tempMask);
3166 }
3167 KMP_CPU_ZERO(tempMask);
3168 setSize = 0;
3169 if (*scan == '\0') {
3170 break;
3171 }
3172 scan++; // skip ','
3173 continue;
3174 }
3175
3176 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3177 scan++; // skip ':'
3178
3179 //
3180 // Read count parameter
3181 //
3182 SKIP_WS(scan);
3183 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3184 "bad explicit places list");
3185 next = scan;
3186 SKIP_DIGITS(next);
Jim Cownie181b4bb2013-12-23 17:28:57 +00003187 int count = __kmp_str_to_int(scan, *next);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003188 KMP_ASSERT(count >= 0);
3189 scan = next;
3190
3191 //
3192 // valid follow sets are ',' ':' and EOL
3193 //
3194 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003195 int stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003196 if (*scan == '\0' || *scan == ',') {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003197 stride = +1;
3198 }
3199 else {
3200 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3201 scan++; // skip ':'
Jim Cownie5e8470a2013-09-27 10:38:44 +00003202
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003203 //
3204 // Read stride parameter
3205 //
3206 int sign = +1;
3207 for (;;) {
3208 SKIP_WS(scan);
3209 if (*scan == '+') {
3210 scan++; // skip '+'
3211 continue;
3212 }
3213 if (*scan == '-') {
3214 sign *= -1;
3215 scan++; // skip '-'
3216 continue;
3217 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003218 break;
3219 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003220 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003221 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3222 "bad explicit places list");
3223 next = scan;
3224 SKIP_DIGITS(next);
3225 stride = __kmp_str_to_int(scan, *next);
3226 KMP_DEBUG_ASSERT(stride >= 0);
3227 scan = next;
3228 stride *= sign;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003229 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003230
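        //
        // Replicate the place 'count' times: each iteration records the
        // current mask, then shifts it by 'stride' OS proc ids (upward for
        // a positive stride, downward for a negative one), dropping any
        // bits that fall on invalid or unavailable OS procs.
        //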
3231 if (stride > 0) {
3232 int i;
3233 for (i = 0; i < count; i++) {
3234 int j;
3235 if (setSize == 0) {
3236 break;
3237 }
3238 ADD_MASK(tempMask);
3239 setSize = 0;
3240 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003241 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3242 KMP_CPU_CLR(j, tempMask);
3243 }
3244 else if ((j > maxOsId) ||
3245 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3246 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3247 && (__kmp_affinity_type != affinity_none))) {
3248 KMP_WARNING(AffIgnoreInvalidProcID, j);
3249 }
3250 KMP_CPU_CLR(j, tempMask);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003251 }
3252 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003253 KMP_CPU_SET(j, tempMask);
3254 setSize++;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003255 }
3256 }
3257 for (; j >= 0; j--) {
3258 KMP_CPU_CLR(j, tempMask);
3259 }
3260 }
3261 }
3262 else {
3263 int i;
3264 for (i = 0; i < count; i++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003265 int j;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003266 if (setSize == 0) {
3267 break;
3268 }
3269 ADD_MASK(tempMask);
3270 setSize = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003271 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003272 j++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003273 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3274 KMP_CPU_CLR(j, tempMask);
3275 }
3276 else if ((j > maxOsId) ||
3277 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3278 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3279 && (__kmp_affinity_type != affinity_none))) {
3280 KMP_WARNING(AffIgnoreInvalidProcID, j);
3281 }
3282 KMP_CPU_CLR(j, tempMask);
3283 }
3284 else {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003285 KMP_CPU_SET(j, tempMask);
3286 setSize++;
3287 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003288 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003289 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003290 KMP_CPU_CLR(j, tempMask);
3291 }
3292 }
3293 }
3294 KMP_CPU_ZERO(tempMask);
3295 setSize = 0;
3296
3297 //
3298 // valid follow sets are ',' and EOL
3299 //
3300 SKIP_WS(scan);
3301 if (*scan == '\0') {
3302 break;
3303 }
3304 if (*scan == ',') {
3305 scan++; // skip ','
3306 continue;
3307 }
3308
3309 KMP_ASSERT2(0, "bad explicit places list");
3310 }
3311
3312 *out_numMasks = nextNewMask;
3313 if (nextNewMask == 0) {
3314 *out_masks = NULL;
3315 KMP_INTERNAL_FREE(newMasks);
3316 return;
3317 }
3318 *out_masks
3319 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3320 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3321 __kmp_free(tempMask);
3322 KMP_INTERNAL_FREE(newMasks);
3323}
3324
3325# endif /* OMP_40_ENABLED */
3326
3327#undef ADD_MASK
3328#undef ADD_MASK_OSID
3329
Jim Cownie5e8470a2013-09-27 10:38:44 +00003330static void
3331__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3332{
3333 if ( __kmp_place_num_cores == 0 ) {
3334 if ( __kmp_place_num_threads_per_core == 0 ) {
3335 return; // no cores limiting actions requested, exit
3336 }
3337 __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3338 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003339 if ( !__kmp_affinity_uniform_topology() ) {
3340 KMP_WARNING( AffThrPlaceNonUniform );
3341 return; // don't support non-uniform topology
3342 }
3343 if ( depth != 3 ) {
3344 KMP_WARNING( AffThrPlaceNonThreeLevel );
3345 return; // don't support not-3-level topology
Jim Cownie5e8470a2013-09-27 10:38:44 +00003346 }
3347 if ( __kmp_place_num_threads_per_core == 0 ) {
3348 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3349 }
Andrey Churbanov5cd50e32015-01-29 17:14:58 +00003350 if ( __kmp_place_core_offset + __kmp_place_num_cores > (unsigned int)nCoresPerPkg ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003351 KMP_WARNING( AffThrPlaceManyCores );
3352 return;
3353 }
3354
3355 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3356 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3357 int i, j, k, n_old = 0, n_new = 0;
3358 for ( i = 0; i < nPackages; ++i ) {
3359 for ( j = 0; j < nCoresPerPkg; ++j ) {
Andrey Churbanov5cd50e32015-01-29 17:14:58 +00003360 if ( (unsigned int)j < __kmp_place_core_offset || (unsigned int)j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003361 n_old += __kmp_nThreadsPerCore; // skip not-requested core
3362 } else {
3363 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
Andrey Churbanov5cd50e32015-01-29 17:14:58 +00003364 if ( (unsigned int)k < __kmp_place_num_threads_per_core ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003365 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location
3366 n_new++;
3367 }
3368 n_old++;
3369 }
3370 }
3371 }
3372 }
3373 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3374 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3375 __kmp_avail_proc = n_new; // correct avail_proc
3376 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3377
3378 __kmp_free( *pAddr );
3379 *pAddr = newAddr; // replace old topology with new one
3380}
3381
Jim Cownie5e8470a2013-09-27 10:38:44 +00003382
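//
// File-scope state shared by the affinity initialization code: the machine
// topology map (address2os) and its depth (__kmp_aff_depth), plus an
// auxiliary per-proc table (procarr) that some affinity types use.
//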
3383static AddrUnsPair *address2os = NULL;
3384static int * procarr = NULL;
3385static int __kmp_aff_depth = 0;
3386
3387static void
3388__kmp_aux_affinity_initialize(void)
3389{
3390 if (__kmp_affinity_masks != NULL) {
3391 KMP_ASSERT(fullMask != NULL);
3392 return;
3393 }
3394
3395 //
3396 // Create the "full" mask - this defines all of the processors that we
3397 // consider to be in the machine model. If respect is set, then it is
3398 // the initialization thread's affinity mask. Otherwise, it is all
3399 // processors that we know about on the machine.
3400 //
3401 if (fullMask == NULL) {
3402 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3403 }
3404 if (KMP_AFFINITY_CAPABLE()) {
3405 if (__kmp_affinity_respect_mask) {
3406 __kmp_get_system_affinity(fullMask, TRUE);
3407
3408 //
3409 // Count the number of available processors.
3410 //
3411 unsigned i;
3412 __kmp_avail_proc = 0;
3413 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3414 if (! KMP_CPU_ISSET(i, fullMask)) {
3415 continue;
3416 }
3417 __kmp_avail_proc++;
3418 }
3419 if (__kmp_avail_proc > __kmp_xproc) {
3420 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3421 && (__kmp_affinity_type != affinity_none))) {
3422 KMP_WARNING(ErrorInitializeAffinity);
3423 }
3424 __kmp_affinity_type = affinity_none;
3425 __kmp_affin_mask_size = 0;
3426 return;
3427 }
3428 }
3429 else {
3430 __kmp_affinity_entire_machine_mask(fullMask);
3431 __kmp_avail_proc = __kmp_xproc;
3432 }
3433 }
3434
3435 int depth = -1;
3436 kmp_i18n_id_t msg_id = kmp_i18n_null;
3437
3438 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00003439 // For backward compatibility, setting KMP_CPUINFO_FILE =>
Jim Cownie5e8470a2013-09-27 10:38:44 +00003440 // KMP_TOPOLOGY_METHOD=cpuinfo
3441 //
3442 if ((__kmp_cpuinfo_file != NULL) &&
3443 (__kmp_affinity_top_method == affinity_top_method_all)) {
3444 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3445 }
3446
3447 if (__kmp_affinity_top_method == affinity_top_method_all) {
3448 //
3449 // In the default code path, errors are not fatal - we just try using
3450 // another method. We only emit a warning message if affinity is on,
3451        // or the verbose flag is set, and the nowarnings flag was not set.
3452 //
3453 const char *file_name = NULL;
3454 int line = 0;
3455
3456# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3457
3458 if (__kmp_affinity_verbose) {
3459 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3460 }
3461
3462 file_name = NULL;
3463 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3464 if (depth == 0) {
3465 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3466 KMP_ASSERT(address2os == NULL);
3467 return;
3468 }
3469
3470 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003471 if (__kmp_affinity_verbose) {
3472 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003473 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3474 KMP_I18N_STR(DecodingLegacyAPIC));
3475 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003476 else {
3477 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3478 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003479 }
3480
3481 file_name = NULL;
3482 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3483 if (depth == 0) {
3484 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3485 KMP_ASSERT(address2os == NULL);
3486 return;
3487 }
3488 }
3489
3490# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3491
3492# if KMP_OS_LINUX
3493
3494 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003495 if (__kmp_affinity_verbose) {
3496 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003497 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3498 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003499 else {
3500 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3501 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003502 }
3503
3504 FILE *f = fopen("/proc/cpuinfo", "r");
3505 if (f == NULL) {
3506 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3507 }
3508 else {
3509 file_name = "/proc/cpuinfo";
3510 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3511 fclose(f);
3512 if (depth == 0) {
3513 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3514 KMP_ASSERT(address2os == NULL);
3515 return;
3516 }
3517 }
3518 }
3519
3520# endif /* KMP_OS_LINUX */
3521
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003522# if KMP_GROUP_AFFINITY
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003523
3524 if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3525 if (__kmp_affinity_verbose) {
3526 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3527 }
3528
3529 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3530 KMP_ASSERT(depth != 0);
3531 }
3532
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003533# endif /* KMP_GROUP_AFFINITY */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003534
Jim Cownie5e8470a2013-09-27 10:38:44 +00003535 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003536 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003537 if (file_name == NULL) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003538 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003539 }
3540 else if (line == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003541 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003542 }
3543 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003544 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003545 }
3546 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003547 // FIXME - print msg if msg_id = kmp_i18n_null ???
Jim Cownie5e8470a2013-09-27 10:38:44 +00003548
3549 file_name = "";
3550 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3551 if (depth == 0) {
3552 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3553 KMP_ASSERT(address2os == NULL);
3554 return;
3555 }
3556 KMP_ASSERT(depth > 0);
3557 KMP_ASSERT(address2os != NULL);
3558 }
3559 }
3560
3561 //
3562    // If the user has specified that a particular topology discovery method
3563 // is to be used, then we abort if that method fails. The exception is
3564 // group affinity, which might have been implicitly set.
3565 //
3566
3567# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3568
3569 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3570 if (__kmp_affinity_verbose) {
3571 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3572 KMP_I18N_STR(Decodingx2APIC));
3573 }
3574
3575 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3576 if (depth == 0) {
3577 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3578 KMP_ASSERT(address2os == NULL);
3579 return;
3580 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003581 if (depth < 0) {
3582 KMP_ASSERT(msg_id != kmp_i18n_null);
3583 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3584 }
3585 }
3586 else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3587 if (__kmp_affinity_verbose) {
3588 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3589 KMP_I18N_STR(DecodingLegacyAPIC));
3590 }
3591
3592 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3593 if (depth == 0) {
3594 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3595 KMP_ASSERT(address2os == NULL);
3596 return;
3597 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003598 if (depth < 0) {
3599 KMP_ASSERT(msg_id != kmp_i18n_null);
3600 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3601 }
3602 }
3603
3604# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3605
3606 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3607 const char *filename;
3608 if (__kmp_cpuinfo_file != NULL) {
3609 filename = __kmp_cpuinfo_file;
3610 }
3611 else {
3612 filename = "/proc/cpuinfo";
3613 }
3614
3615 if (__kmp_affinity_verbose) {
3616 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3617 }
3618
3619 FILE *f = fopen(filename, "r");
3620 if (f == NULL) {
3621 int code = errno;
3622 if (__kmp_cpuinfo_file != NULL) {
3623 __kmp_msg(
3624 kmp_ms_fatal,
3625 KMP_MSG(CantOpenFileForReading, filename),
3626 KMP_ERR(code),
3627 KMP_HNT(NameComesFrom_CPUINFO_FILE),
3628 __kmp_msg_null
3629 );
3630 }
3631 else {
3632 __kmp_msg(
3633 kmp_ms_fatal,
3634 KMP_MSG(CantOpenFileForReading, filename),
3635 KMP_ERR(code),
3636 __kmp_msg_null
3637 );
3638 }
3639 }
3640 int line = 0;
3641 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3642 fclose(f);
3643 if (depth < 0) {
3644 KMP_ASSERT(msg_id != kmp_i18n_null);
3645 if (line > 0) {
3646 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3647 }
3648 else {
3649 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3650 }
3651 }
3652 if (__kmp_affinity_type == affinity_none) {
3653 KMP_ASSERT(depth == 0);
3654 KMP_ASSERT(address2os == NULL);
3655 return;
3656 }
3657 }
3658
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003659# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00003660
3661 else if (__kmp_affinity_top_method == affinity_top_method_group) {
3662 if (__kmp_affinity_verbose) {
3663 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3664 }
3665
3666 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3667 KMP_ASSERT(depth != 0);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003668 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003669 KMP_ASSERT(msg_id != kmp_i18n_null);
3670 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003671 }
3672 }
3673
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003674# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +00003675
3676 else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3677 if (__kmp_affinity_verbose) {
3678 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3679 }
3680
3681 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3682 if (depth == 0) {
3683 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3684 KMP_ASSERT(address2os == NULL);
3685 return;
3686 }
3687 // should not fail
3688 KMP_ASSERT(depth > 0);
3689 KMP_ASSERT(address2os != NULL);
3690 }
3691
3692 if (address2os == NULL) {
3693 if (KMP_AFFINITY_CAPABLE()
3694 && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3695 && (__kmp_affinity_type != affinity_none)))) {
3696 KMP_WARNING(ErrorInitializeAffinity);
3697 }
3698 __kmp_affinity_type = affinity_none;
3699 __kmp_affin_mask_size = 0;
3700 return;
3701 }
3702
Jim Cownie5e8470a2013-09-27 10:38:44 +00003703 __kmp_apply_thread_places(&address2os, depth);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003704
3705 //
3706 // Create the table of masks, indexed by thread Id.
3707 //
3708 unsigned maxIndex;
3709 unsigned numUnique;
3710 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3711 address2os, __kmp_avail_proc);
3712 if (__kmp_affinity_gran_levels == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003713 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003714 }
3715
3716 //
3717 // Set the childNums vector in all Address objects. This must be done
3718 // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3719 // which takes into account the setting of __kmp_affinity_compact.
3720 //
3721 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3722
3723 switch (__kmp_affinity_type) {
3724
3725 case affinity_explicit:
3726 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3727# if OMP_40_ENABLED
3728 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3729# endif
3730 {
3731 __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3732 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3733 maxIndex);
3734 }
3735# if OMP_40_ENABLED
3736 else {
3737 __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3738 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3739 maxIndex);
3740 }
3741# endif
3742 if (__kmp_affinity_num_masks == 0) {
3743 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3744 && (__kmp_affinity_type != affinity_none))) {
3745 KMP_WARNING(AffNoValidProcID);
3746 }
3747 __kmp_affinity_type = affinity_none;
3748 return;
3749 }
3750 break;
3751
3752 //
3753 // The other affinity types rely on sorting the Addresses according
3754 // to some permutation of the machine topology tree. Set
3755 // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3756 // then jump to a common code fragment to do the sort and create
3757 // the array of affinity masks.
3758 //
3759
3760 case affinity_logical:
3761 __kmp_affinity_compact = 0;
3762 if (__kmp_affinity_offset) {
3763 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3764 % __kmp_avail_proc;
3765 }
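        // Note (illustrative): the requested offset is scaled by the number of
        // hardware threads per core and wrapped modulo the number of available
        // procs; e.g. with 2 threads per core and 8 available procs, an offset of
        // 3 becomes 6. The same scaling is applied for affinity_physical below.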
3766 goto sortAddresses;
3767
3768 case affinity_physical:
3769 if (__kmp_nThreadsPerCore > 1) {
3770 __kmp_affinity_compact = 1;
3771 if (__kmp_affinity_compact >= depth) {
3772 __kmp_affinity_compact = 0;
3773 }
3774 } else {
3775 __kmp_affinity_compact = 0;
3776 }
3777 if (__kmp_affinity_offset) {
3778 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3779 % __kmp_avail_proc;
3780 }
3781 goto sortAddresses;
3782
3783 case affinity_scatter:
3784 if (__kmp_affinity_compact >= depth) {
3785 __kmp_affinity_compact = 0;
3786 }
3787 else {
3788 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3789 }
3790 goto sortAddresses;
3791
3792 case affinity_compact:
3793 if (__kmp_affinity_compact >= depth) {
3794 __kmp_affinity_compact = depth - 1;
3795 }
3796 goto sortAddresses;
3797
Jim Cownie5e8470a2013-09-27 10:38:44 +00003798 case affinity_balanced:
Andrey Churbanovf28f6132015-01-13 14:54:00 +00003799    // Balanced affinity works only for the single-package case
Jim Cownie5e8470a2013-09-27 10:38:44 +00003800 if( nPackages > 1 ) {
3801 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3802 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3803 }
3804 __kmp_affinity_type = affinity_none;
3805 return;
3806 } else if( __kmp_affinity_uniform_topology() ) {
3807 break;
3808 } else { // Non-uniform topology
3809
3810 // Save the depth for further usage
3811 __kmp_aff_depth = depth;
3812
3813 // Number of hyper threads per core in HT machine
3814 int nth_per_core = __kmp_nThreadsPerCore;
3815
3816 int core_level;
3817 if( nth_per_core > 1 ) {
3818 core_level = depth - 2;
3819 } else {
3820 core_level = depth - 1;
3821 }
3822 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3823 int nproc = nth_per_core * ncores;
3824
3825 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3826 for( int i = 0; i < nproc; i++ ) {
3827 procarr[ i ] = -1;
3828 }
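            // procarr is a flat, core-major table: slot [core * nth_per_core + thread]
            // holds the OS proc id of that hardware thread context, or -1 if no
            // available proc maps to the slot. The loop below fills it from address2os.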
3829
3830 for( int i = 0; i < __kmp_avail_proc; i++ ) {
3831 int proc = address2os[ i ].second;
3832 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3833 // If there is only one thread per core then depth == 2: level 0 - package,
3834 // level 1 - core.
3835 int level = depth - 1;
3836
3837 // __kmp_nth_per_core == 1
3838 int thread = 0;
3839 int core = address2os[ i ].first.labels[ level ];
3840                // If the thread level exists, i.e. there is more than one thread context per core
3841 if( nth_per_core > 1 ) {
3842 thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3843 core = address2os[ i ].first.labels[ level - 1 ];
3844 }
3845 procarr[ core * nth_per_core + thread ] = proc;
3846 }
3847
3848 break;
3849 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003850
3851 sortAddresses:
3852 //
3853 // Allocate the gtid->affinity mask table.
3854 //
3855 if (__kmp_affinity_dups) {
3856 __kmp_affinity_num_masks = __kmp_avail_proc;
3857 }
3858 else {
3859 __kmp_affinity_num_masks = numUnique;
3860 }
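        // At this point __kmp_affinity_num_masks is either one mask per available
        // proc (dups allowed) or one per distinct granularity-level mask. For
        // example, with granularity=core on a 2-way SMT machine both hardware
        // threads of a core share one mask, so numUnique equals the core count.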
3861
3862# if OMP_40_ENABLED
3863 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3864 && ( __kmp_affinity_num_places > 0 )
3865 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3866 __kmp_affinity_num_masks = __kmp_affinity_num_places;
3867 }
3868# endif
3869
3870 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3871 __kmp_affinity_num_masks * __kmp_affin_mask_size);
3872
3873 //
3874 // Sort the address2os table according to the current setting of
3875 // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3876 //
3877 qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3878 __kmp_affinity_cmp_Address_child_num);
3879 {
3880 int i;
3881 unsigned j;
3882 for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3883 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3884 continue;
3885 }
3886 unsigned osId = address2os[i].second;
3887 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3888 kmp_affin_mask_t *dest
3889 = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3890 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3891 KMP_CPU_COPY(dest, src);
3892 if (++j >= __kmp_affinity_num_masks) {
3893 break;
3894 }
3895 }
3896 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3897 }
3898 break;
3899
3900 default:
3901 KMP_ASSERT2(0, "Unexpected affinity setting");
3902 }
3903
3904 __kmp_free(osId2Mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003905 machine_hierarchy.init(address2os, __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003906}
3907
3908
3909void
3910__kmp_affinity_initialize(void)
3911{
3912 //
3913    // Much of the code above was written assuming that if a machine was not
3914 // affinity capable, then __kmp_affinity_type == affinity_none. We now
3915 // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3916 //
3917 // There are too many checks for __kmp_affinity_type == affinity_none
3918 // in this code. Instead of trying to change them all, check if
3919 // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3920 // affinity_none, call the real initialization routine, then restore
3921 // __kmp_affinity_type to affinity_disabled.
3922 //
3923 int disabled = (__kmp_affinity_type == affinity_disabled);
3924 if (! KMP_AFFINITY_CAPABLE()) {
3925 KMP_ASSERT(disabled);
3926 }
3927 if (disabled) {
3928 __kmp_affinity_type = affinity_none;
3929 }
3930 __kmp_aux_affinity_initialize();
3931 if (disabled) {
3932 __kmp_affinity_type = affinity_disabled;
3933 }
3934}
3935
3936
3937void
3938__kmp_affinity_uninitialize(void)
3939{
3940 if (__kmp_affinity_masks != NULL) {
3941 __kmp_free(__kmp_affinity_masks);
3942 __kmp_affinity_masks = NULL;
3943 }
3944 if (fullMask != NULL) {
3945 KMP_CPU_FREE(fullMask);
3946 fullMask = NULL;
3947 }
3948 __kmp_affinity_num_masks = 0;
3949# if OMP_40_ENABLED
3950 __kmp_affinity_num_places = 0;
3951# endif
3952 if (__kmp_affinity_proclist != NULL) {
3953 __kmp_free(__kmp_affinity_proclist);
3954 __kmp_affinity_proclist = NULL;
3955 }
3956 if( address2os != NULL ) {
3957 __kmp_free( address2os );
3958 address2os = NULL;
3959 }
3960 if( procarr != NULL ) {
3961 __kmp_free( procarr );
3962 procarr = NULL;
3963 }
3964}
3965
3966
3967void
3968__kmp_affinity_set_init_mask(int gtid, int isa_root)
3969{
3970 if (! KMP_AFFINITY_CAPABLE()) {
3971 return;
3972 }
3973
3974 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3975 if (th->th.th_affin_mask == NULL) {
3976 KMP_CPU_ALLOC(th->th.th_affin_mask);
3977 }
3978 else {
3979 KMP_CPU_ZERO(th->th.th_affin_mask);
3980 }
3981
3982 //
3983    // Copy the thread mask to the kmp_info_t structure.
3984 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3985 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
3986 // is set, then the full mask is the same as the mask of the initialization
3987 // thread.
3988 //
3989 kmp_affin_mask_t *mask;
3990 int i;
3991
3992# if OMP_40_ENABLED
3993 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3994# endif
3995 {
Andrey Churbanovf28f6132015-01-13 14:54:00 +00003996 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
Jim Cownie5e8470a2013-09-27 10:38:44 +00003997 ) {
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003998# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00003999 if (__kmp_num_proc_groups > 1) {
4000 return;
4001 }
4002# endif
4003 KMP_ASSERT(fullMask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004004 i = KMP_PLACE_ALL;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004005 mask = fullMask;
4006 }
4007 else {
4008 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4009 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4010 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4011 }
4012 }
4013# if OMP_40_ENABLED
4014 else {
4015 if ((! isa_root)
4016 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004017# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004018 if (__kmp_num_proc_groups > 1) {
4019 return;
4020 }
4021# endif
4022 KMP_ASSERT(fullMask != NULL);
4023 i = KMP_PLACE_ALL;
4024 mask = fullMask;
4025 }
4026 else {
4027 //
4028 // int i = some hash function or just a counter that doesn't
4029 // always start at 0. Use gtid for now.
4030 //
4031 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4032 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4033 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4034 }
4035 }
4036# endif
4037
4038# if OMP_40_ENABLED
4039 th->th.th_current_place = i;
4040 if (isa_root) {
4041 th->th.th_new_place = i;
4042 th->th.th_first_place = 0;
4043 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4044 }
4045
4046 if (i == KMP_PLACE_ALL) {
4047 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4048 gtid));
4049 }
4050 else {
4051 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4052 gtid, i));
4053 }
4054# else
4055 if (i == -1) {
4056 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4057 gtid));
4058 }
4059 else {
4060 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4061 gtid, i));
4062 }
4063# endif /* OMP_40_ENABLED */
4064
4065 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4066
4067 if (__kmp_affinity_verbose) {
4068 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4069 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4070 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004071 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4072 buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004073 }
4074
4075# if KMP_OS_WINDOWS
4076 //
4077 // On Windows* OS, the process affinity mask might have changed.
4078 // If the user didn't request affinity and this call fails,
4079 // just continue silently. See CQ171393.
4080 //
4081 if ( __kmp_affinity_type == affinity_none ) {
4082 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4083 }
4084 else
4085# endif
4086 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4087}
4088
4089
4090# if OMP_40_ENABLED
4091
4092void
4093__kmp_affinity_set_place(int gtid)
4094{
4095 int retval;
4096
4097 if (! KMP_AFFINITY_CAPABLE()) {
4098 return;
4099 }
4100
4101 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4102
4103 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4104 gtid, th->th.th_new_place, th->th.th_current_place));
4105
4106 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00004107 // Check that the new place is within this thread's partition.
Jim Cownie5e8470a2013-09-27 10:38:44 +00004108 //
4109 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004110 KMP_ASSERT(th->th.th_new_place >= 0);
4111 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004112 if (th->th.th_first_place <= th->th.th_last_place) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004113 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004114 && (th->th.th_new_place <= th->th.th_last_place));
4115 }
4116 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004117 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004118 || (th->th.th_new_place >= th->th.th_last_place));
4119 }
4120
4121 //
4122    // Copy the thread mask to the kmp_info_t structure,
4123 // and set this thread's affinity.
4124 //
4125 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4126 th->th.th_new_place);
4127 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4128 th->th.th_current_place = th->th.th_new_place;
4129
4130 if (__kmp_affinity_verbose) {
4131 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4132 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4133 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004134 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4135 gtid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004136 }
4137 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4138}
4139
4140# endif /* OMP_40_ENABLED */
4141
4142
4143int
4144__kmp_aux_set_affinity(void **mask)
4145{
4146 int gtid;
4147 kmp_info_t *th;
4148 int retval;
4149
4150 if (! KMP_AFFINITY_CAPABLE()) {
4151 return -1;
4152 }
4153
4154 gtid = __kmp_entry_gtid();
4155 KA_TRACE(1000, ;{
4156 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4157 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4158 (kmp_affin_mask_t *)(*mask));
4159 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4160 gtid, buf);
4161 });
4162
4163 if (__kmp_env_consistency_check) {
4164 if ((mask == NULL) || (*mask == NULL)) {
4165 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4166 }
4167 else {
4168 unsigned proc;
4169 int num_procs = 0;
4170
4171 for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4172 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4173 continue;
4174 }
4175 num_procs++;
4176 if (! KMP_CPU_ISSET(proc, fullMask)) {
4177 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4178 break;
4179 }
4180 }
4181 if (num_procs == 0) {
4182 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4183 }
4184
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004185# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00004186 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4187 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4188 }
Andrey Churbanov7daf9802015-01-27 16:52:57 +00004189# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +00004190
4191 }
4192 }
4193
4194 th = __kmp_threads[gtid];
4195 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4196 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4197 if (retval == 0) {
4198 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4199 }
4200
4201# if OMP_40_ENABLED
4202 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4203 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4204 th->th.th_first_place = 0;
4205 th->th.th_last_place = __kmp_affinity_num_masks - 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004206
4207 //
4208    // Turn off 4.0 affinity for the current thread at this parallel level.
4209 //
4210 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004211# endif
4212
4213 return retval;
4214}
4215
4216
4217int
4218__kmp_aux_get_affinity(void **mask)
4219{
4220 int gtid;
4221 int retval;
4222 kmp_info_t *th;
4223
4224 if (! KMP_AFFINITY_CAPABLE()) {
4225 return -1;
4226 }
4227
4228 gtid = __kmp_entry_gtid();
4229 th = __kmp_threads[gtid];
4230 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4231
4232 KA_TRACE(1000, ;{
4233 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4234 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4235 th->th.th_affin_mask);
4236 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4237 });
4238
4239 if (__kmp_env_consistency_check) {
4240 if ((mask == NULL) || (*mask == NULL)) {
4241 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4242 }
4243 }
4244
4245# if !KMP_OS_WINDOWS
4246
4247 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4248 KA_TRACE(1000, ;{
4249 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4250 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4251 (kmp_affin_mask_t *)(*mask));
4252 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4253 });
4254 return retval;
4255
4256# else
4257
4258 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4259 return 0;
4260
4261# endif /* KMP_OS_WINDOWS */
4262
4263}
4264
Jim Cownie5e8470a2013-09-27 10:38:44 +00004265int
4266__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4267{
4268 int retval;
4269
4270 if (! KMP_AFFINITY_CAPABLE()) {
4271 return -1;
4272 }
4273
4274 KA_TRACE(1000, ;{
4275 int gtid = __kmp_entry_gtid();
4276 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4277 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4278 (kmp_affin_mask_t *)(*mask));
4279 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4280 proc, gtid, buf);
4281 });
4282
4283 if (__kmp_env_consistency_check) {
4284 if ((mask == NULL) || (*mask == NULL)) {
4285 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4286 }
4287 }
4288
4289 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4290 return -1;
4291 }
4292 if (! KMP_CPU_ISSET(proc, fullMask)) {
4293 return -2;
4294 }
4295
4296 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4297 return 0;
4298}
4299
4300
4301int
4302__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4303{
4304 int retval;
4305
4306 if (! KMP_AFFINITY_CAPABLE()) {
4307 return -1;
4308 }
4309
4310 KA_TRACE(1000, ;{
4311 int gtid = __kmp_entry_gtid();
4312 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4313 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4314 (kmp_affin_mask_t *)(*mask));
4315 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4316 proc, gtid, buf);
4317 });
4318
4319 if (__kmp_env_consistency_check) {
4320 if ((mask == NULL) || (*mask == NULL)) {
4321 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4322 }
4323 }
4324
4325 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4326 return -1;
4327 }
4328 if (! KMP_CPU_ISSET(proc, fullMask)) {
4329 return -2;
4330 }
4331
4332 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4333 return 0;
4334}
4335
4336
4337int
4338__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4339{
4340 int retval;
4341
4342 if (! KMP_AFFINITY_CAPABLE()) {
4343 return -1;
4344 }
4345
4346 KA_TRACE(1000, ;{
4347 int gtid = __kmp_entry_gtid();
4348 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4349 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4350 (kmp_affin_mask_t *)(*mask));
4351 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4352 proc, gtid, buf);
4353 });
4354
4355 if (__kmp_env_consistency_check) {
4356 if ((mask == NULL) || (*mask == NULL)) {
Andrey Churbanov4b2f17a2015-01-29 15:49:22 +00004357 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
Jim Cownie5e8470a2013-09-27 10:38:44 +00004358 }
4359 }
4360
4361 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4362 return 0;
4363 }
4364 if (! KMP_CPU_ISSET(proc, fullMask)) {
4365 return 0;
4366 }
4367
4368 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4369}
4370
Jim Cownie5e8470a2013-09-27 10:38:44 +00004371
4372// Dynamic affinity settings - Affinity balanced
4373void __kmp_balanced_affinity( int tid, int nthreads )
4374{
4375 if( __kmp_affinity_uniform_topology() ) {
4376 int coreID;
4377 int threadID;
4378 // Number of hyper threads per core in HT machine
4379 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4380 // Number of cores
4381 int ncores = __kmp_ncores;
4382 // How many threads will be bound to each core
4383 int chunk = nthreads / ncores;
4384 // How many cores will have an additional thread bound to it - "big cores"
4385 int big_cores = nthreads % ncores;
4386 // Number of threads on the big cores
4387 int big_nth = ( chunk + 1 ) * big_cores;
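        // Worked example: nthreads = 10 on ncores = 4 gives chunk = 2, big_cores = 2,
        // big_nth = 6, so threads 0-5 land on cores 0-1 (3 threads each) and
        // threads 6-9 land on cores 2-3 (2 threads each).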
4388 if( tid < big_nth ) {
4389 coreID = tid / (chunk + 1 );
4390 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4391 } else { //tid >= big_nth
4392 coreID = ( tid - big_cores ) / chunk;
4393 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4394 }
4395
4396 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4397 "Illegal set affinity operation when not capable");
4398
4399 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4400 KMP_CPU_ZERO(mask);
4401
4402 // Granularity == thread
4403 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4404 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4405 KMP_CPU_SET( osID, mask);
4406 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4407 for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4408 int osID;
4409 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4410 KMP_CPU_SET( osID, mask);
4411 }
4412 }
4413 if (__kmp_affinity_verbose) {
4414 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4415 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004416 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4417 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004418 }
4419 __kmp_set_system_affinity( mask, TRUE );
4420 } else { // Non-uniform topology
4421
4422 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4423 KMP_CPU_ZERO(mask);
4424
4425 // Number of hyper threads per core in HT machine
4426 int nth_per_core = __kmp_nThreadsPerCore;
4427 int core_level;
4428 if( nth_per_core > 1 ) {
4429 core_level = __kmp_aff_depth - 2;
4430 } else {
4431 core_level = __kmp_aff_depth - 1;
4432 }
4433
4434        // Number of cores - maximum value; it does not count trailing cores with 0 processors
4435 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4436
4437        // For better performance, handle the special case nthreads == __kmp_avail_proc separately
4438 if( nthreads == __kmp_avail_proc ) {
4439 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4440 int osID = address2os[ tid ].second;
4441 KMP_CPU_SET( osID, mask);
4442 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4443 int coreID = address2os[ tid ].first.labels[ core_level ];
4444            // Count the osIDs found for the current core; there can be at most nth_per_core of them.
4445            // Since address2os is sorted, we can break once cnt == nth_per_core.
4446 int cnt = 0;
4447 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4448 int osID = address2os[ i ].second;
4449 int core = address2os[ i ].first.labels[ core_level ];
4450 if( core == coreID ) {
4451 KMP_CPU_SET( osID, mask);
4452 cnt++;
4453 if( cnt == nth_per_core ) {
4454 break;
4455 }
4456 }
4457 }
4458 }
4459 } else if( nthreads <= __kmp_ncores ) {
4460
4461 int core = 0;
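            // Walk the cores in order, skipping cores with no usable context in
            // procarr; the tid-th usable core receives this thread. For
            // granularity=fine/thread only the first usable context is set in the
            // mask, otherwise every usable context on the core is set.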
4462 for( int i = 0; i < ncores; i++ ) {
4463 // Check if this core from procarr[] is in the mask
4464 int in_mask = 0;
4465 for( int j = 0; j < nth_per_core; j++ ) {
4466 if( procarr[ i * nth_per_core + j ] != - 1 ) {
4467 in_mask = 1;
4468 break;
4469 }
4470 }
4471 if( in_mask ) {
4472 if( tid == core ) {
4473 for( int j = 0; j < nth_per_core; j++ ) {
4474 int osID = procarr[ i * nth_per_core + j ];
4475 if( osID != -1 ) {
4476 KMP_CPU_SET( osID, mask );
4477 // For granularity=thread it is enough to set the first available osID for this core
4478 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4479 break;
4480 }
4481 }
4482 }
4483 break;
4484 } else {
4485 core++;
4486 }
4487 }
4488 }
4489
4490 } else { // nthreads > __kmp_ncores
4491
4492 // Array to save the number of processors at each core
4493 int nproc_at_core[ ncores ];
4494 // Array to save the number of cores with "x" available processors;
4495 int ncores_with_x_procs[ nth_per_core + 1 ];
4496 // Array to save the number of cores with # procs from x to nth_per_core
4497 int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4498
4499 for( int i = 0; i <= nth_per_core; i++ ) {
4500 ncores_with_x_procs[ i ] = 0;
4501 ncores_with_x_to_max_procs[ i ] = 0;
4502 }
4503
4504 for( int i = 0; i < ncores; i++ ) {
4505 int cnt = 0;
4506 for( int j = 0; j < nth_per_core; j++ ) {
4507 if( procarr[ i * nth_per_core + j ] != -1 ) {
4508 cnt++;
4509 }
4510 }
4511 nproc_at_core[ i ] = cnt;
4512 ncores_with_x_procs[ cnt ]++;
4513 }
4514
4515 for( int i = 0; i <= nth_per_core; i++ ) {
4516 for( int j = i; j <= nth_per_core; j++ ) {
4517 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4518 }
4519 }
4520
4521 // Max number of processors
4522 int nproc = nth_per_core * ncores;
4523            // An array keeping the number of threads assigned to each context
4524 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4525 for( int i = 0; i < nproc; i++ ) {
4526 newarr[ i ] = 0;
4527 }
4528
4529 int nth = nthreads;
4530 int flag = 0;
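            // Deal the threads out over the usable contexts: while flag == 0 each
            // free context receives at most one thread; after the first full pass
            // flag is set to 1 and later sweeps may stack additional threads onto
            // already-used contexts until all nthreads are placed.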
4531 while( nth > 0 ) {
4532 for( int j = 1; j <= nth_per_core; j++ ) {
4533 int cnt = ncores_with_x_to_max_procs[ j ];
4534 for( int i = 0; i < ncores; i++ ) {
4535 // Skip the core with 0 processors
4536 if( nproc_at_core[ i ] == 0 ) {
4537 continue;
4538 }
4539 for( int k = 0; k < nth_per_core; k++ ) {
4540 if( procarr[ i * nth_per_core + k ] != -1 ) {
4541 if( newarr[ i * nth_per_core + k ] == 0 ) {
4542 newarr[ i * nth_per_core + k ] = 1;
4543 cnt--;
4544 nth--;
4545 break;
4546 } else {
4547 if( flag != 0 ) {
4548 newarr[ i * nth_per_core + k ] ++;
4549 cnt--;
4550 nth--;
4551 break;
4552 }
4553 }
4554 }
4555 }
4556 if( cnt == 0 || nth == 0 ) {
4557 break;
4558 }
4559 }
4560 if( nth == 0 ) {
4561 break;
4562 }
4563 }
4564 flag = 1;
4565 }
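            // newarr[i] now holds how many threads were assigned to context i; the
            // running prefix sum below finds the context (and hence core) that owns
            // this tid.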
4566 int sum = 0;
4567 for( int i = 0; i < nproc; i++ ) {
4568 sum += newarr[ i ];
4569 if( sum > tid ) {
4570 // Granularity == thread
4571 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4572 int osID = procarr[ i ];
4573 KMP_CPU_SET( osID, mask);
4574 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4575 int coreID = i / nth_per_core;
4576 for( int ii = 0; ii < nth_per_core; ii++ ) {
4577 int osID = procarr[ coreID * nth_per_core + ii ];
4578 if( osID != -1 ) {
4579 KMP_CPU_SET( osID, mask);
4580 }
4581 }
4582 }
4583 break;
4584 }
4585 }
4586 __kmp_free( newarr );
4587 }
4588
4589 if (__kmp_affinity_verbose) {
4590 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4591 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004592 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4593 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004594 }
4595 __kmp_set_system_affinity( mask, TRUE );
4596 }
4597}
4598
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004599#else
4600 // affinity not supported
4601
4602kmp_uint32 mac_skipPerLevel[7];
4603kmp_uint32 mac_depth;
4604kmp_uint8 mac_leaf_kids;
4605void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4606 static int first = 1;
4607 if (first) {
4608 const kmp_uint32 maxLevels = 7;
4609 kmp_uint32 numPerLevel[maxLevels];
4610
4611 for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4612 numPerLevel[i] = 1;
4613 mac_skipPerLevel[i] = 1;
4614 }
4615
4616 mac_depth = 2;
4617 numPerLevel[0] = nproc;
4618
4619 kmp_uint32 branch = 4;
4620 if (numPerLevel[0] == 1) branch = nproc/4;
4621 if (branch<4) branch=4;
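        // Worked example: nproc = 8 leaves numPerLevel = {4, 2, 1, ...}, mac_depth = 3,
        // mac_skipPerLevel = {1, 4, 8, ...} and mac_leaf_kids = 3 after the loops
        // below, i.e. a three-level tree with 4 leaves under each bottom-level node.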
4622 for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
4623 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4624 if (numPerLevel[d] & 1) numPerLevel[d]++;
4625 numPerLevel[d] = numPerLevel[d] >> 1;
4626 if (numPerLevel[d+1] == 1) mac_depth++;
4627 numPerLevel[d+1] = numPerLevel[d+1] << 1;
4628 }
4629 if(numPerLevel[0] == 1) {
4630 branch = branch >> 1;
4631 if (branch<4) branch = 4;
4632 }
4633 }
4634
4635 for (kmp_uint32 i=1; i<mac_depth; ++i)
4636 mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
4637 mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4638 first=0;
4639 }
4640 thr_bar->depth = mac_depth;
4641 thr_bar->base_leaf_kids = mac_leaf_kids;
4642 thr_bar->skip_per_level = mac_skipPerLevel;
4643}
4644
Alp Toker763b9392014-02-28 09:42:41 +00004645#endif // KMP_AFFINITY_SUPPORTED