/*
 * kmp_affinity.cpp -- affinity management
 * $Revision: 43473 $
 * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        sprintf(scan, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    sprintf(scan, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        sprintf(scan, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        sprintf(scan, ",...");
        while (*scan != '\0') scan++;
    }
    sprintf(scan, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}
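//
// Illustrative example (derived from the code above): a mask with bits 0-3
// and 6 set prints as "{0,1,2,3,6}"; a set too large for the buffer prints
// with an elided tail, e.g. "{0,1,2,...}".
//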


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}


//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings.  I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libiomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions
// are more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */


static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
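//
// Illustrative example (not from the original source): with depth 3
// (package, core, thread) and __kmp_affinity_compact == 1, the comparator
// orders addresses by thread id first, then by package and core, so
// consecutive entries land on different cores; __kmp_affinity_compact == 0
// keeps the plain outermost-to-innermost label order.
//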

/** A structure for holding machine-specific hierarchy info to be computed once at init. */
class hierarchy_info {
public:
    /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
        etc.  We don't want to get specific with nomenclature */
    static const kmp_uint32 maxLevels=7;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_depth;
    kmp_uint32 base_num_threads;
    bool uninitialized;

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}.  All empty levels are set to 1. */
    kmp_uint32 numPerLevel[maxLevels];
    kmp_uint32 skipPerLevel[maxLevels];

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : depth(1), uninitialized(true) {}
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        uninitialized = false;
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

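        // Illustrative example: skipPerLevel[i] accumulates the product of
        // the level widths below level i, so numPerLevel = {2, 4, 4, 1, ...}
        // yields skipPerLevel = {1, 2, 8, 32, ...}.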
        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];

        base_depth = depth;
    }
};

static hierarchy_info machine_hierarchy;

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    if (machine_hierarchy.uninitialized)
        machine_hierarchy.init(NULL, nproc);

    if (nproc <= machine_hierarchy.base_num_threads)
        machine_hierarchy.depth = machine_hierarchy.base_depth;
    KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
    while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
        machine_hierarchy.depth++;
        machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
    }
    thr_bar->depth = machine_hierarchy.depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example: suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
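//
// Illustrative example: a machine modeled as 2 packages x 4 cores x 2
// threads is uniform only when all 16 logical procs are available; if a
// core is excluded from the mask, the product no longer matches
// __kmp_avail_proc and the topology is treated as non-uniform.
//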


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
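        // Illustrative example: with a 64-bit DWORD_PTR, OS proc 70 gets
        // labels {1, 6} - processor group 1, position 6 within the group.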
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}
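//
// Example: __kmp_cpuid_mask_width(4) == 2 and __kmp_cpuid_mask_width(6) == 3,
// i.e. the number of bits needed to encode ids in the range [0, count).
//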


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be 2
    //    (for a single core chip).  On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
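    // Worked example with hypothetical values: maxThreadsPerPkg = 16 and
    // maxCoresPerPkg = 8 give widthCT = 4, widthC = 3 and widthT = 1, so
    // an Apic Id of 0x1d (binary 1|110|1) decodes to pkgId = 1, coreId = 6,
    // threadId = 1.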
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;                         // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consistency checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology.  While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We
    // will try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;
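    // Illustrative example: if the loop discovered SMT at level 0, core at
    // level 1 and package at level 2 (depth 3), the inversion yields
    // threadLevel = 2, coreLevel = 1 and pkgLevel = 0, matching the
    // outermost-first label order.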

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
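        // Worked example with hypothetical values: with an SMT-level shift
        // of 1 and a core-level shift of 5, x2APIC id 0x2b (binary 1|0101|1)
        // decodes to package 1, core 5, thread 1.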
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing of incorrect topology
                    // information in case the max value for some level
                    // (maxCt[level]) is encountered earlier than some smaller
                    // value while going through the array.  For example, if
                    // pkg0 has 4 cores and pkg1 has 2 cores, then maxCt[1]
                    // ends up as 2, whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);
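    // Illustrative example: maxCt = {2, 4, 2} gives prod = 16, and the
    // topology is uniform only if totals[depth - 1] - the total number of
    // leaves - is also 16.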

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1) && (level != pkgLevel)) {
                if (level == threadLevel) {
                    threadLevel = -1;
                }
                else if ((threadLevel >= 0) && (level < threadLevel)) {
                    threadLevel--;
                }
                if (level == coreLevel) {
                    coreLevel = -1;
                }
                else if ((coreLevel >= 0) && (level < coreLevel)) {
                    coreLevel--;
                }
                if (level < pkgLevel) {
                    pkgLevel--;
                }
                continue;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}
1775
1776
1777# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1778
1779
1780#define osIdIndex 0
1781#define threadIdIndex 1
1782#define coreIdIndex 2
1783#define pkgIdIndex 3
1784#define nodeIdIndex 4
1785
1786typedef unsigned *ProcCpuInfo;
1787static unsigned maxIndex = pkgIdIndex;
1788
1789
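//
// qsort() comparators for the /proc/cpuinfo records: the first orders
// records by OS proc id; the second orders by the physical ids, from the
// most significant (node) index down to the osId tie-breaker.
//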
1790static int
1791__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1792{
1793 const unsigned *aa = (const unsigned *)a;
1794 const unsigned *bb = (const unsigned *)b;
1795 if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1796 if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1797 return 0;
1798}
1799
1800
1801static int
1802__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1803{
1804 unsigned i;
1805 const unsigned *aa = *((const unsigned **)a);
1806 const unsigned *bb = *((const unsigned **)b);
1807 for (i = maxIndex; ; i--) {
1808 if (aa[i] < bb[i]) return -1;
1809 if (aa[i] > bb[i]) return 1;
1810 if (i == osIdIndex) break;
1811 }
1812 return 0;
1813}
1814
1815
1816//
1817// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1818// affinity map.
1819//
1820static int
1821__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1822 kmp_i18n_id_t *const msg_id, FILE *f)
1823{
1824 *address2os = NULL;
1825 *msg_id = kmp_i18n_null;
1826
1827 //
1828 // Scan the file, counting the number of "processor" (osId) fields,
Alp Toker8f2d3f02014-02-24 10:40:15 +00001829 // and finding the highest value of <n> for a node_<n> field.
Jim Cownie5e8470a2013-09-27 10:38:44 +00001830 //
1831 char buf[256];
1832 unsigned num_records = 0;
1833 while (! feof(f)) {
1834 buf[sizeof(buf) - 1] = 1;
1835 if (! fgets(buf, sizeof(buf), f)) {
1836 //
1837 // Read error, presumably because of EOF
1838 //
1839 break;
1840 }
1841
1842 char s1[] = "processor";
1843 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1844 num_records++;
1845 continue;
1846 }
1847
1848 //
1849 // FIXME - this will match "node_<n> <garbage>"
1850 //
1851 unsigned level;
1852 if (sscanf(buf, "node_%u id", &level) == 1) {
1853 if (nodeIdIndex + level >= maxIndex) {
1854 maxIndex = nodeIdIndex + level;
1855 }
1856 continue;
1857 }
1858 }
1859
1860 //
1861 // Check for empty file / no valid processor records, or too many.
1862 // The number of records can't exceed the number of valid bits in the
1863 // affinity mask.
1864 //
1865 if (num_records == 0) {
1866 *line = 0;
1867 *msg_id = kmp_i18n_str_NoProcRecords;
1868 return -1;
1869 }
1870 if (num_records > (unsigned)__kmp_xproc) {
1871 *line = 0;
1872 *msg_id = kmp_i18n_str_TooManyProcRecords;
1873 return -1;
1874 }
1875
1876 //
1877 // Set the file pointer back to the beginning, so that we can scan the
1878 // file again, this time performing a full parse of the data.
1879 // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1880 // Adding an extra element at the end allows us to remove a lot of extra
1881 // checks for termination conditions.
1882 //
1883 if (fseek(f, 0, SEEK_SET) != 0) {
1884 *line = 0;
1885 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1886 return -1;
1887 }
1888
1889 //
1890 // Allocate the array of records to store the proc info in. The dummy
1891 // element at the end makes the logic in filling them out easier to code.
1892 //
1893 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1894 * sizeof(unsigned *));
1895 unsigned i;
1896 for (i = 0; i <= num_records; i++) {
1897 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1898 * sizeof(unsigned));
1899 }
1900
1901#define CLEANUP_THREAD_INFO \
1902 for (i = 0; i <= num_records; i++) { \
1903 __kmp_free(threadInfo[i]); \
1904 } \
1905 __kmp_free(threadInfo);
1906
1907 //
1908 // A value of UINT_MAX means that we didn't find the field
1909 //
1910 unsigned __index;
1911
1912#define INIT_PROC_INFO(p) \
1913 for (__index = 0; __index <= maxIndex; __index++) { \
1914 (p)[__index] = UINT_MAX; \
1915 }
1916
1917 for (i = 0; i <= num_records; i++) {
1918 INIT_PROC_INFO(threadInfo[i]);
1919 }
1920
1921 unsigned num_avail = 0;
1922 *line = 0;
1923 while (! feof(f)) {
1924 //
1925 // Create an inner scoping level, so that all the goto targets at the
1926 // end of the loop appear in an outer scoping level. This avoids
1927 // warnings about jumping past an initialization to a target in the
1928 // same block.
1929 //
1930 {
1931 buf[sizeof(buf) - 1] = 1;
1932 bool long_line = false;
1933 if (! fgets(buf, sizeof(buf), f)) {
1934 //
1935 // Read errors presumably because of EOF
1936 //
1937 // If there is valid data in threadInfo[num_avail], then fake
1938 // a blank line to ensure that the last address gets parsed.
1939 //
1940 bool valid = false;
1941 for (i = 0; i <= maxIndex; i++) {
1942 if (threadInfo[num_avail][i] != UINT_MAX) {
1943 valid = true;
1944 }
1945 }
1946 if (! valid) {
1947 break;
1948 }
1949 buf[0] = 0;
1950 } else if (!buf[sizeof(buf) - 1]) {
1951 //
1952 // The line is longer than the buffer. Set a flag and don't
1953 // emit an error if we were going to ignore the line, anyway.
1954 //
1955 long_line = true;
1956
1957#define CHECK_LINE \
1958 if (long_line) { \
1959 CLEANUP_THREAD_INFO; \
1960 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1961 return -1; \
1962 }
1963 }
1964 (*line)++;
1965
1966 char s1[] = "processor";
1967 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1968 CHECK_LINE;
1969 char *p = strchr(buf + sizeof(s1) - 1, ':');
1970 unsigned val;
1971 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1972 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1973 threadInfo[num_avail][osIdIndex] = val;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001974#if KMP_OS_LINUX && USE_SYSFS_INFO
1975 char path[256];
1976 snprintf(path, sizeof(path),
1977 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1978 threadInfo[num_avail][osIdIndex]);
1979 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1980
1981 snprintf(path, sizeof(path),
1982 "/sys/devices/system/cpu/cpu%u/topology/core_id",
1983 threadInfo[num_avail][osIdIndex]);
1984 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001985 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001986#else
Jim Cownie5e8470a2013-09-27 10:38:44 +00001987 }
1988 char s2[] = "physical id";
1989 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1990 CHECK_LINE;
1991 char *p = strchr(buf + sizeof(s2) - 1, ':');
1992 unsigned val;
1993 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1994 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1995 threadInfo[num_avail][pkgIdIndex] = val;
1996 continue;
1997 }
1998 char s3[] = "core id";
1999 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2000 CHECK_LINE;
2001 char *p = strchr(buf + sizeof(s3) - 1, ':');
2002 unsigned val;
2003 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2004 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2005 threadInfo[num_avail][coreIdIndex] = val;
2006 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002007#endif // KMP_OS_LINUX && USE_SYSFS_INFO
Jim Cownie5e8470a2013-09-27 10:38:44 +00002008 }
2009 char s4[] = "thread id";
2010 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2011 CHECK_LINE;
2012 char *p = strchr(buf + sizeof(s4) - 1, ':');
2013 unsigned val;
2014 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2015 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2016 threadInfo[num_avail][threadIdIndex] = val;
2017 continue;
2018 }
2019 unsigned level;
2020 if (sscanf(buf, "node_%u id", &level) == 1) {
2021 CHECK_LINE;
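// sizeof(s4) - 1 == strlen("node_<d> id") for a single-digit <d>, and the
// ':' can never appear earlier in the line than that, so starting the
// strchr() there is safe even though s4 is the "thread id" prefix.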
2022 char *p = strchr(buf + sizeof(s4) - 1, ':');
2023 unsigned val;
2024 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2025 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2026 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2027 threadInfo[num_avail][nodeIdIndex + level] = val;
2028 continue;
2029 }
2030
2031 //
2032 // We didn't recognize the leading token on the line.
2033 // There are lots of leading tokens that we don't recognize -
2034 // if the line isn't empty, go on to the next line.
2035 //
2036 if ((*buf != 0) && (*buf != '\n')) {
2037 //
2038 // If the line is longer than the buffer, read characters
2039 // until we find a newline.
2040 //
2041 if (long_line) {
2042 int ch;
2043 while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2044 }
2045 continue;
2046 }
2047
2048 //
2049 // A newline has signalled the end of the processor record.
2050 // Check that there aren't too many procs specified.
2051 //
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002052 if ((int)num_avail == __kmp_xproc) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002053 CLEANUP_THREAD_INFO;
2054 *msg_id = kmp_i18n_str_TooManyEntries;
2055 return -1;
2056 }
2057
2058 //
2059 // Check for missing fields. The osId field must be there, and we
2060 // currently require that the physical id field is specified, also.
2061 //
2062 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2063 CLEANUP_THREAD_INFO;
2064 *msg_id = kmp_i18n_str_MissingProcField;
2065 return -1;
2066 }
2067 if (threadInfo[num_avail][pkgIdIndex] == UINT_MAX) {
2068 CLEANUP_THREAD_INFO;
2069 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2070 return -1;
2071 }
2072
2073 //
2074 // Skip this proc if it is not included in the machine model.
2075 //
2076 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2077 INIT_PROC_INFO(threadInfo[num_avail]);
2078 continue;
2079 }
2080
2081 //
2082 // We have a successful parse of this proc's info.
2083 // Increment the counter, and prepare for the next proc.
2084 //
2085 num_avail++;
2086 KMP_ASSERT(num_avail <= num_records);
2087 INIT_PROC_INFO(threadInfo[num_avail]);
2088 }
2089 continue;
2090
2091 no_val:
2092 CLEANUP_THREAD_INFO;
2093 *msg_id = kmp_i18n_str_MissingValCpuinfo;
2094 return -1;
2095
2096 dup_field:
2097 CLEANUP_THREAD_INFO;
2098 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2099 return -1;
2100 }
2101 *line = 0;
2102
2103# if KMP_MIC && REDUCE_TEAM_SIZE
2104 unsigned teamSize = 0;
2105# endif // KMP_MIC && REDUCE_TEAM_SIZE
2106
2107 // check for num_records == __kmp_xproc ???
2108
2109 //
2110 // If there's only one thread context to bind to, form an Address object
2111 // with depth 1 and return immediately (or, if affinity is off, set
2112 // address2os to NULL and return).
2113 //
2114 // If it is configured to omit the package level when there is only a
2115 // single package, the logic at the end of this routine won't work if
2116 // there is only a single thread - it would try to form an Address
2117 // object with depth 0.
2118 //
2119 KMP_ASSERT(num_avail > 0);
2120 KMP_ASSERT(num_avail <= num_records);
2121 if (num_avail == 1) {
2122 __kmp_ncores = 1;
2123 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002124 if (__kmp_affinity_verbose) {
2125 if (! KMP_AFFINITY_CAPABLE()) {
2126 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2127 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2128 KMP_INFORM(Uniform, "KMP_AFFINITY");
2129 }
2130 else {
2131 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2132 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2133 fullMask);
2134 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2135 if (__kmp_affinity_respect_mask) {
2136 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2137 } else {
2138 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2139 }
2140 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2141 KMP_INFORM(Uniform, "KMP_AFFINITY");
2142 }
2143 int index;
2144 kmp_str_buf_t buf;
2145 __kmp_str_buf_init(&buf);
2146 __kmp_str_buf_print(&buf, "1");
2147 for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2148 __kmp_str_buf_print(&buf, " x 1");
2149 }
2150 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2151 __kmp_str_buf_free(&buf);
2152 }
2153
2154 if (__kmp_affinity_type == affinity_none) {
2155 CLEANUP_THREAD_INFO;
2156 return 0;
2157 }
2158
2159 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2160 Address addr(1);
2161 addr.labels[0] = threadInfo[0][pkgIdIndex];
2162 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2163
2164 if (__kmp_affinity_gran_levels < 0) {
2165 __kmp_affinity_gran_levels = 0;
2166 }
2167
2168 if (__kmp_affinity_verbose) {
2169 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2170 }
2171
2172 CLEANUP_THREAD_INFO;
2173 return 1;
2174 }
2175
2176 //
2177 // Sort the threadInfo table by physical Id.
2178 //
2179 qsort(threadInfo, num_avail, sizeof(*threadInfo),
2180 __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2181
2182 //
2183 // The table is now sorted by pkgId / coreId / threadId, but we really
2184 // don't know the radix of any of the fields. pkgId's may be sparsely
2185 // assigned among the chips on a system. Although coreId's are usually
2186 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2187 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2188 //
2189 // For that matter, we don't know what coresPerPkg and threadsPerCore
2190 // (or the total # packages) are at this point - we want to determine
2191 // that now. We only have an upper bound on the first two figures.
2192 //
2193 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2194 * sizeof(unsigned));
2195 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2196 * sizeof(unsigned));
2197 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2198 * sizeof(unsigned));
2199 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2200 * sizeof(unsigned));
2201
2202 bool assign_thread_ids = false;
2203 unsigned threadIdCt;
2204 unsigned index;
2205
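//
// Count the radix (number of distinct ids) at each level. If thread ids
// were listed for only some of the records, we jump back here with
// assign_thread_ids set and number the threads ourselves.
//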
2206 restart_radix_check:
2207 threadIdCt = 0;
2208
2209 //
2210 // Initialize the counter arrays with data from threadInfo[0].
2211 //
2212 if (assign_thread_ids) {
2213 if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2214 threadInfo[0][threadIdIndex] = threadIdCt++;
2215 }
2216 else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2217 threadIdCt = threadInfo[0][threadIdIndex] + 1;
2218 }
2219 }
2220 for (index = 0; index <= maxIndex; index++) {
2221 counts[index] = 1;
2222 maxCt[index] = 1;
2223 totals[index] = 1;
2224 lastId[index] = threadInfo[0][index];
2225 }
2226
2227 //
2228 // Run through the rest of the OS procs.
2229 //
2230 for (i = 1; i < num_avail; i++) {
2231 //
2232 // Find the most significant index whose id differs
2233 // from the id for the previous OS proc.
2234 //
2235 for (index = maxIndex; index >= threadIdIndex; index--) {
2236 if (assign_thread_ids && (index == threadIdIndex)) {
2237 //
2238 // Auto-assign the thread id field if it wasn't specified.
2239 //
2240 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2241 threadInfo[i][threadIdIndex] = threadIdCt++;
2242 }
2243
2244 //
2245 // Apparently the thread id field was specified for some
2246 // entries and not others. Start the thread id counter
2247 // off at the next higher thread id.
2248 //
2249 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2250 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2251 }
2252 }
2253 if (threadInfo[i][index] != lastId[index]) {
2254 //
2255 // Run through all indices which are less significant,
2256 // and reset the counts to 1.
2257 //
2258 // At all levels up to and including index, we need to
2259 // increment the totals and record the last id.
2260 //
2261 unsigned index2;
2262 for (index2 = threadIdIndex; index2 < index; index2++) {
2263 totals[index2]++;
2264 if (counts[index2] > maxCt[index2]) {
2265 maxCt[index2] = counts[index2];
2266 }
2267 counts[index2] = 1;
2268 lastId[index2] = threadInfo[i][index2];
2269 }
2270 counts[index]++;
2271 totals[index]++;
2272 lastId[index] = threadInfo[i][index];
2273
2274 if (assign_thread_ids && (index > threadIdIndex)) {
2275
2276# if KMP_MIC && REDUCE_TEAM_SIZE
2277 //
2278 // The default team size is the total #threads in the machine
2279 // minus 1 thread for every core that has 3 or more threads.
2280 //
2281 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2282# endif // KMP_MIC && REDUCE_TEAM_SIZE
2283
2284 //
2285 // Restart the thread counter, as we are on a new core.
2286 //
2287 threadIdCt = 0;
2288
2289 //
2290 // Auto-assign the thread id field if it wasn't specified.
2291 //
2292 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2293 threadInfo[i][threadIdIndex] = threadIdCt++;
2294 }
2295
2296 //
2297 // Apparently the thread id field was specified for some
2298 // entries and not others. Start the thread id counter
2299 // off at the next higher thread id.
2300 //
2301 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2302 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2303 }
2304 }
2305 break;
2306 }
2307 }
2308 if (index < threadIdIndex) {
2309 //
2310 // If thread ids were specified, it is an error if they are not
2311 // unique. Also, check that we haven't already restarted the
2312 // loop (to be safe - shouldn't need to).
2313 //
2314 if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2315 || assign_thread_ids) {
2316 __kmp_free(lastId);
2317 __kmp_free(totals);
2318 __kmp_free(maxCt);
2319 __kmp_free(counts);
2320 CLEANUP_THREAD_INFO;
2321 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2322 return -1;
2323 }
2324
2325 //
2326 // If the thread ids were not specified and we see entries
2327 // that are duplicates, start the loop over and
2328 // assign the thread ids manually.
2329 //
2330 assign_thread_ids = true;
2331 goto restart_radix_check;
2332 }
2333 }
2334
2335# if KMP_MIC && REDUCE_TEAM_SIZE
2336 //
2337 // The default team size is the total #threads in the machine
2338 // minus 1 thread for every core that has 3 or more threads.
2339 //
2340 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2341# endif // KMP_MIC && REDUCE_TEAM_SIZE
2342
2343 for (index = threadIdIndex; index <= maxIndex; index++) {
2344 if (counts[index] > maxCt[index]) {
2345 maxCt[index] = counts[index];
2346 }
2347 }
2348
2349 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2350 nCoresPerPkg = maxCt[coreIdIndex];
2351 nPackages = totals[pkgIdIndex];
2352
2353 //
2354 // Check to see if the machine topology is uniform
2355 //
2356 unsigned prod = totals[maxIndex];
2357 for (index = threadIdIndex; index < maxIndex; index++) {
2358 prod *= maxCt[index];
2359 }
2360 bool uniform = (prod == totals[threadIdIndex]);
2361
2362 //
2363 // When affinity is off, this routine will still be called to set
Andrey Churbanovf696c822015-01-27 16:55:43 +00002364 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
Jim Cownie5e8470a2013-09-27 10:38:44 +00002365 // nCoresPerPkg, & nPackages. Make sure all these vars are set
2366 // correctly, and return now if affinity is not enabled.
2367 //
Jim Cownie5e8470a2013-09-27 10:38:44 +00002368 __kmp_ncores = totals[coreIdIndex];
2369
2370 if (__kmp_affinity_verbose) {
2371 if (! KMP_AFFINITY_CAPABLE()) {
2372 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2373 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2374 if (uniform) {
2375 KMP_INFORM(Uniform, "KMP_AFFINITY");
2376 } else {
2377 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2378 }
2379 }
2380 else {
2381 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2382 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2383 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2384 if (__kmp_affinity_respect_mask) {
2385 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2386 } else {
2387 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2388 }
2389 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2390 if (uniform) {
2391 KMP_INFORM(Uniform, "KMP_AFFINITY");
2392 } else {
2393 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2394 }
2395 }
2396 kmp_str_buf_t buf;
2397 __kmp_str_buf_init(&buf);
2398
2399 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2400 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2401 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2402 }
2403 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2404 maxCt[threadIdIndex], __kmp_ncores);
2405
2406 __kmp_str_buf_free(&buf);
2407 }
2408
2409# if KMP_MIC && REDUCE_TEAM_SIZE
2410 //
2411 // Set the default team size.
2412 //
2413 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2414 __kmp_dflt_team_nth = teamSize;
2415 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2416 __kmp_dflt_team_nth));
2417 }
2418# endif // KMP_MIC && REDUCE_TEAM_SIZE
2419
2420 if (__kmp_affinity_type == affinity_none) {
2421 __kmp_free(lastId);
2422 __kmp_free(totals);
2423 __kmp_free(maxCt);
2424 __kmp_free(counts);
2425 CLEANUP_THREAD_INFO;
2426 return 0;
2427 }
2428
2429 //
2430 // Count the number of levels which have more nodes at that level than
2431 // at the parent's level (with there being an implicit root node of
2432 // the top level). This is equivalent to saying that there is at least
2433 // one node at this level which has a sibling. These levels are in the
2434 // map, and the package level is always in the map.
2435 //
2436 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2437 int level = 0;
2438 for (index = threadIdIndex; index < maxIndex; index++) {
2439 KMP_ASSERT(totals[index] >= totals[index + 1]);
2440 inMap[index] = (totals[index] > totals[index + 1]);
2441 }
2442 inMap[maxIndex] = (totals[maxIndex] > 1);
2443 inMap[pkgIdIndex] = true;
2444
2445 int depth = 0;
2446 for (index = threadIdIndex; index <= maxIndex; index++) {
2447 if (inMap[index]) {
2448 depth++;
2449 }
2450 }
2451 KMP_ASSERT(depth > 0);
2452
2453 //
2454 // Construct the data structure that is to be returned.
2455 //
2456 *address2os = (AddrUnsPair*)
2457 __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2458 int pkgLevel = -1;
2459 int coreLevel = -1;
2460 int threadLevel = -1;
2461
2462 for (i = 0; i < num_avail; ++i) {
2463 Address addr(depth);
2464 unsigned os = threadInfo[i][osIdIndex];
2465 int src_index;
2466 int dst_index = 0;
2467
2468 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2469 if (! inMap[src_index]) {
2470 continue;
2471 }
2472 addr.labels[dst_index] = threadInfo[i][src_index];
2473 if (src_index == pkgIdIndex) {
2474 pkgLevel = dst_index;
2475 }
2476 else if (src_index == coreIdIndex) {
2477 coreLevel = dst_index;
2478 }
2479 else if (src_index == threadIdIndex) {
2480 threadLevel = dst_index;
2481 }
2482 dst_index++;
2483 }
2484 (*address2os)[i] = AddrUnsPair(addr, os);
2485 }
2486
2487 if (__kmp_affinity_gran_levels < 0) {
2488 //
2489 // Set the granularity level based on what levels are modeled
2490 // in the machine topology map.
2491 //
2492 unsigned src_index;
2493 __kmp_affinity_gran_levels = 0;
2494 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2495 if (! inMap[src_index]) {
2496 continue;
2497 }
2498 switch (src_index) {
2499 case threadIdIndex:
2500 if (__kmp_affinity_gran > affinity_gran_thread) {
2501 __kmp_affinity_gran_levels++;
2502 }
2503
2504 break;
2505 case coreIdIndex:
2506 if (__kmp_affinity_gran > affinity_gran_core) {
2507 __kmp_affinity_gran_levels++;
2508 }
2509 break;
2510
2511 case pkgIdIndex:
2512 if (__kmp_affinity_gran > affinity_gran_package) {
2513 __kmp_affinity_gran_levels++;
2514 }
2515 break;
2516 }
2517 }
2518 }
2519
2520 if (__kmp_affinity_verbose) {
2521 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2522 coreLevel, threadLevel);
2523 }
2524
2525 __kmp_free(inMap);
2526 __kmp_free(lastId);
2527 __kmp_free(totals);
2528 __kmp_free(maxCt);
2529 __kmp_free(counts);
2530 CLEANUP_THREAD_INFO;
2531 return depth;
2532}
2533
2534
2535//
2536// Create and return a table of affinity masks, indexed by OS thread ID.
2537// This routine handles OR'ing together all the affinity masks of threads
2538// that are sufficiently close, if granularity > fine.
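// For example, with granularity=core on a machine with two hardware
// threads per core, both OS procs on a core end up with the same
// two-bit mask.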
2539//
2540static kmp_affin_mask_t *
2541__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2542 AddrUnsPair *address2os, unsigned numAddrs)
2543{
2544 //
2545 // First form a table of affinity masks in order of OS thread id.
2546 //
2547 unsigned depth;
2548 unsigned maxOsId;
2549 unsigned i;
2550
2551 KMP_ASSERT(numAddrs > 0);
2552 depth = address2os[0].first.depth;
2553
2554 maxOsId = 0;
2555 for (i = 0; i < numAddrs; i++) {
2556 unsigned osId = address2os[i].second;
2557 if (osId > maxOsId) {
2558 maxOsId = osId;
2559 }
2560 }
2561 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2562 (maxOsId + 1) * __kmp_affin_mask_size);
2563
2564 //
2565 // Sort the address2os table according to physical order. Doing so
2566 // will put all threads on the same core/package/node in consecutive
2567 // locations.
2568 //
2569 qsort(address2os, numAddrs, sizeof(*address2os),
2570 __kmp_affinity_cmp_Address_labels);
2571
2572 KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2573 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2574 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2575 }
2576 if (__kmp_affinity_gran_levels >= (int)depth) {
2577 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2578 && (__kmp_affinity_type != affinity_none))) {
2579 KMP_WARNING(AffThreadsMayMigrate);
2580 }
2581 }
2582
2583 //
2584 // Run through the table, forming the masks for all threads on each
2585 // core. Threads on the same core will have identical "Address"
2586 // objects, not considering the last level, which must be the thread
2587 // id. All threads on a core will appear consecutively.
2588 //
2589 unsigned unique = 0;
2590 unsigned j = 0; // index of 1st thread on core
2591 unsigned leader = 0;
2592 Address *leaderAddr = &(address2os[0].first);
2593 kmp_affin_mask_t *sum
2594 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2595 KMP_CPU_ZERO(sum);
2596 KMP_CPU_SET(address2os[0].second, sum);
2597 for (i = 1; i < numAddrs; i++) {
2598 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00002599 // If this thread is sufficiently close to the leader (within the
Jim Cownie5e8470a2013-09-27 10:38:44 +00002600 // granularity setting), then set the bit for this os thread in the
2601 // affinity mask for this group, and go on to the next thread.
2602 //
2603 if (leaderAddr->isClose(address2os[i].first,
2604 __kmp_affinity_gran_levels)) {
2605 KMP_CPU_SET(address2os[i].second, sum);
2606 continue;
2607 }
2608
2609 //
2610 // For every thread in this group, copy the mask to the thread's
2611 // entry in the osId2Mask table. Mark the first address as a
2612 // leader.
2613 //
2614 for (; j < i; j++) {
2615 unsigned osId = address2os[j].second;
2616 KMP_DEBUG_ASSERT(osId <= maxOsId);
2617 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2618 KMP_CPU_COPY(mask, sum);
2619 address2os[j].first.leader = (j == leader);
2620 }
2621 unique++;
2622
2623 //
2624 // Start a new mask.
2625 //
2626 leader = i;
2627 leaderAddr = &(address2os[i].first);
2628 KMP_CPU_ZERO(sum);
2629 KMP_CPU_SET(address2os[i].second, sum);
2630 }
2631
2632 //
2633 // For every thread in last group, copy the mask to the thread's
2634 // entry in the osId2Mask table.
2635 //
2636 for (; j < i; j++) {
2637 unsigned osId = address2os[j].second;
2638 KMP_DEBUG_ASSERT(osId <= maxOsId);
2639 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2640 KMP_CPU_COPY(mask, sum);
2641 address2os[j].first.leader = (j == leader);
2642 }
2643 unique++;
2644
2645 *maxIndex = maxOsId;
2646 *numUnique = unique;
2647 return osId2Mask;
2648}
2649
2650
2651//
2652// Stuff for the affinity proclist parsers. It's easier to declare these vars
2653// as file-static than to try to pass them through the calling sequence of
2654// the recursive-descent OMP_PLACES parser.
2655//
2656static kmp_affin_mask_t *newMasks;
2657static int numNewMasks;
2658static int nextNewMask;
2659
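//
// ADD_MASK appends a copy of _mask to the newMasks vector, doubling the
// vector's storage whenever it fills up. ADD_MASK_OSID does the same for
// the mask of a single OS proc id, warning about (and skipping) ids that
// are out of range or not in the machine model.
//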
2660#define ADD_MASK(_mask) \
2661 { \
2662 if (nextNewMask >= numNewMasks) { \
2663 numNewMasks *= 2; \
2664 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2665 numNewMasks * __kmp_affin_mask_size); \
2666 } \
2667 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2668 nextNewMask++; \
2669 }
2670
2671#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2672 { \
2673 if (((_osId) > _maxOsId) || \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002674 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
Jim Cownie5e8470a2013-09-27 10:38:44 +00002675 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2676 && (__kmp_affinity_type != affinity_none))) { \
2677 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2678 } \
2679 } \
2680 else { \
2681 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2682 } \
2683 }
2684
2685
2686//
2687// Re-parse the proclist (for the explicit affinity type), and form the list
2688// of affinity newMasks indexed by gtid.
2689//
2690static void
2691__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2692 unsigned int *out_numMasks, const char *proclist,
2693 kmp_affin_mask_t *osId2Mask, int maxOsId)
2694{
2695 const char *scan = proclist;
2696 const char *next = proclist;
2697
2698 //
2699 // We use malloc() for the temporary mask vector,
2700 // so that we can use realloc() to extend it.
2701 //
2702 numNewMasks = 2;
2703 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2704 * __kmp_affin_mask_size);
2705 nextNewMask = 0;
2706 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2707 __kmp_affin_mask_size);
2708 int setSize = 0;
2709
2710 for (;;) {
2711 int start, end, stride;
2712
2713 SKIP_WS(scan);
2714 next = scan;
2715 if (*next == '\0') {
2716 break;
2717 }
2718
2719 if (*next == '{') {
2720 int num;
2721 setSize = 0;
2722 next++; // skip '{'
2723 SKIP_WS(next);
2724 scan = next;
2725
2726 //
2727 // Read the first integer in the set.
2728 //
2729 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2730 "bad proclist");
2731 SKIP_DIGITS(next);
2732 num = __kmp_str_to_int(scan, *next);
2733 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2734
2735 //
2736 // Copy the mask for that osId to the sum (union) mask.
2737 //
2738 if ((num > maxOsId) ||
2739 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2740 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2741 && (__kmp_affinity_type != affinity_none))) {
2742 KMP_WARNING(AffIgnoreInvalidProcID, num);
2743 }
2744 KMP_CPU_ZERO(sumMask);
2745 }
2746 else {
2747 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2748 setSize = 1;
2749 }
2750
2751 for (;;) {
2752 //
2753 // Check for end of set.
2754 //
2755 SKIP_WS(next);
2756 if (*next == '}') {
2757 next++; // skip '}'
2758 break;
2759 }
2760
2761 //
2762 // Skip optional comma.
2763 //
2764 if (*next == ',') {
2765 next++;
2766 }
2767 SKIP_WS(next);
2768
2769 //
2770 // Read the next integer in the set.
2771 //
2772 scan = next;
2773 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2774 "bad explicit proc list");
2775
2776 SKIP_DIGITS(next);
2777 num = __kmp_str_to_int(scan, *next);
2778 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2779
2780 //
2781 // Add the mask for that osId to the sum mask.
2782 //
2783 if ((num > maxOsId) ||
2784 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2785 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2786 && (__kmp_affinity_type != affinity_none))) {
2787 KMP_WARNING(AffIgnoreInvalidProcID, num);
2788 }
2789 }
2790 else {
2791 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2792 setSize++;
2793 }
2794 }
2795 if (setSize > 0) {
2796 ADD_MASK(sumMask);
2797 }
2798
2799 SKIP_WS(next);
2800 if (*next == ',') {
2801 next++;
2802 }
2803 scan = next;
2804 continue;
2805 }
2806
2807 //
2808 // Read the first integer.
2809 //
2810 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2811 SKIP_DIGITS(next);
2812 start = __kmp_str_to_int(scan, *next);
2813 KMP_ASSERT2(start >= 0, "bad explicit proc list");
2814 SKIP_WS(next);
2815
2816 //
2817 // If this isn't a range, then add a mask to the list and go on.
2818 //
2819 if (*next != '-') {
2820 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2821
2822 //
2823 // Skip optional comma.
2824 //
2825 if (*next == ',') {
2826 next++;
2827 }
2828 scan = next;
2829 continue;
2830 }
2831
2832 //
2833 // This is a range. Skip over the '-' and read in the 2nd int.
2834 //
2835 next++; // skip '-'
2836 SKIP_WS(next);
2837 scan = next;
2838 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2839 SKIP_DIGITS(next);
2840 end = __kmp_str_to_int(scan, *next);
2841 KMP_ASSERT2(end >= 0, "bad explicit proc list");
2842
2843 //
2844 // Check for a stride parameter
2845 //
2846 stride = 1;
2847 SKIP_WS(next);
2848 if (*next == ':') {
2849 //
2850 // A stride is specified. Skip over the ':' and read the 3rd int.
2851 //
2852 int sign = +1;
2853 next++; // skip ':'
2854 SKIP_WS(next);
2855 scan = next;
2856 if (*next == '-') {
2857 sign = -1;
2858 next++;
2859 SKIP_WS(next);
2860 scan = next;
2861 }
2862 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2863 "bad explicit proc list");
2864 SKIP_DIGITS(next);
2865 stride = __kmp_str_to_int(scan, *next);
2866 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2867 stride *= sign;
2868 }
2869
2870 //
2871 // Do some range checks.
2872 //
2873 KMP_ASSERT2(stride != 0, "bad explicit proc list");
2874 if (stride > 0) {
2875 KMP_ASSERT2(start <= end, "bad explicit proc list");
2876 }
2877 else {
2878 KMP_ASSERT2(start >= end, "bad explicit proc list");
2879 }
2880 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2881
2882 //
2883 // Add the mask for each OS proc # to the list.
2884 //
2885 if (stride > 0) {
2886 do {
2887 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2888 start += stride;
2889 } while (start <= end);
2890 }
2891 else {
2892 do {
2893 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2894 start += stride;
2895 } while (start >= end);
2896 }
2897
2898 //
2899 // Skip optional comma.
2900 //
2901 SKIP_WS(next);
2902 if (*next == ',') {
2903 next++;
2904 }
2905 scan = next;
2906 }
2907
2908 *out_numMasks = nextNewMask;
2909 if (nextNewMask == 0) {
2910 *out_masks = NULL;
2911 KMP_INTERNAL_FREE(newMasks);
2912 return;
2913 }
2914 *out_masks
2915 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2916 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2917 __kmp_free(sumMask);
2918 KMP_INTERNAL_FREE(newMasks);
2919}
2920
2921
2922# if OMP_40_ENABLED
2923
2924/*-----------------------------------------------------------------------------
2925
2926Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2927 places. Again, here is the grammar:
2928
2929place_list := place
2930place_list := place , place_list
2931place := num
2932place := place : num
2933place := place : num : signed
2934place := { subplacelist }
2935place := ! place // (lowest priority)
2936subplace_list := subplace
2937subplace_list := subplace , subplace_list
2938subplace := num
2939subplace := num : num
2940subplace := num : num : signed
2941signed := num
2942signed := + signed
2943signed := - signed
2944
2945-----------------------------------------------------------------------------*/
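//
// For example, OMP_PLACES="{0,1},{2,3},{4,5}" names three explicit places,
// while OMP_PLACES="{0:4}:2:4" is the place {0,1,2,3} replicated twice at a
// stride of 4 OS procs, i.e. the places {0,1,2,3} and {4,5,6,7}.
//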
2946
2947static void
2948__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2949 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2950{
2951 const char *next;
2952
2953 for (;;) {
2954 int start, count, stride, i;
2955
2956 //
2957 // Read in the starting proc id
2958 //
2959 SKIP_WS(*scan);
2960 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2961 "bad explicit places list");
2962 next = *scan;
2963 SKIP_DIGITS(next);
2964 start = __kmp_str_to_int(*scan, *next);
2965 KMP_ASSERT(start >= 0);
2966 *scan = next;
2967
2968 //
2969 // valid follow sets are ',' ':' and '}'
2970 //
2971 SKIP_WS(*scan);
2972 if (**scan == '}' || **scan == ',') {
2973 if ((start > maxOsId) ||
2974 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2975 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2976 && (__kmp_affinity_type != affinity_none))) {
2977 KMP_WARNING(AffIgnoreInvalidProcID, start);
2978 }
2979 }
2980 else {
2981 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2982 (*setSize)++;
2983 }
2984 if (**scan == '}') {
2985 break;
2986 }
2987 (*scan)++; // skip ','
2988 continue;
2989 }
2990 KMP_ASSERT2(**scan == ':', "bad explicit places list");
2991 (*scan)++; // skip ':'
2992
2993 //
2994 // Read count parameter
2995 //
2996 SKIP_WS(*scan);
2997 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2998 "bad explicit places list");
2999 next = *scan;
3000 SKIP_DIGITS(next);
3001 count = __kmp_str_to_int(*scan, *next);
3002 KMP_ASSERT(count >= 0);
3003 *scan = next;
3004
3005 //
3006 // valid follow sets are ',' ':' and '}'
3007 //
3008 SKIP_WS(*scan);
3009 if (**scan == '}' || **scan == ',') {
3010 for (i = 0; i < count; i++) {
3011 if ((start > maxOsId) ||
3012 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3013 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3014 && (__kmp_affinity_type != affinity_none))) {
3015 KMP_WARNING(AffIgnoreInvalidProcID, start);
3016 }
3017 break; // don't proliferate warnings for large count
3018 }
3019 else {
3020 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3021 start++;
3022 (*setSize)++;
3023 }
3024 }
3025 if (**scan == '}') {
3026 break;
3027 }
3028 (*scan)++; // skip ','
3029 continue;
3030 }
3031 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3032 (*scan)++; // skip ':'
3033
3034 //
3035 // Read stride parameter
3036 //
3037 int sign = +1;
3038 for (;;) {
3039 SKIP_WS(*scan);
3040 if (**scan == '+') {
3041 (*scan)++; // skip '+'
3042 continue;
3043 }
3044 if (**scan == '-') {
3045 sign *= -1;
3046 (*scan)++; // skip '-'
3047 continue;
3048 }
3049 break;
3050 }
3051 SKIP_WS(*scan);
3052 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3053 "bad explicit places list");
3054 next = *scan;
3055 SKIP_DIGITS(next);
3056 stride = __kmp_str_to_int(*scan, *next);
3057 KMP_ASSERT(stride >= 0);
3058 *scan = next;
3059 stride *= sign;
3060
3061 //
3062 // valid follow sets are ',' and '}'
3063 //
3064 SKIP_WS(*scan);
3065 if (**scan == '}' || **scan == ',') {
3066 for (i = 0; i < count; i++) {
3067 if ((start > maxOsId) ||
3068 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3069 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3070 && (__kmp_affinity_type != affinity_none))) {
3071 KMP_WARNING(AffIgnoreInvalidProcID, start);
3072 }
3073 break; // don't proliferate warnings for large count
3074 }
3075 else {
3076 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3077 start += stride;
3078 (*setSize)++;
3079 }
3080 }
3081 if (**scan == '}') {
3082 break;
3083 }
3084 (*scan)++; // skip ','
3085 continue;
3086 }
3087
3088 KMP_ASSERT2(0, "bad explicit places list");
3089 }
3090}
3091
3092
3093static void
3094__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3095 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3096{
3097 const char *next;
3098
3099 //
3100 // valid follow sets are '{' '!' and num
3101 //
3102 SKIP_WS(*scan);
3103 if (**scan == '{') {
3104 (*scan)++; // skip '{'
3105 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask,
3106 setSize);
3107 KMP_ASSERT2(**scan == '}', "bad explicit places list");
3108 (*scan)++; // skip '}'
3109 }
3110 else if (**scan == '!') {
3111 (*scan)++; // skip '!' before recursing, or we would recurse on it forever
3112 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3113 KMP_CPU_COMPLEMENT(tempMask);
3114 }
3115 else if ((**scan >= '0') && (**scan <= '9')) {
3116 next = *scan;
3117 SKIP_DIGITS(next);
3118 int num = __kmp_str_to_int(*scan, *next);
3119 KMP_ASSERT(num >= 0);
3120 if ((num > maxOsId) ||
3121 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3122 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3123 && (__kmp_affinity_type != affinity_none))) {
3124 KMP_WARNING(AffIgnoreInvalidProcID, num);
3125 }
3126 }
3127 else {
3128 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3129 (*setSize)++;
3130 }
3131 *scan = next; // skip num
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003132 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003133 else {
3134 KMP_ASSERT2(0, "bad explicit places list");
3135 }
3136}
3137
3138
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003139//static void
3140void
Jim Cownie5e8470a2013-09-27 10:38:44 +00003141__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3142 unsigned int *out_numMasks, const char *placelist,
3143 kmp_affin_mask_t *osId2Mask, int maxOsId)
3144{
3145 const char *scan = placelist;
3146 const char *next = placelist;
3147
3148 numNewMasks = 2;
3149 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3150 * __kmp_affin_mask_size);
3151 nextNewMask = 0;
3152
3153 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3154 __kmp_affin_mask_size);
3155 KMP_CPU_ZERO(tempMask);
3156 int setSize = 0;
3157
3158 for (;;) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003159 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3160
3161 //
3162 // valid follow sets are ',' ':' and EOL
3163 //
3164 SKIP_WS(scan);
3165 if (*scan == '\0' || *scan == ',') {
3166 if (setSize > 0) {
3167 ADD_MASK(tempMask);
3168 }
3169 KMP_CPU_ZERO(tempMask);
3170 setSize = 0;
3171 if (*scan == '\0') {
3172 break;
3173 }
3174 scan++; // skip ','
3175 continue;
3176 }
3177
3178 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3179 scan++; // skip ':'
3180
3181 //
3182 // Read count parameter
3183 //
3184 SKIP_WS(scan);
3185 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3186 "bad explicit places list");
3187 next = scan;
3188 SKIP_DIGITS(next);
Jim Cownie181b4bb2013-12-23 17:28:57 +00003189 int count = __kmp_str_to_int(scan, *next);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003190 KMP_ASSERT(count >= 0);
3191 scan = next;
3192
3193 //
3194 // valid follow sets are ',' ':' and EOL
3195 //
3196 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003197 int stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003198 if (*scan == '\0' || *scan == ',') {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003199 stride = +1;
3200 }
3201 else {
3202 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3203 scan++; // skip ':'
Jim Cownie5e8470a2013-09-27 10:38:44 +00003204
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003205 //
3206 // Read stride parameter
3207 //
3208 int sign = +1;
3209 for (;;) {
3210 SKIP_WS(scan);
3211 if (*scan == '+') {
3212 scan++; // skip '+'
3213 continue;
3214 }
3215 if (*scan == '-') {
3216 sign *= -1;
3217 scan++; // skip '-'
3218 continue;
3219 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003220 break;
3221 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003222 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003223 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3224 "bad explicit places list");
3225 next = scan;
3226 SKIP_DIGITS(next);
3227 stride = __kmp_str_to_int(scan, *next);
3228 KMP_DEBUG_ASSERT(stride >= 0);
3229 scan = next;
3230 stride *= sign;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003231 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003232
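//
// Replicate the place <count> times, shifting the bits in tempMask by
// <stride> OS proc ids each iteration (up for a positive stride, down
// for a negative one), dropping any procs that fall outside the
// machine model.
//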
3233 if (stride > 0) {
3234 int i;
3235 for (i = 0; i < count; i++) {
3236 int j;
3237 if (setSize == 0) {
3238 break;
3239 }
3240 ADD_MASK(tempMask);
3241 setSize = 0;
3242 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003243 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3244 KMP_CPU_CLR(j, tempMask);
3245 }
3246 else if ((j > maxOsId) ||
3247 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3248 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3249 && (__kmp_affinity_type != affinity_none))) {
3250 KMP_WARNING(AffIgnoreInvalidProcID, j);
3251 }
3252 KMP_CPU_CLR(j, tempMask);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003253 }
3254 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003255 KMP_CPU_SET(j, tempMask);
3256 setSize++;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003257 }
3258 }
3259 for (; j >= 0; j--) {
3260 KMP_CPU_CLR(j, tempMask);
3261 }
3262 }
3263 }
3264 else {
3265 int i;
3266 for (i = 0; i < count; i++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003267 int j;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003268 if (setSize == 0) {
3269 break;
3270 }
3271 ADD_MASK(tempMask);
3272 setSize = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003273 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003274 j++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003275 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3276 KMP_CPU_CLR(j, tempMask);
3277 }
3278 else if ((j > maxOsId) ||
3279 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3280 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3281 && (__kmp_affinity_type != affinity_none))) {
3282 KMP_WARNING(AffIgnoreInvalidProcID, j);
3283 }
3284 KMP_CPU_CLR(j, tempMask);
3285 }
3286 else {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003287 KMP_CPU_SET(j, tempMask);
3288 setSize++;
3289 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003290 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003291 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003292 KMP_CPU_CLR(j, tempMask);
3293 }
3294 }
3295 }
3296 KMP_CPU_ZERO(tempMask);
3297 setSize = 0;
3298
3299 //
3300 // valid follow sets are ',' and EOL
3301 //
3302 SKIP_WS(scan);
3303 if (*scan == '\0') {
3304 break;
3305 }
3306 if (*scan == ',') {
3307 scan++; // skip ','
3308 continue;
3309 }
3310
3311 KMP_ASSERT2(0, "bad explicit places list");
3312 }
3313
3314 *out_numMasks = nextNewMask;
3315 if (nextNewMask == 0) {
3316 *out_masks = NULL;
3317 KMP_INTERNAL_FREE(newMasks);
3318 return;
3319 }
3320 *out_masks
3321 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3322 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3323 __kmp_free(tempMask);
3324 KMP_INTERNAL_FREE(newMasks);
3325}
3326
3327# endif /* OMP_40_ENABLED */
3328
3329#undef ADD_MASK
3330#undef ADD_MASK_OSID
3331
3332
3333# if KMP_MIC
3334
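//
// Trim the topology map according to the __kmp_place_* settings: keep only
// the requested number of cores per package (starting at the core offset)
// and HW threads per core, then correct the global counts to match.
//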
3335static void
3336__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3337{
3338 if ( __kmp_place_num_cores == 0 ) {
3339 if ( __kmp_place_num_threads_per_core == 0 ) {
3340 return; // no cores limiting actions requested, exit
3341 }
3342 __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3343 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003344 if ( !__kmp_affinity_uniform_topology() ) {
3345 KMP_WARNING( AffThrPlaceNonUniform );
3346 return; // don't support non-uniform topology
3347 }
3348 if ( depth != 3 ) {
3349 KMP_WARNING( AffThrPlaceNonThreeLevel );
3350 return; // don't support non-3-level topology
Jim Cownie5e8470a2013-09-27 10:38:44 +00003351 }
3352 if ( __kmp_place_num_threads_per_core == 0 ) {
3353 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3354 }
3355 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3356 KMP_WARNING( AffThrPlaceManyCores );
3357 return;
3358 }
3359
3360 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3361 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3362 int i, j, k, n_old = 0, n_new = 0;
3363 for ( i = 0; i < nPackages; ++i ) {
3364 for ( j = 0; j < nCoresPerPkg; ++j ) {
3365 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3366 n_old += __kmp_nThreadsPerCore; // skip not-requested core
3367 } else {
3368 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3369 if ( k < __kmp_place_num_threads_per_core ) {
3370 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core's data to new location
3371 n_new++;
3372 }
3373 n_old++;
3374 }
3375 }
3376 }
3377 }
3378 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3379 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3380 __kmp_avail_proc = n_new; // correct avail_proc
3381 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3382
3383 __kmp_free( *pAddr );
3384 *pAddr = newAddr; // replace old topology with new one
3385}
3386
3387# endif /* KMP_MIC */
3388
3389
3390static AddrUnsPair *address2os = NULL;
3391static int * procarr = NULL;
3392static int __kmp_aff_depth = 0;
3393
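//
// Top-level topology discovery: build the address2os map using the first
// method that succeeds (x2APIC ids, legacy APIC ids, /proc/cpuinfo, the
// Windows* processor groups, or a flat OS-proc map), then derive the
// per-proc affinity masks from it.
//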
3394static void
3395__kmp_aux_affinity_initialize(void)
3396{
3397 if (__kmp_affinity_masks != NULL) {
3398 KMP_ASSERT(fullMask != NULL);
3399 return;
3400 }
3401
3402 //
3403 // Create the "full" mask - this defines all of the processors that we
3404 // consider to be in the machine model. If respect is set, then it is
3405 // the initialization thread's affinity mask. Otherwise, it is all
3406 // processors that we know about on the machine.
3407 //
3408 if (fullMask == NULL) {
3409 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3410 }
3411 if (KMP_AFFINITY_CAPABLE()) {
3412 if (__kmp_affinity_respect_mask) {
3413 __kmp_get_system_affinity(fullMask, TRUE);
3414
3415 //
3416 // Count the number of available processors.
3417 //
3418 unsigned i;
3419 __kmp_avail_proc = 0;
3420 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3421 if (! KMP_CPU_ISSET(i, fullMask)) {
3422 continue;
3423 }
3424 __kmp_avail_proc++;
3425 }
3426 if (__kmp_avail_proc > __kmp_xproc) {
3427 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3428 && (__kmp_affinity_type != affinity_none))) {
3429 KMP_WARNING(ErrorInitializeAffinity);
3430 }
3431 __kmp_affinity_type = affinity_none;
3432 __kmp_affin_mask_size = 0;
3433 return;
3434 }
3435 }
3436 else {
3437 __kmp_affinity_entire_machine_mask(fullMask);
3438 __kmp_avail_proc = __kmp_xproc;
3439 }
3440 }
3441
3442 int depth = -1;
3443 kmp_i18n_id_t msg_id = kmp_i18n_null;
3444
3445 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00003446 // For backward compatibility, setting KMP_CPUINFO_FILE =>
Jim Cownie5e8470a2013-09-27 10:38:44 +00003447 // KMP_TOPOLOGY_METHOD=cpuinfo
3448 //
3449 if ((__kmp_cpuinfo_file != NULL) &&
3450 (__kmp_affinity_top_method == affinity_top_method_all)) {
3451 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3452 }
3453
3454 if (__kmp_affinity_top_method == affinity_top_method_all) {
3455 //
3456 // In the default code path, errors are not fatal - we just try using
3457 // another method. We only emit a warning message if affinity is on,
3458 // or the verbose flag is set, and the nowarnings flag was not set.
3459 //
3460 const char *file_name = NULL;
3461 int line = 0;
3462
3463# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3464
3465 if (__kmp_affinity_verbose) {
3466 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3467 }
3468
3469 file_name = NULL;
3470 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3471 if (depth == 0) {
3472 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3473 KMP_ASSERT(address2os == NULL);
3474 return;
3475 }
3476
3477 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003478 if (__kmp_affinity_verbose) {
3479 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003480 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3481 KMP_I18N_STR(DecodingLegacyAPIC));
3482 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003483 else {
3484 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3485 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003486 }
3487
3488 file_name = NULL;
3489 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3490 if (depth == 0) {
3491 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3492 KMP_ASSERT(address2os == NULL);
3493 return;
3494 }
3495 }
3496
3497# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3498
3499# if KMP_OS_LINUX
3500
3501 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003502 if (__kmp_affinity_verbose) {
3503 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003504 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3505 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003506 else {
3507 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3508 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003509 }
3510
3511 FILE *f = fopen("/proc/cpuinfo", "r");
3512 if (f == NULL) {
3513 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3514 }
3515 else {
3516 file_name = "/proc/cpuinfo";
3517 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3518 fclose(f);
3519 if (depth == 0) {
3520 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3521 KMP_ASSERT(address2os == NULL);
3522 return;
3523 }
3524 }
3525 }
3526
3527# endif /* KMP_OS_LINUX */
3528
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003529# if KMP_GROUP_AFFINITY
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003530
3531 if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3532 if (__kmp_affinity_verbose) {
3533 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3534 }
3535
3536 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3537 KMP_ASSERT(depth != 0);
3538 }
3539
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003540# endif /* KMP_GROUP_AFFINITY */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003541
Jim Cownie5e8470a2013-09-27 10:38:44 +00003542 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003543 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003544 if (file_name == NULL) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003545 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003546 }
3547 else if (line == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003548 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003549 }
3550 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003551 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003552 }
3553 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003554 // FIXME - print msg if msg_id = kmp_i18n_null ???
Jim Cownie5e8470a2013-09-27 10:38:44 +00003555
3556 file_name = "";
3557 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3558 if (depth == 0) {
3559 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3560 KMP_ASSERT(address2os == NULL);
3561 return;
3562 }
3563 KMP_ASSERT(depth > 0);
3564 KMP_ASSERT(address2os != NULL);
3565 }
3566 }
3567
3568 //
3569 // If the user has specified that a particular topology discovery method
3570 // is to be used, then we abort if that method fails. The exception is
3571 // group affinity, which might have been implicitly set.
3572 //
3573
3574# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3575
3576 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3577 if (__kmp_affinity_verbose) {
3578 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3579 KMP_I18N_STR(Decodingx2APIC));
3580 }
3581
3582 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3583 if (depth == 0) {
3584 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3585 KMP_ASSERT(address2os == NULL);
3586 return;
3587 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003588 if (depth < 0) {
3589 KMP_ASSERT(msg_id != kmp_i18n_null);
3590 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3591 }
3592 }
3593 else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3594 if (__kmp_affinity_verbose) {
3595 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3596 KMP_I18N_STR(DecodingLegacyAPIC));
3597 }
3598
3599 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3600 if (depth == 0) {
3601 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3602 KMP_ASSERT(address2os == NULL);
3603 return;
3604 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003605 if (depth < 0) {
3606 KMP_ASSERT(msg_id != kmp_i18n_null);
3607 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3608 }
3609 }
3610
3611# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3612
3613 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3614 const char *filename;
3615 if (__kmp_cpuinfo_file != NULL) {
3616 filename = __kmp_cpuinfo_file;
3617 }
3618 else {
3619 filename = "/proc/cpuinfo";
3620 }
3621
3622 if (__kmp_affinity_verbose) {
3623 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3624 }
3625
3626 FILE *f = fopen(filename, "r");
3627 if (f == NULL) {
3628 int code = errno;
3629 if (__kmp_cpuinfo_file != NULL) {
3630 __kmp_msg(
3631 kmp_ms_fatal,
3632 KMP_MSG(CantOpenFileForReading, filename),
3633 KMP_ERR(code),
3634 KMP_HNT(NameComesFrom_CPUINFO_FILE),
3635 __kmp_msg_null
3636 );
3637 }
3638 else {
3639 __kmp_msg(
3640 kmp_ms_fatal,
3641 KMP_MSG(CantOpenFileForReading, filename),
3642 KMP_ERR(code),
3643 __kmp_msg_null
3644 );
3645 }
3646 }
3647 int line = 0;
3648 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3649 fclose(f);
3650 if (depth < 0) {
3651 KMP_ASSERT(msg_id != kmp_i18n_null);
3652 if (line > 0) {
3653 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3654 }
3655 else {
3656 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3657 }
3658 }
3659 if (__kmp_affinity_type == affinity_none) {
3660 KMP_ASSERT(depth == 0);
3661 KMP_ASSERT(address2os == NULL);
3662 return;
3663 }
3664 }
3665
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003666# if KMP_GROUP_AFFINITY
Jim Cownie5e8470a2013-09-27 10:38:44 +00003667
3668 else if (__kmp_affinity_top_method == affinity_top_method_group) {
3669 if (__kmp_affinity_verbose) {
3670 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3671 }
3672
3673 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3674 KMP_ASSERT(depth != 0);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003675 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003676 KMP_ASSERT(msg_id != kmp_i18n_null);
3677 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003678 }
3679 }
3680
Andrey Churbanov7daf9802015-01-27 16:52:57 +00003681# endif /* KMP_GROUP_AFFINITY */
Jim Cownie5e8470a2013-09-27 10:38:44 +00003682
3683 else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3684 if (__kmp_affinity_verbose) {
3685 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3686 }
3687
3688 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3689 if (depth == 0) {
3690 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3691 KMP_ASSERT(address2os == NULL);
3692 return;
3693 }
3694 // should not fail
3695 KMP_ASSERT(depth > 0);
3696 KMP_ASSERT(address2os != NULL);
3697 }
3698
3699 if (address2os == NULL) {
3700 if (KMP_AFFINITY_CAPABLE()
3701 && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3702 && (__kmp_affinity_type != affinity_none)))) {
3703 KMP_WARNING(ErrorInitializeAffinity);
3704 }
3705 __kmp_affinity_type = affinity_none;
3706 __kmp_affin_mask_size = 0;
3707 return;
3708 }
3709
3710# if KMP_MIC
3711 __kmp_apply_thread_places(&address2os, depth);
3712# endif
3713
3714 //
3715 // Create the table of masks, indexed by thread Id.
3716 //
3717 unsigned maxIndex;
3718 unsigned numUnique;
3719 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3720 address2os, __kmp_avail_proc);
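    // numUnique is the number of distinct masks produced at the chosen
    // granularity. For example (assumed machine): with granularity == core on
    // a 2-way SMT system, each mask covers two OS procs, so numUnique would
    // be __kmp_avail_proc / 2; at the finest granularity every OS proc gets
    // its own mask, which is what the assertion below checks.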
    if (__kmp_affinity_gran_levels == 0) {
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects. This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    switch (__kmp_affinity_type) {

    case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

    //
    // The other affinity types rely on sorting the Addresses according
    // to some permutation of the machine topology tree. Set
    // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
    // then jump to a common code fragment to do the sort and create
    // the array of affinity masks.
    //

    case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
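        // The user offset is scaled by the number of hardware threads per
        // core and wrapped modulo the machine size; e.g. (assumed values)
        // with __kmp_nThreadsPerCore == 2 and offset == 3, the effective
        // offset becomes 6 % __kmp_avail_proc, so it always lands inside
        // the machine.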
        goto sortAddresses;

    case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

    case affinity_scatter:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

    case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

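    // Illustration of the scatter/compact mapping above: on a depth-3 tree
    // (package, core, thread), compact with permute value 0 packs
    // consecutive threads onto neighboring thread contexts, while scatter
    // with permute value 0 becomes compact = depth - 1 - 0 = 2 internally,
    // spreading consecutive threads across packages first.
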
    case affinity_balanced:
        // Balanced works only for the case of a single package
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyperthreads per core in an HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

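            // procarr[] now maps each (core, thread-context) slot to the OS
            // proc id bound there, or -1 where the non-uniform machine has no
            // processor. A hypothetical layout with nth_per_core == 2 and 3
            // cores, one of which lost a context, would be
            // { 0, 1, 2, -1, 3, 4 }.
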
            break;
        }

    sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

    default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}


void
__kmp_affinity_initialize(void)
{
    //
    // Much of the code above was written assuming that if a machine was not
    // affinity capable, then __kmp_affinity_type == affinity_none. We now
    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
    //
    // There are too many checks for __kmp_affinity_type == affinity_none
    // in this code. Instead of trying to change them all, check if
    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
    // affinity_none, call the real initialization routine, then restore
    // __kmp_affinity_type to affinity_disabled.
    //
    int disabled = (__kmp_affinity_type == affinity_disabled);
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(disabled);
    }
    if (disabled) {
        __kmp_affinity_type = affinity_none;
    }
    __kmp_aux_affinity_initialize();
    if (disabled) {
        __kmp_affinity_type = affinity_disabled;
    }
}


void
__kmp_affinity_uninitialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        __kmp_free(__kmp_affinity_masks);
        __kmp_affinity_masks = NULL;
    }
    if (fullMask != NULL) {
        KMP_CPU_FREE(fullMask);
        fullMask = NULL;
    }
    __kmp_affinity_num_masks = 0;
# if OMP_40_ENABLED
    __kmp_affinity_num_places = 0;
# endif
    if (__kmp_affinity_proclist != NULL) {
        __kmp_free(__kmp_affinity_proclist);
        __kmp_affinity_proclist = NULL;
    }
    if( address2os != NULL ) {
        __kmp_free( address2os );
        address2os = NULL;
    }
    if( procarr != NULL ) {
        __kmp_free( procarr );
        procarr = NULL;
    }
}


void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
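            // Round-robin the thread over the place list; e.g. (assumed
            // values) with __kmp_affinity_num_masks == 8 and
            // __kmp_affinity_offset == 2, gtid 0 gets mask 2 and gtid 6
            // wraps around to mask 0.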
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0. Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently. See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyperthreads per core in an HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

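        // Worked example (assumed machine): nthreads == 10 on 4 cores gives
        // chunk == 2, big_cores == 2, big_nth == 6; tids 0-5 land on the two
        // "big" cores (3 threads each), tids 6-9 on the remaining two cores
        // (2 threads each).
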
        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyperthreads per core in an HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For a performance gain, consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be no more
                // than nth_per_core of them, and since address2os is sorted we can
                // stop when cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first
                                // available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors;
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

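            // Deal the nthreads out one per available hardware context first
            // (flag == 0 permits only 0 -> 1 transitions in newarr); once
            // every context holds a thread, later passes (flag != 0) stack
            // the remaining threads onto the contexts round-robin.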
            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ]++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

#else
    // affinity not supported

kmp_uint32 mac_skipPerLevel[7];
kmp_uint32 mac_depth;
kmp_uint8 mac_leaf_kids;
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    static int first = 1;
    if (first) {
        const kmp_uint32 maxLevels = 7;
        kmp_uint32 numPerLevel[maxLevels];

        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            mac_skipPerLevel[i] = 1;
        }

        mac_depth = 2;
        numPerLevel[0] = nproc;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = nproc/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) mac_depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

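        // skip_per_level[i] is the stride, in leaves, between siblings at
        // level i: skip_per_level[0] == 1, and each higher level multiplies
        // by the fan-out of the level below it. E.g. (assumed values)
        // numPerLevel == {4, 2} yields skipPerLevel == {1, 4}.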
        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
        first=0;
    }
    thr_bar->depth = mac_depth;
    thr_bar->base_leaf_kids = mac_leaf_kids;
    thr_bar->skip_per_level = mac_skipPerLevel;
}

#endif // KMP_AFFINITY_SUPPORTED