/*
 * kmp_affinity.cpp -- affinity management
 * $Revision: 43473 $
 * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        sprintf(scan, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    sprintf(scan, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow. A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        sprintf(scan, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        sprintf(scan, ",...");
        while (*scan != '\0') scan++;
    }
    sprintf(scan, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}
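
//
// Illustrative note (added; not part of the original sources): with
// buf_len >= 40, a mask with bits 0-3 and 8 set would be printed as
// "{0,1,2,3,8}", an empty mask as "{<empty>}", and a mask with more
// elements than fit in the buffer would be truncated to end in ",...}".
//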


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}


//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings. I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libiomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions
// are more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */


static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
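
//
// Illustrative note (added; not part of the original sources): with a
// depth-3 map of {package, core, thread} and __kmp_affinity_compact == 1,
// the routine above compares childNums[2] (thread) first, then
// childNums[0] (package), then childNums[1] (core) - i.e. the innermost
// __kmp_affinity_compact levels take precedence over the outer ones when
// the table is re-sorted.
//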

/** A structure for holding machine-specific hierarchy info to be computed once at init. */
class hierarchy_info {
public:
    /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
        etc. We don't want to get specific with nomenclature */
    static const kmp_uint32 maxLevels=7;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_depth;
    kmp_uint32 base_num_threads;
    bool uninitialized;

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 numPerLevel[maxLevels];
    kmp_uint32 skipPerLevel[maxLevels];

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : depth(1), uninitialized(true) {}
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        uninitialized = false;
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if (numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];

        base_depth = depth;
    }
};
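
//
// Worked example (added for illustration; derived from the numPerLevel
// comment above, not from the original sources): for the 4-package x
// 4-core x 2-thread machine described there, init() ends up with
// numPerLevel = {2, 4, 4, 1, 1, 1, 1}, depth = 4, and skipPerLevel =
// {1, 2, 8, 32} for the first depth entries (the rest stay 1), i.e.
// skipPerLevel[i] is the number of leaves (hardware threads) spanned by
// one node at level i.
//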

static hierarchy_info machine_hierarchy;

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    if (machine_hierarchy.uninitialized)
        machine_hierarchy.init(NULL, nproc);

    if (nproc <= machine_hierarchy.base_num_threads)
        machine_hierarchy.depth = machine_hierarchy.base_depth;
    KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
    while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
        machine_hierarchy.depth++;
        machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
    }
    thr_bar->depth = machine_hierarchy.depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread. They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
int __kmp_nThreadsPerCore;

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
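
//
// Illustrative note (added; not part of the original sources): on a machine
// modeled as 2 packages x 4 cores x 2 hardware threads the product is 16;
// if all 16 OS procs are available the topology is reported as uniform,
// whereas restricting the process to, say, 14 of them makes it non-uniform.
//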


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    __kmp_ht_enabled = FALSE;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}
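
//
// Illustrative note (added; not part of the original sources):
// __kmp_cpuid_mask_width(6) returns 3, since 2^3 = 8 is the smallest power
// of two >= 6, i.e. 3 bits of the Apic Id are needed to encode 6 distinct
// logical processors per package.
//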


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

# if KMP_MIC
    {
        // The code below will use cpuid(4).
        // Check if cpuid(4) is supported.
        // FIXME? - this really doesn't need to be specific to MIC.
        kmp_cpuid buf;
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax < 4) {
            *msg_id = kmp_i18n_str_NoLeaf4Support;
            return -1;
        }
    }
# endif // KMP_MIC

    //
    // Even if __kmp_affinity_type == affinity_none, this routine is still
    // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(),
    // then we need to do something else.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off. We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id. It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact. In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip). On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id. The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
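    //
    // Worked example (added for illustration; not from the original sources):
    // suppose cpuid(1) reports maxThreadsPerPkg = 16 and cpuid(4) reports
    // maxCoresPerPkg = 8. Then widthCT = 4 and widthC = 3, so widthT = 1.
    // For an Apic Id of 0x53 (binary 101 0011):
    //     pkgId    = 0x53 >> 4       = 5
    //     coreId   = (0x53 >> 1) & 7 = 1
    //     threadId = 0x53 & 1        = 1
    //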
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields. pkgId's may be sparsely
    // assigned among the chips on a system. Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now. We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;                         // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consistency checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks. Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology. While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
    // try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest. The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(),
    // then we need to do something else - use the defaults that we calculated
    // from issuing cpuid without binding to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
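        //
        // Worked example (added for illustration; not from the original
        // sources): if cpuid(11, 0) reports a shift of 1 (SMT width) and
        // cpuid(11, 1) reports a shift of 5 (SMT + core width), then for an
        // x2APIC id of 0x2B the loop above stores
        //     thread  = 0x2B & 0x1         = 1
        //     core    = (0x2B & 0x1F) >> 1 = 5
        //     package = 0x2B >> 5          = 1
        // into labels[depth-1], labels[depth-2], and labels[depth-3].
        //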
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing of incorrect topology information
                    // in case the max value for some level (maxCt[level]) is encountered
                    // earlier than some smaller value while going through the array.
                    // For example, let pkg0 have 4 cores and pkg1 have 2 cores.  Then
                    // maxCt[1] == 2, whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);

    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1) && (level != pkgLevel)) {
                if (level == threadLevel) {
                    threadLevel = -1;
                }
                else if ((threadLevel >= 0) && (level < threadLevel)) {
                    threadLevel--;
                }
                if (level == coreLevel) {
                    coreLevel = -1;
                }
                else if ((coreLevel >= 0) && (level < coreLevel)) {
                    coreLevel--;
                }
                if (level < pkgLevel) {
                    pkgLevel--;
                }
                continue;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}


# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */


#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4
1798
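//
// Each cpuinfo record is stored as an array of unsigned fields indexed by
// the constants above. Any node_<n> fields occupy indices nodeIdIndex and
// above, and maxIndex tracks the highest index actually in use.
//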
1799typedef unsigned *ProcCpuInfo;
1800static unsigned maxIndex = pkgIdIndex;
1801
1802
1803static int
1804__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1805{
1806 const unsigned *aa = (const unsigned *)a;
1807 const unsigned *bb = (const unsigned *)b;
1808 if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1809 if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1810 return 0;
}
1812
1813
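//
// Compare two cpuinfo records field by field, most significant level first
// (node levels, then package, core, and thread id), ending with the OS
// proc id.
//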
1814static int
1815__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1816{
1817 unsigned i;
1818 const unsigned *aa = *((const unsigned **)a);
1819 const unsigned *bb = *((const unsigned **)b);
1820 for (i = maxIndex; ; i--) {
1821 if (aa[i] < bb[i]) return -1;
1822 if (aa[i] > bb[i]) return 1;
1823 if (i == osIdIndex) break;
1824 }
1825 return 0;
1826}
1827
1828
1829//
1830// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1831// affinity map.
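//
// The scanner below recognizes the "processor", "physical id", "core id",
// "thread id", and "node_<n> id" fields, and a blank line terminates each
// record. A simplified, illustrative record might look like:
//
//   processor   : 0
//   physical id : 0
//   core id     : 0
//   thread id   : 0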
1832//
1833static int
1834__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1835 kmp_i18n_id_t *const msg_id, FILE *f)
1836{
1837 *address2os = NULL;
1838 *msg_id = kmp_i18n_null;
1839
1840 //
    // Scan the file once, and count the number of "processor" (osId) fields,
Alp Toker8f2d3f02014-02-24 10:40:15 +00001842 // and find the highest value of <n> for a node_<n> field.
Jim Cownie5e8470a2013-09-27 10:38:44 +00001843 //
1844 char buf[256];
1845 unsigned num_records = 0;
1846 while (! feof(f)) {
1847 buf[sizeof(buf) - 1] = 1;
1848 if (! fgets(buf, sizeof(buf), f)) {
1849 //
1850 // Read errors presumably because of EOF
1851 //
1852 break;
1853 }
1854
1855 char s1[] = "processor";
1856 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1857 num_records++;
1858 continue;
1859 }
1860
1861 //
1862 // FIXME - this will match "node_<n> <garbage>"
1863 //
1864 unsigned level;
1865 if (sscanf(buf, "node_%d id", &level) == 1) {
1866 if (nodeIdIndex + level >= maxIndex) {
1867 maxIndex = nodeIdIndex + level;
1868 }
1869 continue;
1870 }
1871 }
1872
1873 //
1874 // Check for empty file / no valid processor records, or too many.
1875 // The number of records can't exceed the number of valid bits in the
1876 // affinity mask.
1877 //
1878 if (num_records == 0) {
1879 *line = 0;
1880 *msg_id = kmp_i18n_str_NoProcRecords;
1881 return -1;
1882 }
1883 if (num_records > (unsigned)__kmp_xproc) {
1884 *line = 0;
1885 *msg_id = kmp_i18n_str_TooManyProcRecords;
1886 return -1;
1887 }
1888
1889 //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1893 // Adding an extra element at the end allows us to remove a lot of extra
1894 // checks for termination conditions.
1895 //
1896 if (fseek(f, 0, SEEK_SET) != 0) {
1897 *line = 0;
1898 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1899 return -1;
1900 }
1901
1902 //
1903 // Allocate the array of records to store the proc info in. The dummy
1904 // element at the end makes the logic in filling them out easier to code.
1905 //
1906 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1907 * sizeof(unsigned *));
1908 unsigned i;
1909 for (i = 0; i <= num_records; i++) {
1910 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1911 * sizeof(unsigned));
1912 }
1913
1914#define CLEANUP_THREAD_INFO \
1915 for (i = 0; i <= num_records; i++) { \
1916 __kmp_free(threadInfo[i]); \
1917 } \
1918 __kmp_free(threadInfo);
1919
1920 //
1921 // A value of UINT_MAX means that we didn't find the field
1922 //
1923 unsigned __index;
1924
1925#define INIT_PROC_INFO(p) \
1926 for (__index = 0; __index <= maxIndex; __index++) { \
1927 (p)[__index] = UINT_MAX; \
1928 }
1929
1930 for (i = 0; i <= num_records; i++) {
1931 INIT_PROC_INFO(threadInfo[i]);
1932 }
1933
1934 unsigned num_avail = 0;
1935 *line = 0;
1936 while (! feof(f)) {
1937 //
1938 // Create an inner scoping level, so that all the goto targets at the
1939 // end of the loop appear in an outer scoping level. This avoids
1940 // warnings about jumping past an initialization to a target in the
1941 // same block.
1942 //
1943 {
1944 buf[sizeof(buf) - 1] = 1;
1945 bool long_line = false;
1946 if (! fgets(buf, sizeof(buf), f)) {
1947 //
1948 // Read errors presumably because of EOF
1949 //
1950 // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
1952 //
1953 bool valid = false;
1954 for (i = 0; i <= maxIndex; i++) {
1955 if (threadInfo[num_avail][i] != UINT_MAX) {
1956 valid = true;
1957 }
1958 }
1959 if (! valid) {
1960 break;
1961 }
1962 buf[0] = 0;
1963 } else if (!buf[sizeof(buf) - 1]) {
1964 //
1965 // The line is longer than the buffer. Set a flag and don't
1966 // emit an error if we were going to ignore the line, anyway.
1967 //
1968 long_line = true;
1969
1970#define CHECK_LINE \
1971 if (long_line) { \
1972 CLEANUP_THREAD_INFO; \
1973 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1974 return -1; \
1975 }
1976 }
1977 (*line)++;
1978
1979 char s1[] = "processor";
1980 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1981 CHECK_LINE;
1982 char *p = strchr(buf + sizeof(s1) - 1, ':');
1983 unsigned val;
1984 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1985 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1986 threadInfo[num_avail][osIdIndex] = val;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001987#if KMP_OS_LINUX && USE_SYSFS_INFO
1988 char path[256];
1989 snprintf(path, sizeof(path),
1990 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1991 threadInfo[num_avail][osIdIndex]);
1992 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1993
1994 snprintf(path, sizeof(path),
1995 "/sys/devices/system/cpu/cpu%u/topology/core_id",
1996 threadInfo[num_avail][osIdIndex]);
1997 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001998 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001999#else
Jim Cownie5e8470a2013-09-27 10:38:44 +00002000 }
2001 char s2[] = "physical id";
2002 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2003 CHECK_LINE;
2004 char *p = strchr(buf + sizeof(s2) - 1, ':');
2005 unsigned val;
2006 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2007 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2008 threadInfo[num_avail][pkgIdIndex] = val;
2009 continue;
2010 }
2011 char s3[] = "core id";
2012 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2013 CHECK_LINE;
2014 char *p = strchr(buf + sizeof(s3) - 1, ':');
2015 unsigned val;
2016 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2017 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2018 threadInfo[num_avail][coreIdIndex] = val;
2019 continue;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002020#endif // KMP_OS_LINUX && USE_SYSFS_INFO
Jim Cownie5e8470a2013-09-27 10:38:44 +00002021 }
2022 char s4[] = "thread id";
2023 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2024 CHECK_LINE;
2025 char *p = strchr(buf + sizeof(s4) - 1, ':');
2026 unsigned val;
2027 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2028 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2029 threadInfo[num_avail][threadIdIndex] = val;
2030 continue;
2031 }
2032 unsigned level;
2033 if (sscanf(buf, "node_%d id", &level) == 1) {
2034 CHECK_LINE;
2035 char *p = strchr(buf + sizeof(s4) - 1, ':');
2036 unsigned val;
2037 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2038 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2039 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2040 threadInfo[num_avail][nodeIdIndex + level] = val;
2041 continue;
2042 }
2043
2044 //
2045 // We didn't recognize the leading token on the line.
2046 // There are lots of leading tokens that we don't recognize -
2047 // if the line isn't empty, go on to the next line.
2048 //
2049 if ((*buf != 0) && (*buf != '\n')) {
2050 //
2051 // If the line is longer than the buffer, read characters
2052 // until we find a newline.
2053 //
2054 if (long_line) {
2055 int ch;
2056 while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2057 }
2058 continue;
2059 }
2060
2061 //
2062 // A newline has signalled the end of the processor record.
2063 // Check that there aren't too many procs specified.
2064 //
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002065 if ((int)num_avail == __kmp_xproc) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002066 CLEANUP_THREAD_INFO;
2067 *msg_id = kmp_i18n_str_TooManyEntries;
2068 return -1;
2069 }
2070
2071 //
2072 // Check for missing fields. The osId field must be there, and we
2073 // currently require that the physical id field is specified, also.
2074 //
2075 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2076 CLEANUP_THREAD_INFO;
2077 *msg_id = kmp_i18n_str_MissingProcField;
2078 return -1;
2079 }
2080 if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2081 CLEANUP_THREAD_INFO;
2082 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2083 return -1;
2084 }
2085
2086 //
2087 // Skip this proc if it is not included in the machine model.
2088 //
2089 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2090 INIT_PROC_INFO(threadInfo[num_avail]);
2091 continue;
2092 }
2093
2094 //
2095 // We have a successful parse of this proc's info.
2096 // Increment the counter, and prepare for the next proc.
2097 //
2098 num_avail++;
2099 KMP_ASSERT(num_avail <= num_records);
2100 INIT_PROC_INFO(threadInfo[num_avail]);
2101 }
2102 continue;
2103
2104 no_val:
2105 CLEANUP_THREAD_INFO;
2106 *msg_id = kmp_i18n_str_MissingValCpuinfo;
2107 return -1;
2108
2109 dup_field:
2110 CLEANUP_THREAD_INFO;
2111 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2112 return -1;
2113 }
2114 *line = 0;
2115
2116# if KMP_MIC && REDUCE_TEAM_SIZE
2117 unsigned teamSize = 0;
2118# endif // KMP_MIC && REDUCE_TEAM_SIZE
2119
2120 // check for num_records == __kmp_xproc ???
2121
2122 //
2123 // If there's only one thread context to bind to, form an Address object
2124 // with depth 1 and return immediately (or, if affinity is off, set
2125 // address2os to NULL and return).
2126 //
2127 // If it is configured to omit the package level when there is only a
2128 // single package, the logic at the end of this routine won't work if
2129 // there is only a single thread - it would try to form an Address
2130 // object with depth 0.
2131 //
2132 KMP_ASSERT(num_avail > 0);
2133 KMP_ASSERT(num_avail <= num_records);
2134 if (num_avail == 1) {
2135 __kmp_ncores = 1;
2136 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2137 __kmp_ht_enabled = FALSE;
2138 if (__kmp_affinity_verbose) {
2139 if (! KMP_AFFINITY_CAPABLE()) {
2140 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2141 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2142 KMP_INFORM(Uniform, "KMP_AFFINITY");
2143 }
2144 else {
2145 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2146 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2147 fullMask);
2148 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2149 if (__kmp_affinity_respect_mask) {
2150 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2151 } else {
2152 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2153 }
2154 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2155 KMP_INFORM(Uniform, "KMP_AFFINITY");
2156 }
2157 int index;
2158 kmp_str_buf_t buf;
2159 __kmp_str_buf_init(&buf);
2160 __kmp_str_buf_print(&buf, "1");
2161 for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2162 __kmp_str_buf_print(&buf, " x 1");
2163 }
2164 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2165 __kmp_str_buf_free(&buf);
2166 }
2167
2168 if (__kmp_affinity_type == affinity_none) {
2169 CLEANUP_THREAD_INFO;
2170 return 0;
2171 }
2172
2173 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2174 Address addr(1);
2175 addr.labels[0] = threadInfo[0][pkgIdIndex];
2176 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2177
2178 if (__kmp_affinity_gran_levels < 0) {
2179 __kmp_affinity_gran_levels = 0;
2180 }
2181
2182 if (__kmp_affinity_verbose) {
2183 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2184 }
2185
2186 CLEANUP_THREAD_INFO;
2187 return 1;
2188 }
2189
2190 //
2191 // Sort the threadInfo table by physical Id.
2192 //
2193 qsort(threadInfo, num_avail, sizeof(*threadInfo),
2194 __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2195
2196 //
2197 // The table is now sorted by pkgId / coreId / threadId, but we really
2198 // don't know the radix of any of the fields. pkgId's may be sparsely
2199 // assigned among the chips on a system. Although coreId's are usually
2200 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2201 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2202 //
2203 // For that matter, we don't know what coresPerPkg and threadsPerCore
2204 // (or the total # packages) are at this point - we want to determine
2205 // that now. We only have an upper bound on the first two figures.
2206 //
2207 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2208 * sizeof(unsigned));
2209 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2210 * sizeof(unsigned));
2211 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2212 * sizeof(unsigned));
2213 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2214 * sizeof(unsigned));
2215
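    //
    // During the radix check below, counts[level] is the number of ids seen
    // at that level under the current parent, maxCt[level] is the maximum of
    // counts[level] over all parents, totals[level] is the total number of
    // nodes seen at that level, and lastId[level] is the id from the
    // previous record.
    //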
2216 bool assign_thread_ids = false;
2217 unsigned threadIdCt;
2218 unsigned index;
2219
2220 restart_radix_check:
2221 threadIdCt = 0;
2222
2223 //
2224 // Initialize the counter arrays with data from threadInfo[0].
2225 //
2226 if (assign_thread_ids) {
2227 if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2228 threadInfo[0][threadIdIndex] = threadIdCt++;
2229 }
2230 else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2231 threadIdCt = threadInfo[0][threadIdIndex] + 1;
2232 }
2233 }
2234 for (index = 0; index <= maxIndex; index++) {
2235 counts[index] = 1;
2236 maxCt[index] = 1;
2237 totals[index] = 1;
        lastId[index] = threadInfo[0][index];
2239 }
2240
2241 //
2242 // Run through the rest of the OS procs.
2243 //
2244 for (i = 1; i < num_avail; i++) {
2245 //
2246 // Find the most significant index whose id differs
2247 // from the id for the previous OS proc.
2248 //
2249 for (index = maxIndex; index >= threadIdIndex; index--) {
2250 if (assign_thread_ids && (index == threadIdIndex)) {
2251 //
2252 // Auto-assign the thread id field if it wasn't specified.
2253 //
2254 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2255 threadInfo[i][threadIdIndex] = threadIdCt++;
2256 }
2257
2258 //
                // Apparently the thread id field was specified for some
2260 // entries and not others. Start the thread id counter
2261 // off at the next higher thread id.
2262 //
2263 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2264 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2265 }
2266 }
2267 if (threadInfo[i][index] != lastId[index]) {
2268 //
2269 // Run through all indices which are less significant,
2270 // and reset the counts to 1.
2271 //
2272 // At all levels up to and including index, we need to
2273 // increment the totals and record the last id.
2274 //
2275 unsigned index2;
2276 for (index2 = threadIdIndex; index2 < index; index2++) {
2277 totals[index2]++;
2278 if (counts[index2] > maxCt[index2]) {
2279 maxCt[index2] = counts[index2];
2280 }
2281 counts[index2] = 1;
2282 lastId[index2] = threadInfo[i][index2];
2283 }
2284 counts[index]++;
2285 totals[index]++;
2286 lastId[index] = threadInfo[i][index];
2287
2288 if (assign_thread_ids && (index > threadIdIndex)) {
2289
2290# if KMP_MIC && REDUCE_TEAM_SIZE
2291 //
2292 // The default team size is the total #threads in the machine
2293 // minus 1 thread for every core that has 3 or more threads.
2294 //
2295 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2296# endif // KMP_MIC && REDUCE_TEAM_SIZE
2297
2298 //
2299 // Restart the thread counter, as we are on a new core.
2300 //
2301 threadIdCt = 0;
2302
2303 //
2304 // Auto-assign the thread id field if it wasn't specified.
2305 //
2306 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2307 threadInfo[i][threadIdIndex] = threadIdCt++;
2308 }
2309
2310 //
                // Apparently the thread id field was specified for some
2312 // entries and not others. Start the thread id counter
2313 // off at the next higher thread id.
2314 //
2315 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2316 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2317 }
2318 }
2319 break;
2320 }
2321 }
2322 if (index < threadIdIndex) {
2323 //
2324 // If thread ids were specified, it is an error if they are not
            // unique. Also, check that we haven't already restarted the
2326 // loop (to be safe - shouldn't need to).
2327 //
2328 if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2329 || assign_thread_ids) {
2330 __kmp_free(lastId);
2331 __kmp_free(totals);
2332 __kmp_free(maxCt);
2333 __kmp_free(counts);
2334 CLEANUP_THREAD_INFO;
2335 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2336 return -1;
2337 }
2338
2339 //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and
2342 // assign the thread ids manually.
2343 //
2344 assign_thread_ids = true;
2345 goto restart_radix_check;
2346 }
2347 }
2348
2349# if KMP_MIC && REDUCE_TEAM_SIZE
2350 //
2351 // The default team size is the total #threads in the machine
2352 // minus 1 thread for every core that has 3 or more threads.
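    // For example, a hypothetical part with 60 cores and 4 thread contexts
    // per core would get a default team size of 240 - 60 = 180.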
2353 //
2354 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2355# endif // KMP_MIC && REDUCE_TEAM_SIZE
2356
2357 for (index = threadIdIndex; index <= maxIndex; index++) {
2358 if (counts[index] > maxCt[index]) {
2359 maxCt[index] = counts[index];
2360 }
2361 }
2362
2363 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2364 nCoresPerPkg = maxCt[coreIdIndex];
2365 nPackages = totals[pkgIdIndex];
2366
2367 //
2368 // Check to see if the machine topology is uniform
2369 //
2370 unsigned prod = totals[maxIndex];
2371 for (index = threadIdIndex; index < maxIndex; index++) {
2372 prod *= maxCt[index];
2373 }
2374 bool uniform = (prod == totals[threadIdIndex]);
2375
2376 //
2377 // When affinity is off, this routine will still be called to set
2378 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
2379 // nCoresPerPkg, & nPackages. Make sure all these vars are set
2380 // correctly, and return now if affinity is not enabled.
2381 //
2382 __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1
2383 __kmp_ncores = totals[coreIdIndex];
2384
2385 if (__kmp_affinity_verbose) {
2386 if (! KMP_AFFINITY_CAPABLE()) {
2387 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2388 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2389 if (uniform) {
2390 KMP_INFORM(Uniform, "KMP_AFFINITY");
2391 } else {
2392 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2393 }
2394 }
2395 else {
2396 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2397 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2398 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2399 if (__kmp_affinity_respect_mask) {
2400 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2401 } else {
2402 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2403 }
2404 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2405 if (uniform) {
2406 KMP_INFORM(Uniform, "KMP_AFFINITY");
2407 } else {
2408 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2409 }
2410 }
2411 kmp_str_buf_t buf;
2412 __kmp_str_buf_init(&buf);
2413
2414 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2415 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2416 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2417 }
2418 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2419 maxCt[threadIdIndex], __kmp_ncores);
2420
2421 __kmp_str_buf_free(&buf);
2422 }
2423
2424# if KMP_MIC && REDUCE_TEAM_SIZE
2425 //
2426 // Set the default team size.
2427 //
2428 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2429 __kmp_dflt_team_nth = teamSize;
2430 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2431 __kmp_dflt_team_nth));
2432 }
2433# endif // KMP_MIC && REDUCE_TEAM_SIZE
2434
2435 if (__kmp_affinity_type == affinity_none) {
2436 __kmp_free(lastId);
2437 __kmp_free(totals);
2438 __kmp_free(maxCt);
2439 __kmp_free(counts);
2440 CLEANUP_THREAD_INFO;
2441 return 0;
2442 }
2443
2444 //
2445 // Count the number of levels which have more nodes at that level than
2446 // at the parent's level (with there being an implicit root node of
2447 // the top level). This is equivalent to saying that there is at least
2448 // one node at this level which has a sibling. These levels are in the
2449 // map, and the package level is always in the map.
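    //
    // For example, if every core exposes exactly one thread context, then
    // totals[threadIdIndex] == totals[coreIdIndex], and the thread level is
    // left out of the map.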
2450 //
2451 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2452 int level = 0;
2453 for (index = threadIdIndex; index < maxIndex; index++) {
2454 KMP_ASSERT(totals[index] >= totals[index + 1]);
2455 inMap[index] = (totals[index] > totals[index + 1]);
2456 }
2457 inMap[maxIndex] = (totals[maxIndex] > 1);
2458 inMap[pkgIdIndex] = true;
2459
2460 int depth = 0;
2461 for (index = threadIdIndex; index <= maxIndex; index++) {
2462 if (inMap[index]) {
2463 depth++;
2464 }
2465 }
2466 KMP_ASSERT(depth > 0);
2467
2468 //
2469 // Construct the data structure that is to be returned.
2470 //
2471 *address2os = (AddrUnsPair*)
2472 __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2473 int pkgLevel = -1;
2474 int coreLevel = -1;
2475 int threadLevel = -1;
2476
2477 for (i = 0; i < num_avail; ++i) {
2478 Address addr(depth);
2479 unsigned os = threadInfo[i][osIdIndex];
2480 int src_index;
2481 int dst_index = 0;
2482
2483 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2484 if (! inMap[src_index]) {
2485 continue;
2486 }
2487 addr.labels[dst_index] = threadInfo[i][src_index];
2488 if (src_index == pkgIdIndex) {
2489 pkgLevel = dst_index;
2490 }
2491 else if (src_index == coreIdIndex) {
2492 coreLevel = dst_index;
2493 }
2494 else if (src_index == threadIdIndex) {
2495 threadLevel = dst_index;
2496 }
2497 dst_index++;
2498 }
2499 (*address2os)[i] = AddrUnsPair(addr, os);
2500 }
2501
2502 if (__kmp_affinity_gran_levels < 0) {
2503 //
2504 // Set the granularity level based on what levels are modeled
2505 // in the machine topology map.
2506 //
2507 unsigned src_index;
2508 __kmp_affinity_gran_levels = 0;
2509 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2510 if (! inMap[src_index]) {
2511 continue;
2512 }
2513 switch (src_index) {
2514 case threadIdIndex:
2515 if (__kmp_affinity_gran > affinity_gran_thread) {
2516 __kmp_affinity_gran_levels++;
2517 }
2518
2519 break;
2520 case coreIdIndex:
2521 if (__kmp_affinity_gran > affinity_gran_core) {
2522 __kmp_affinity_gran_levels++;
2523 }
2524 break;
2525
2526 case pkgIdIndex:
2527 if (__kmp_affinity_gran > affinity_gran_package) {
2528 __kmp_affinity_gran_levels++;
2529 }
2530 break;
2531 }
2532 }
2533 }
2534
2535 if (__kmp_affinity_verbose) {
2536 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2537 coreLevel, threadLevel);
2538 }
2539
2540 __kmp_free(inMap);
2541 __kmp_free(lastId);
2542 __kmp_free(totals);
2543 __kmp_free(maxCt);
2544 __kmp_free(counts);
2545 CLEANUP_THREAD_INFO;
2546 return depth;
2547}
2548
2549
2550//
2551// Create and return a table of affinity masks, indexed by OS thread ID.
2552// This routine handles OR'ing together all the affinity masks of threads
2553// that are sufficiently close, if granularity > fine.
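// On return, *maxIndex holds the largest OS proc id encountered, and
// *numUnique holds the number of distinct masks formed.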
2554//
2555static kmp_affin_mask_t *
2556__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2557 AddrUnsPair *address2os, unsigned numAddrs)
2558{
2559 //
2560 // First form a table of affinity masks in order of OS thread id.
2561 //
2562 unsigned depth;
2563 unsigned maxOsId;
2564 unsigned i;
2565
2566 KMP_ASSERT(numAddrs > 0);
2567 depth = address2os[0].first.depth;
2568
2569 maxOsId = 0;
2570 for (i = 0; i < numAddrs; i++) {
2571 unsigned osId = address2os[i].second;
2572 if (osId > maxOsId) {
2573 maxOsId = osId;
2574 }
2575 }
2576 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2577 (maxOsId + 1) * __kmp_affin_mask_size);
2578
2579 //
2580 // Sort the address2os table according to physical order. Doing so
2581 // will put all threads on the same core/package/node in consecutive
2582 // locations.
2583 //
2584 qsort(address2os, numAddrs, sizeof(*address2os),
2585 __kmp_affinity_cmp_Address_labels);
2586
2587 KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2588 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2589 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2590 }
2591 if (__kmp_affinity_gran_levels >= (int)depth) {
2592 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2593 && (__kmp_affinity_type != affinity_none))) {
2594 KMP_WARNING(AffThreadsMayMigrate);
2595 }
2596 }
2597
2598 //
2599 // Run through the table, forming the masks for all threads on each
2600 // core. Threads on the same core will have identical "Address"
2601 // objects, not considering the last level, which must be the thread
2602 // id. All threads on a core will appear consecutively.
2603 //
2604 unsigned unique = 0;
2605 unsigned j = 0; // index of 1st thread on core
2606 unsigned leader = 0;
2607 Address *leaderAddr = &(address2os[0].first);
2608 kmp_affin_mask_t *sum
2609 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2610 KMP_CPU_ZERO(sum);
2611 KMP_CPU_SET(address2os[0].second, sum);
2612 for (i = 1; i < numAddrs; i++) {
2613 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00002614 // If this thread is sufficiently close to the leader (within the
Jim Cownie5e8470a2013-09-27 10:38:44 +00002615 // granularity setting), then set the bit for this os thread in the
2616 // affinity mask for this group, and go on to the next thread.
2617 //
2618 if (leaderAddr->isClose(address2os[i].first,
2619 __kmp_affinity_gran_levels)) {
2620 KMP_CPU_SET(address2os[i].second, sum);
2621 continue;
2622 }
2623
2624 //
2625 // For every thread in this group, copy the mask to the thread's
2626 // entry in the osId2Mask table. Mark the first address as a
2627 // leader.
2628 //
2629 for (; j < i; j++) {
2630 unsigned osId = address2os[j].second;
2631 KMP_DEBUG_ASSERT(osId <= maxOsId);
2632 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2633 KMP_CPU_COPY(mask, sum);
2634 address2os[j].first.leader = (j == leader);
2635 }
2636 unique++;
2637
2638 //
2639 // Start a new mask.
2640 //
2641 leader = i;
2642 leaderAddr = &(address2os[i].first);
2643 KMP_CPU_ZERO(sum);
2644 KMP_CPU_SET(address2os[i].second, sum);
2645 }
2646
2647 //
2648 // For every thread in last group, copy the mask to the thread's
2649 // entry in the osId2Mask table.
2650 //
2651 for (; j < i; j++) {
2652 unsigned osId = address2os[j].second;
2653 KMP_DEBUG_ASSERT(osId <= maxOsId);
2654 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2655 KMP_CPU_COPY(mask, sum);
2656 address2os[j].first.leader = (j == leader);
2657 }
2658 unique++;
2659
2660 *maxIndex = maxOsId;
2661 *numUnique = unique;
2662 return osId2Mask;
2663}
2664
2665
2666//
2667// Stuff for the affinity proclist parsers. It's easier to declare these vars
2668// as file-static than to try and pass them through the calling sequence of
2669// the recursive-descent OMP_PLACES parser.
2670//
2671static kmp_affin_mask_t *newMasks;
2672static int numNewMasks;
2673static int nextNewMask;
2674
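//
// ADD_MASK appends a copy of a mask to the growing newMasks vector, doubling
// the vector via realloc whenever it fills. ADD_MASK_OSID first validates the
// OS proc id against the osId2Mask table, and (if warnings are enabled) warns
// about ids that are not part of the machine model.
//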
2675#define ADD_MASK(_mask) \
2676 { \
2677 if (nextNewMask >= numNewMasks) { \
2678 numNewMasks *= 2; \
2679 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2680 numNewMasks * __kmp_affin_mask_size); \
2681 } \
2682 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2683 nextNewMask++; \
2684 }
2685
2686#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2687 { \
2688 if (((_osId) > _maxOsId) || \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002689 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
Jim Cownie5e8470a2013-09-27 10:38:44 +00002690 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2691 && (__kmp_affinity_type != affinity_none))) { \
2692 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2693 } \
2694 } \
2695 else { \
2696 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2697 } \
2698 }
2699
2700
2701//
2702// Re-parse the proclist (for the explicit affinity type), and form the list
2703// of affinity newMasks indexed by gtid.
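//
// The proclist accepts single OS proc ids, ranges with an optional stride,
// and brace-enclosed sets that are OR'ed into a single mask, all separated
// by commas. An illustrative value: "3,0-2,8-15:2,{30,31}".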
2704//
2705static void
2706__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2707 unsigned int *out_numMasks, const char *proclist,
2708 kmp_affin_mask_t *osId2Mask, int maxOsId)
2709{
2710 const char *scan = proclist;
2711 const char *next = proclist;
2712
2713 //
2714 // We use malloc() for the temporary mask vector,
2715 // so that we can use realloc() to extend it.
2716 //
2717 numNewMasks = 2;
2718 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2719 * __kmp_affin_mask_size);
2720 nextNewMask = 0;
2721 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2722 __kmp_affin_mask_size);
2723 int setSize = 0;
2724
2725 for (;;) {
2726 int start, end, stride;
2727
2728 SKIP_WS(scan);
2729 next = scan;
2730 if (*next == '\0') {
2731 break;
2732 }
2733
2734 if (*next == '{') {
2735 int num;
2736 setSize = 0;
2737 next++; // skip '{'
2738 SKIP_WS(next);
2739 scan = next;
2740
2741 //
2742 // Read the first integer in the set.
2743 //
2744 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2745 "bad proclist");
2746 SKIP_DIGITS(next);
2747 num = __kmp_str_to_int(scan, *next);
2748 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2749
2750 //
2751 // Copy the mask for that osId to the sum (union) mask.
2752 //
2753 if ((num > maxOsId) ||
2754 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2755 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2756 && (__kmp_affinity_type != affinity_none))) {
2757 KMP_WARNING(AffIgnoreInvalidProcID, num);
2758 }
2759 KMP_CPU_ZERO(sumMask);
2760 }
2761 else {
2762 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2763 setSize = 1;
2764 }
2765
2766 for (;;) {
2767 //
2768 // Check for end of set.
2769 //
2770 SKIP_WS(next);
2771 if (*next == '}') {
2772 next++; // skip '}'
2773 break;
2774 }
2775
2776 //
2777 // Skip optional comma.
2778 //
2779 if (*next == ',') {
2780 next++;
2781 }
2782 SKIP_WS(next);
2783
2784 //
2785 // Read the next integer in the set.
2786 //
2787 scan = next;
2788 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2789 "bad explicit proc list");
2790
2791 SKIP_DIGITS(next);
2792 num = __kmp_str_to_int(scan, *next);
2793 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2794
2795 //
2796 // Add the mask for that osId to the sum mask.
2797 //
2798 if ((num > maxOsId) ||
2799 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2800 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2801 && (__kmp_affinity_type != affinity_none))) {
2802 KMP_WARNING(AffIgnoreInvalidProcID, num);
2803 }
2804 }
2805 else {
2806 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2807 setSize++;
2808 }
2809 }
2810 if (setSize > 0) {
2811 ADD_MASK(sumMask);
2812 }
2813
2814 SKIP_WS(next);
2815 if (*next == ',') {
2816 next++;
2817 }
2818 scan = next;
2819 continue;
2820 }
2821
2822 //
2823 // Read the first integer.
2824 //
2825 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2826 SKIP_DIGITS(next);
2827 start = __kmp_str_to_int(scan, *next);
2828 KMP_ASSERT2(start >= 0, "bad explicit proc list");
2829 SKIP_WS(next);
2830
2831 //
2832 // If this isn't a range, then add a mask to the list and go on.
2833 //
2834 if (*next != '-') {
2835 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2836
2837 //
2838 // Skip optional comma.
2839 //
2840 if (*next == ',') {
2841 next++;
2842 }
2843 scan = next;
2844 continue;
2845 }
2846
2847 //
2848 // This is a range. Skip over the '-' and read in the 2nd int.
2849 //
2850 next++; // skip '-'
2851 SKIP_WS(next);
2852 scan = next;
2853 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2854 SKIP_DIGITS(next);
2855 end = __kmp_str_to_int(scan, *next);
2856 KMP_ASSERT2(end >= 0, "bad explicit proc list");
2857
2858 //
2859 // Check for a stride parameter
2860 //
2861 stride = 1;
2862 SKIP_WS(next);
2863 if (*next == ':') {
2864 //
            // A stride is specified. Skip over the ':' and read the 3rd int.
2866 //
2867 int sign = +1;
2868 next++; // skip ':'
2869 SKIP_WS(next);
2870 scan = next;
2871 if (*next == '-') {
2872 sign = -1;
2873 next++;
2874 SKIP_WS(next);
2875 scan = next;
2876 }
2877 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2878 "bad explicit proc list");
2879 SKIP_DIGITS(next);
2880 stride = __kmp_str_to_int(scan, *next);
2881 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2882 stride *= sign;
2883 }
2884
2885 //
2886 // Do some range checks.
2887 //
2888 KMP_ASSERT2(stride != 0, "bad explicit proc list");
2889 if (stride > 0) {
2890 KMP_ASSERT2(start <= end, "bad explicit proc list");
2891 }
2892 else {
2893 KMP_ASSERT2(start >= end, "bad explicit proc list");
2894 }
2895 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2896
2897 //
2898 // Add the mask for each OS proc # to the list.
2899 //
2900 if (stride > 0) {
2901 do {
2902 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2903 start += stride;
2904 } while (start <= end);
2905 }
2906 else {
2907 do {
2908 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2909 start += stride;
2910 } while (start >= end);
2911 }
2912
2913 //
2914 // Skip optional comma.
2915 //
2916 SKIP_WS(next);
2917 if (*next == ',') {
2918 next++;
2919 }
2920 scan = next;
2921 }
2922
2923 *out_numMasks = nextNewMask;
2924 if (nextNewMask == 0) {
2925 *out_masks = NULL;
2926 KMP_INTERNAL_FREE(newMasks);
2927 return;
2928 }
2929 *out_masks
2930 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2931 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2932 __kmp_free(sumMask);
2933 KMP_INTERNAL_FREE(newMasks);
2934}
2935
2936
2937# if OMP_40_ENABLED
2938
2939/*-----------------------------------------------------------------------------
2940
2941Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places. Again, here is the grammar:
2943
2944place_list := place
2945place_list := place , place_list
2946place := num
2947place := place : num
2948place := place : num : signed
2949place := { subplacelist }
2950place := ! place // (lowest priority)
2951subplace_list := subplace
2952subplace_list := subplace , subplace_list
2953subplace := num
2954subplace := num : num
2955subplace := num : num : signed
2956signed := num
2957signed := + signed
2958signed := - signed
2959
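For example (illustrative only), "{0,1,2,3},{4,5,6,7}" describes two places of
four OS procs each, and "{0,1}:4:2" replicates the place {0,1} four times with
a stride of 2, yielding {0,1},{2,3},{4,5},{6,7}.
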
2960-----------------------------------------------------------------------------*/
2961
2962static void
2963__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2964 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2965{
2966 const char *next;
2967
2968 for (;;) {
2969 int start, count, stride, i;
2970
2971 //
2972 // Read in the starting proc id
2973 //
2974 SKIP_WS(*scan);
2975 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2976 "bad explicit places list");
2977 next = *scan;
2978 SKIP_DIGITS(next);
2979 start = __kmp_str_to_int(*scan, *next);
2980 KMP_ASSERT(start >= 0);
2981 *scan = next;
2982
2983 //
2984 // valid follow sets are ',' ':' and '}'
2985 //
2986 SKIP_WS(*scan);
2987 if (**scan == '}' || **scan == ',') {
2988 if ((start > maxOsId) ||
2989 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2990 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2991 && (__kmp_affinity_type != affinity_none))) {
2992 KMP_WARNING(AffIgnoreInvalidProcID, start);
2993 }
2994 }
2995 else {
2996 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2997 (*setSize)++;
2998 }
2999 if (**scan == '}') {
3000 break;
3001 }
3002 (*scan)++; // skip ','
3003 continue;
3004 }
3005 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3006 (*scan)++; // skip ':'
3007
3008 //
3009 // Read count parameter
3010 //
3011 SKIP_WS(*scan);
3012 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3013 "bad explicit places list");
3014 next = *scan;
3015 SKIP_DIGITS(next);
3016 count = __kmp_str_to_int(*scan, *next);
3017 KMP_ASSERT(count >= 0);
3018 *scan = next;
3019
3020 //
3021 // valid follow sets are ',' ':' and '}'
3022 //
3023 SKIP_WS(*scan);
3024 if (**scan == '}' || **scan == ',') {
3025 for (i = 0; i < count; i++) {
3026 if ((start > maxOsId) ||
3027 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3028 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3029 && (__kmp_affinity_type != affinity_none))) {
3030 KMP_WARNING(AffIgnoreInvalidProcID, start);
3031 }
3032 break; // don't proliferate warnings for large count
3033 }
3034 else {
3035 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3036 start++;
3037 (*setSize)++;
3038 }
3039 }
3040 if (**scan == '}') {
3041 break;
3042 }
3043 (*scan)++; // skip ','
3044 continue;
3045 }
3046 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3047 (*scan)++; // skip ':'
3048
3049 //
3050 // Read stride parameter
3051 //
3052 int sign = +1;
3053 for (;;) {
3054 SKIP_WS(*scan);
3055 if (**scan == '+') {
3056 (*scan)++; // skip '+'
3057 continue;
3058 }
3059 if (**scan == '-') {
3060 sign *= -1;
3061 (*scan)++; // skip '-'
3062 continue;
3063 }
3064 break;
3065 }
3066 SKIP_WS(*scan);
3067 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3068 "bad explicit places list");
3069 next = *scan;
3070 SKIP_DIGITS(next);
3071 stride = __kmp_str_to_int(*scan, *next);
3072 KMP_ASSERT(stride >= 0);
3073 *scan = next;
3074 stride *= sign;
3075
3076 //
3077 // valid follow sets are ',' and '}'
3078 //
3079 SKIP_WS(*scan);
3080 if (**scan == '}' || **scan == ',') {
3081 for (i = 0; i < count; i++) {
3082 if ((start > maxOsId) ||
3083 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3084 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3085 && (__kmp_affinity_type != affinity_none))) {
3086 KMP_WARNING(AffIgnoreInvalidProcID, start);
3087 }
3088 break; // don't proliferate warnings for large count
3089 }
3090 else {
3091 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3092 start += stride;
3093 (*setSize)++;
3094 }
3095 }
3096 if (**scan == '}') {
3097 break;
3098 }
3099 (*scan)++; // skip ','
3100 continue;
3101 }
3102
3103 KMP_ASSERT2(0, "bad explicit places list");
3104 }
3105}
3106
3107
3108static void
3109__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3110 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3111{
3112 const char *next;
3113
3114 //
3115 // valid follow sets are '{' '!' and num
3116 //
3117 SKIP_WS(*scan);
3118 if (**scan == '{') {
3119 (*scan)++; // skip '{'
3120 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3121 setSize);
3122 KMP_ASSERT2(**scan == '}', "bad explicit places list");
3123 (*scan)++; // skip '}'
3124 }
3125 else if (**scan == '!') {
3126 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3127 KMP_CPU_COMPLEMENT(tempMask);
3128 (*scan)++; // skip '!'
3129 }
3130 else if ((**scan >= '0') && (**scan <= '9')) {
3131 next = *scan;
3132 SKIP_DIGITS(next);
3133 int num = __kmp_str_to_int(*scan, *next);
3134 KMP_ASSERT(num >= 0);
3135 if ((num > maxOsId) ||
3136 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3137 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3138 && (__kmp_affinity_type != affinity_none))) {
3139 KMP_WARNING(AffIgnoreInvalidProcID, num);
3140 }
3141 }
3142 else {
3143 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3144 (*setSize)++;
3145 }
3146 *scan = next; // skip num
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003147 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003148 else {
3149 KMP_ASSERT2(0, "bad explicit places list");
3150 }
3151}
3152
3153
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003154//static void
3155void
Jim Cownie5e8470a2013-09-27 10:38:44 +00003156__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3157 unsigned int *out_numMasks, const char *placelist,
3158 kmp_affin_mask_t *osId2Mask, int maxOsId)
3159{
3160 const char *scan = placelist;
3161 const char *next = placelist;
3162
3163 numNewMasks = 2;
3164 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3165 * __kmp_affin_mask_size);
3166 nextNewMask = 0;
3167
3168 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3169 __kmp_affin_mask_size);
3170 KMP_CPU_ZERO(tempMask);
3171 int setSize = 0;
3172
3173 for (;;) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003174 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3175
3176 //
3177 // valid follow sets are ',' ':' and EOL
3178 //
3179 SKIP_WS(scan);
3180 if (*scan == '\0' || *scan == ',') {
3181 if (setSize > 0) {
3182 ADD_MASK(tempMask);
3183 }
3184 KMP_CPU_ZERO(tempMask);
3185 setSize = 0;
3186 if (*scan == '\0') {
3187 break;
3188 }
3189 scan++; // skip ','
3190 continue;
3191 }
3192
3193 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3194 scan++; // skip ':'
3195
3196 //
3197 // Read count parameter
3198 //
3199 SKIP_WS(scan);
3200 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3201 "bad explicit places list");
3202 next = scan;
3203 SKIP_DIGITS(next);
Jim Cownie181b4bb2013-12-23 17:28:57 +00003204 int count = __kmp_str_to_int(scan, *next);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003205 KMP_ASSERT(count >= 0);
3206 scan = next;
3207
3208 //
3209 // valid follow sets are ',' ':' and EOL
3210 //
3211 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003212 int stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003213 if (*scan == '\0' || *scan == ',') {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003214 stride = +1;
3215 }
3216 else {
3217 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3218 scan++; // skip ':'
Jim Cownie5e8470a2013-09-27 10:38:44 +00003219
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003220 //
3221 // Read stride parameter
3222 //
3223 int sign = +1;
3224 for (;;) {
3225 SKIP_WS(scan);
3226 if (*scan == '+') {
3227 scan++; // skip '+'
3228 continue;
3229 }
3230 if (*scan == '-') {
3231 sign *= -1;
3232 scan++; // skip '-'
3233 continue;
3234 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003235 break;
3236 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003237 SKIP_WS(scan);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003238 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3239 "bad explicit places list");
3240 next = scan;
3241 SKIP_DIGITS(next);
3242 stride = __kmp_str_to_int(scan, *next);
3243 KMP_DEBUG_ASSERT(stride >= 0);
3244 scan = next;
3245 stride *= sign;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003246 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003247
3248 if (stride > 0) {
3249 int i;
3250 for (i = 0; i < count; i++) {
3251 int j;
3252 if (setSize == 0) {
3253 break;
3254 }
3255 ADD_MASK(tempMask);
3256 setSize = 0;
3257 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003258 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3259 KMP_CPU_CLR(j, tempMask);
3260 }
3261 else if ((j > maxOsId) ||
3262 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3263 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3264 && (__kmp_affinity_type != affinity_none))) {
3265 KMP_WARNING(AffIgnoreInvalidProcID, j);
3266 }
3267 KMP_CPU_CLR(j, tempMask);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003268 }
3269 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003270 KMP_CPU_SET(j, tempMask);
3271 setSize++;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003272 }
3273 }
3274 for (; j >= 0; j--) {
3275 KMP_CPU_CLR(j, tempMask);
3276 }
3277 }
3278 }
3279 else {
3280 int i;
3281 for (i = 0; i < count; i++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003282 int j;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003283 if (setSize == 0) {
3284 break;
3285 }
3286 ADD_MASK(tempMask);
3287 setSize = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003288 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
Jim Cownie5e8470a2013-09-27 10:38:44 +00003289 j++) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003290 if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3291 KMP_CPU_CLR(j, tempMask);
3292 }
3293 else if ((j > maxOsId) ||
3294 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3295 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3296 && (__kmp_affinity_type != affinity_none))) {
3297 KMP_WARNING(AffIgnoreInvalidProcID, j);
3298 }
3299 KMP_CPU_CLR(j, tempMask);
3300 }
3301 else {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003302 KMP_CPU_SET(j, tempMask);
3303 setSize++;
3304 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003305 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003306 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003307 KMP_CPU_CLR(j, tempMask);
3308 }
3309 }
3310 }
3311 KMP_CPU_ZERO(tempMask);
3312 setSize = 0;
3313
3314 //
3315 // valid follow sets are ',' and EOL
3316 //
3317 SKIP_WS(scan);
3318 if (*scan == '\0') {
3319 break;
3320 }
3321 if (*scan == ',') {
3322 scan++; // skip ','
3323 continue;
3324 }
3325
3326 KMP_ASSERT2(0, "bad explicit places list");
3327 }
3328
3329 *out_numMasks = nextNewMask;
3330 if (nextNewMask == 0) {
3331 *out_masks = NULL;
3332 KMP_INTERNAL_FREE(newMasks);
3333 return;
3334 }
3335 *out_masks
3336 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3337 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3338 __kmp_free(tempMask);
3339 KMP_INTERNAL_FREE(newMasks);
3340}
3341
3342# endif /* OMP_40_ENABLED */
3343
3344#undef ADD_MASK
3345#undef ADD_MASK_OSID
3346
3347
3348# if KMP_MIC
3349
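//
// Trim the topology map according to the __kmp_place_num_cores,
// __kmp_place_num_threads_per_core, and __kmp_place_core_offset settings:
// only the requested cores on each package, and the requested number of
// thread contexts on each of those cores, are kept. The global core,
// thread, and proc counts are updated to match.
//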
3350static void
3351__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3352{
3353 if ( __kmp_place_num_cores == 0 ) {
3354 if ( __kmp_place_num_threads_per_core == 0 ) {
3355 return; // no cores limiting actions requested, exit
3356 }
3357 __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3358 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003359 if ( !__kmp_affinity_uniform_topology() ) {
3360 KMP_WARNING( AffThrPlaceNonUniform );
3361 return; // don't support non-uniform topology
3362 }
3363 if ( depth != 3 ) {
3364 KMP_WARNING( AffThrPlaceNonThreeLevel );
3365 return; // don't support not-3-level topology
Jim Cownie5e8470a2013-09-27 10:38:44 +00003366 }
3367 if ( __kmp_place_num_threads_per_core == 0 ) {
3368 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3369 }
3370 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3371 KMP_WARNING( AffThrPlaceManyCores );
3372 return;
3373 }
3374
3375 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3376 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3377 int i, j, k, n_old = 0, n_new = 0;
3378 for ( i = 0; i < nPackages; ++i ) {
3379 for ( j = 0; j < nCoresPerPkg; ++j ) {
3380 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3381 n_old += __kmp_nThreadsPerCore; // skip not-requested core
3382 } else {
3383 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3384 if ( k < __kmp_place_num_threads_per_core ) {
3385 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location
3386 n_new++;
3387 }
3388 n_old++;
3389 }
3390 }
3391 }
3392 }
3393 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3394 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3395 __kmp_avail_proc = n_new; // correct avail_proc
3396 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3397
3398 __kmp_free( *pAddr );
3399 *pAddr = newAddr; // replace old topology with new one
3400}
3401
3402# endif /* KMP_MIC */
3403
3404
3405static AddrUnsPair *address2os = NULL;
3406static int * procarr = NULL;
3407static int __kmp_aff_depth = 0;
3408
3409static void
3410__kmp_aux_affinity_initialize(void)
3411{
3412 if (__kmp_affinity_masks != NULL) {
3413 KMP_ASSERT(fullMask != NULL);
3414 return;
3415 }
3416
3417 //
3418 // Create the "full" mask - this defines all of the processors that we
3419 // consider to be in the machine model. If respect is set, then it is
3420 // the initialization thread's affinity mask. Otherwise, it is all
3421 // processors that we know about on the machine.
3422 //
3423 if (fullMask == NULL) {
3424 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3425 }
3426 if (KMP_AFFINITY_CAPABLE()) {
3427 if (__kmp_affinity_respect_mask) {
3428 __kmp_get_system_affinity(fullMask, TRUE);
3429
3430 //
3431 // Count the number of available processors.
3432 //
3433 unsigned i;
3434 __kmp_avail_proc = 0;
3435 for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3436 if (! KMP_CPU_ISSET(i, fullMask)) {
3437 continue;
3438 }
3439 __kmp_avail_proc++;
3440 }
3441 if (__kmp_avail_proc > __kmp_xproc) {
3442 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3443 && (__kmp_affinity_type != affinity_none))) {
3444 KMP_WARNING(ErrorInitializeAffinity);
3445 }
3446 __kmp_affinity_type = affinity_none;
3447 __kmp_affin_mask_size = 0;
3448 return;
3449 }
3450 }
3451 else {
3452 __kmp_affinity_entire_machine_mask(fullMask);
3453 __kmp_avail_proc = __kmp_xproc;
3454 }
3455 }
3456
3457 int depth = -1;
3458 kmp_i18n_id_t msg_id = kmp_i18n_null;
3459
3460 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00003461 // For backward compatibility, setting KMP_CPUINFO_FILE =>
Jim Cownie5e8470a2013-09-27 10:38:44 +00003462 // KMP_TOPOLOGY_METHOD=cpuinfo
3463 //
3464 if ((__kmp_cpuinfo_file != NULL) &&
3465 (__kmp_affinity_top_method == affinity_top_method_all)) {
3466 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3467 }
3468
3469 if (__kmp_affinity_top_method == affinity_top_method_all) {
3470 //
3471 // In the default code path, errors are not fatal - we just try using
3472 // another method. We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
3474 //
3475 const char *file_name = NULL;
3476 int line = 0;
3477
3478# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3479
3480 if (__kmp_affinity_verbose) {
3481 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3482 }
3483
3484 file_name = NULL;
3485 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3486 if (depth == 0) {
3487 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3488 KMP_ASSERT(address2os == NULL);
3489 return;
3490 }
3491
3492 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003493 if (__kmp_affinity_verbose) {
3494 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003495 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3496 KMP_I18N_STR(DecodingLegacyAPIC));
3497 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003498 else {
3499 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3500 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003501 }
3502
3503 file_name = NULL;
3504 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3505 if (depth == 0) {
3506 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3507 KMP_ASSERT(address2os == NULL);
3508 return;
3509 }
3510 }
3511
3512# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3513
3514# if KMP_OS_LINUX
3515
3516 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003517 if (__kmp_affinity_verbose) {
3518 if (msg_id != kmp_i18n_null) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003519 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3520 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003521 else {
3522 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3523 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003524 }
3525
3526 FILE *f = fopen("/proc/cpuinfo", "r");
3527 if (f == NULL) {
3528 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3529 }
3530 else {
3531 file_name = "/proc/cpuinfo";
3532 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3533 fclose(f);
3534 if (depth == 0) {
3535 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3536 KMP_ASSERT(address2os == NULL);
3537 return;
3538 }
3539 }
3540 }
3541
3542# endif /* KMP_OS_LINUX */
3543
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003544# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3545
3546 if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3547 if (__kmp_affinity_verbose) {
3548 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3549 }
3550
3551 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3552 KMP_ASSERT(depth != 0);
3553 }
3554
3555# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3556
Jim Cownie5e8470a2013-09-27 10:38:44 +00003557 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003558 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00003559 if (file_name == NULL) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003560 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003561 }
3562 else if (line == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003563 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003564 }
3565 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003566 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003567 }
3568 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003569 // FIXME - print msg if msg_id = kmp_i18n_null ???
Jim Cownie5e8470a2013-09-27 10:38:44 +00003570
3571 file_name = "";
3572 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3573 if (depth == 0) {
3574 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3575 KMP_ASSERT(address2os == NULL);
3576 return;
3577 }
3578 KMP_ASSERT(depth > 0);
3579 KMP_ASSERT(address2os != NULL);
3580 }
3581 }
3582
3583 //
    // If the user has specified that a particular topology discovery method
3585 // is to be used, then we abort if that method fails. The exception is
3586 // group affinity, which might have been implicitly set.
3587 //
3588
3589# if KMP_ARCH_X86 || KMP_ARCH_X86_64
3590
3591 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3592 if (__kmp_affinity_verbose) {
3593 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3594 KMP_I18N_STR(Decodingx2APIC));
3595 }
3596
3597 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3598 if (depth == 0) {
3599 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3600 KMP_ASSERT(address2os == NULL);
3601 return;
3602 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003603 if (depth < 0) {
3604 KMP_ASSERT(msg_id != kmp_i18n_null);
3605 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3606 }
3607 }
3608 else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3609 if (__kmp_affinity_verbose) {
3610 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3611 KMP_I18N_STR(DecodingLegacyAPIC));
3612 }
3613
3614 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3615 if (depth == 0) {
3616 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3617 KMP_ASSERT(address2os == NULL);
3618 return;
3619 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00003620 if (depth < 0) {
3621 KMP_ASSERT(msg_id != kmp_i18n_null);
3622 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3623 }
3624 }
3625
3626# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3627
3628 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3629 const char *filename;
3630 if (__kmp_cpuinfo_file != NULL) {
3631 filename = __kmp_cpuinfo_file;
3632 }
3633 else {
3634 filename = "/proc/cpuinfo";
3635 }
3636
3637 if (__kmp_affinity_verbose) {
3638 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3639 }
3640
3641 FILE *f = fopen(filename, "r");
3642 if (f == NULL) {
3643 int code = errno;
3644 if (__kmp_cpuinfo_file != NULL) {
3645 __kmp_msg(
3646 kmp_ms_fatal,
3647 KMP_MSG(CantOpenFileForReading, filename),
3648 KMP_ERR(code),
3649 KMP_HNT(NameComesFrom_CPUINFO_FILE),
3650 __kmp_msg_null
3651 );
3652 }
3653 else {
3654 __kmp_msg(
3655 kmp_ms_fatal,
3656 KMP_MSG(CantOpenFileForReading, filename),
3657 KMP_ERR(code),
3658 __kmp_msg_null
3659 );
3660 }
3661 }
3662 int line = 0;
3663 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3664 fclose(f);
3665 if (depth < 0) {
3666 KMP_ASSERT(msg_id != kmp_i18n_null);
3667 if (line > 0) {
3668 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3669 }
3670 else {
3671 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3672 }
3673 }
3674 if (__kmp_affinity_type == affinity_none) {
3675 KMP_ASSERT(depth == 0);
3676 KMP_ASSERT(address2os == NULL);
3677 return;
3678 }
3679 }
3680
3681# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3682
3683 else if (__kmp_affinity_top_method == affinity_top_method_group) {
3684 if (__kmp_affinity_verbose) {
3685 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3686 }
3687
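        // Build the topology map from the Windows* OS processor groups. This
        // method is never expected to disable affinity, hence the depth != 0 assert.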
3688 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3689 KMP_ASSERT(depth != 0);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003690 if (depth < 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003691 KMP_ASSERT(msg_id != kmp_i18n_null);
3692 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
Jim Cownie5e8470a2013-09-27 10:38:44 +00003693 }
3694 }
3695
3696# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3697
3698 else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3699 if (__kmp_affinity_verbose) {
3700 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3701 }
3702
3703 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3704 if (depth == 0) {
3705 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3706 KMP_ASSERT(address2os == NULL);
3707 return;
3708 }
3709 // should not fail
3710 KMP_ASSERT(depth > 0);
3711 KMP_ASSERT(address2os != NULL);
3712 }
3713
3714 if (address2os == NULL) {
3715 if (KMP_AFFINITY_CAPABLE()
3716 && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3717 && (__kmp_affinity_type != affinity_none)))) {
3718 KMP_WARNING(ErrorInitializeAffinity);
3719 }
3720 __kmp_affinity_type = affinity_none;
3721 __kmp_affin_mask_size = 0;
3722 return;
3723 }
3724
3725# if KMP_MIC
3726 __kmp_apply_thread_places(&address2os, depth);
3727# endif
3728
3729 //
3730 // Create the table of masks, indexed by thread Id.
3731 //
3732 unsigned maxIndex;
3733 unsigned numUnique;
3734 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3735 address2os, __kmp_avail_proc);
3736 if (__kmp_affinity_gran_levels == 0) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003737 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003738 }
3739
3740 //
3741 // Set the childNums vector in all Address objects. This must be done
3742 // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3743 // which takes into account the setting of __kmp_affinity_compact.
3744 //
3745 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3746
3747 switch (__kmp_affinity_type) {
3748
3749 case affinity_explicit:
3750 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3751# if OMP_40_ENABLED
3752 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3753# endif
3754 {
3755 __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3756 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3757 maxIndex);
3758 }
3759# if OMP_40_ENABLED
3760 else {
3761 __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3762 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3763 maxIndex);
3764 }
3765# endif
3766 if (__kmp_affinity_num_masks == 0) {
3767 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3768 && (__kmp_affinity_type != affinity_none))) {
3769 KMP_WARNING(AffNoValidProcID);
3770 }
3771 __kmp_affinity_type = affinity_none;
3772 return;
3773 }
3774 break;
3775
3776 //
3777 // The other affinity types rely on sorting the Addresses according
3778 // to some permutation of the machine topology tree. Set
3779 // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3780 // then jump to a common code fragment to do the sort and create
3781 // the array of affinity masks.
3782 //
3783
3784 case affinity_logical:
3785 __kmp_affinity_compact = 0;
3786 if (__kmp_affinity_offset) {
3787 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3788 % __kmp_avail_proc;
3789 }
3790 goto sortAddresses;
3791
3792 case affinity_physical:
3793 if (__kmp_nThreadsPerCore > 1) {
3794 __kmp_affinity_compact = 1;
3795 if (__kmp_affinity_compact >= depth) {
3796 __kmp_affinity_compact = 0;
3797 }
3798 } else {
3799 __kmp_affinity_compact = 0;
3800 }
3801 if (__kmp_affinity_offset) {
3802 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3803 % __kmp_avail_proc;
3804 }
3805 goto sortAddresses;
3806
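    // scatter is the mirror image of compact: inverting the compaction level
    // spreads consecutive threads across the topology rather than packing them.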
3807 case affinity_scatter:
3808 if (__kmp_affinity_compact >= depth) {
3809 __kmp_affinity_compact = 0;
3810 }
3811 else {
3812 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3813 }
3814 goto sortAddresses;
3815
3816 case affinity_compact:
3817 if (__kmp_affinity_compact >= depth) {
3818 __kmp_affinity_compact = depth - 1;
3819 }
3820 goto sortAddresses;
3821
3822# if KMP_MIC
3823 case affinity_balanced:
3824 // Balanced works only for the case of a single package and uniform topology
3825 if( nPackages > 1 ) {
3826 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3827 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3828 }
3829 __kmp_affinity_type = affinity_none;
3830 return;
3831 } else if( __kmp_affinity_uniform_topology() ) {
3832 break;
3833 } else { // Non-uniform topology
3834
3835 // Save the depth for further usage
3836 __kmp_aff_depth = depth;
3837
3838 // Number of hyper threads per core in HT machine
3839 int nth_per_core = __kmp_nThreadsPerCore;
3840
3841 int core_level;
3842 if( nth_per_core > 1 ) {
3843 core_level = depth - 2;
3844 } else {
3845 core_level = depth - 1;
3846 }
3847 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3848 int nproc = nth_per_core * ncores;
3849
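            // procarr[ core * nth_per_core + thread ] holds the OS proc id occupying
            // each (core, thread-context) slot; -1 marks slots with no processor.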
3850 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3851 for( int i = 0; i < nproc; i++ ) {
3852 procarr[ i ] = -1;
3853 }
3854
3855 for( int i = 0; i < __kmp_avail_proc; i++ ) {
3856 int proc = address2os[ i ].second;
3857 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3858 // If there is only one thread per core then depth == 2: level 0 - package,
3859 // level 1 - core.
3860 int level = depth - 1;
3861
3862                // Default case (one thread context per core): the core id is the last label and the thread id is 0.
3863 int thread = 0;
3864 int core = address2os[ i ].first.labels[ level ];
3865                // If the thread level exists, that is, we have more than one thread context per core
3866 if( nth_per_core > 1 ) {
3867 thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3868 core = address2os[ i ].first.labels[ level - 1 ];
3869 }
3870 procarr[ core * nth_per_core + thread ] = proc;
3871 }
3872
3873 break;
3874 }
3875# endif
3876
3877 sortAddresses:
3878 //
3879 // Allocate the gtid->affinity mask table.
3880 //
3881 if (__kmp_affinity_dups) {
3882 __kmp_affinity_num_masks = __kmp_avail_proc;
3883 }
3884 else {
3885 __kmp_affinity_num_masks = numUnique;
3886 }
3887
3888# if OMP_40_ENABLED
3889 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3890 && ( __kmp_affinity_num_places > 0 )
3891 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3892 __kmp_affinity_num_masks = __kmp_affinity_num_places;
3893 }
3894# endif
3895
3896 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3897 __kmp_affinity_num_masks * __kmp_affin_mask_size);
3898
3899 //
3900 // Sort the address2os table according to the current setting of
3901 // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3902 //
3903 qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3904 __kmp_affinity_cmp_Address_child_num);
3905 {
3906 int i;
3907 unsigned j;
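            // Walk the sorted addresses and copy one mask per place; unless duplicates
            // were requested, only the leader of each granularity group contributes.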
3908 for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3909 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3910 continue;
3911 }
3912 unsigned osId = address2os[i].second;
3913 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3914 kmp_affin_mask_t *dest
3915 = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3916 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3917 KMP_CPU_COPY(dest, src);
3918 if (++j >= __kmp_affinity_num_masks) {
3919 break;
3920 }
3921 }
3922 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3923 }
3924 break;
3925
3926 default:
3927 KMP_ASSERT2(0, "Unexpected affinity setting");
3928 }
3929
3930 __kmp_free(osId2Mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003931 machine_hierarchy.init(address2os, __kmp_avail_proc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00003932}
3933
3934
3935void
3936__kmp_affinity_initialize(void)
3937{
3938 //
3939    // Much of the code above was written assuming that if a machine was not
3940 // affinity capable, then __kmp_affinity_type == affinity_none. We now
3941 // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3942 //
3943 // There are too many checks for __kmp_affinity_type == affinity_none
3944 // in this code. Instead of trying to change them all, check if
3945 // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3946 // affinity_none, call the real initialization routine, then restore
3947 // __kmp_affinity_type to affinity_disabled.
3948 //
3949 int disabled = (__kmp_affinity_type == affinity_disabled);
3950 if (! KMP_AFFINITY_CAPABLE()) {
3951 KMP_ASSERT(disabled);
3952 }
3953 if (disabled) {
3954 __kmp_affinity_type = affinity_none;
3955 }
3956 __kmp_aux_affinity_initialize();
3957 if (disabled) {
3958 __kmp_affinity_type = affinity_disabled;
3959 }
3960}
3961
3962
3963void
3964__kmp_affinity_uninitialize(void)
3965{
3966 if (__kmp_affinity_masks != NULL) {
3967 __kmp_free(__kmp_affinity_masks);
3968 __kmp_affinity_masks = NULL;
3969 }
3970 if (fullMask != NULL) {
3971 KMP_CPU_FREE(fullMask);
3972 fullMask = NULL;
3973 }
3974 __kmp_affinity_num_masks = 0;
3975# if OMP_40_ENABLED
3976 __kmp_affinity_num_places = 0;
3977# endif
3978 if (__kmp_affinity_proclist != NULL) {
3979 __kmp_free(__kmp_affinity_proclist);
3980 __kmp_affinity_proclist = NULL;
3981 }
3982 if( address2os != NULL ) {
3983 __kmp_free( address2os );
3984 address2os = NULL;
3985 }
3986 if( procarr != NULL ) {
3987 __kmp_free( procarr );
3988 procarr = NULL;
3989 }
3990}
3991
3992
3993void
3994__kmp_affinity_set_init_mask(int gtid, int isa_root)
3995{
3996 if (! KMP_AFFINITY_CAPABLE()) {
3997 return;
3998 }
3999
4000 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4001 if (th->th.th_affin_mask == NULL) {
4002 KMP_CPU_ALLOC(th->th.th_affin_mask);
4003 }
4004 else {
4005 KMP_CPU_ZERO(th->th.th_affin_mask);
4006 }
4007
4008 //
4009    // Copy the thread mask to the kmp_info_t structure.
4010 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4011 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
4012 // is set, then the full mask is the same as the mask of the initialization
4013 // thread.
4014 //
4015 kmp_affin_mask_t *mask;
4016 int i;
4017
4018# if OMP_40_ENABLED
4019 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4020# endif
4021 {
4022 if ((__kmp_affinity_type == affinity_none)
4023# if KMP_MIC
4024 || (__kmp_affinity_type == affinity_balanced)
4025# endif
4026 ) {
4027# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4028 if (__kmp_num_proc_groups > 1) {
4029 return;
4030 }
4031# endif
4032 KMP_ASSERT(fullMask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004033 i = KMP_PLACE_ALL;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004034 mask = fullMask;
4035 }
4036 else {
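            // Affinity is enabled: assign masks round-robin by gtid, shifted by
            // the affinity offset setting.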
4037 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4038 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4039 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4040 }
4041 }
4042# if OMP_40_ENABLED
4043 else {
4044 if ((! isa_root)
4045 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4046# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4047 if (__kmp_num_proc_groups > 1) {
4048 return;
4049 }
4050# endif
4051 KMP_ASSERT(fullMask != NULL);
4052 i = KMP_PLACE_ALL;
4053 mask = fullMask;
4054 }
4055 else {
4056 //
4057 // int i = some hash function or just a counter that doesn't
4058 // always start at 0. Use gtid for now.
4059 //
4060 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4061 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4062 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4063 }
4064 }
4065# endif
4066
4067# if OMP_40_ENABLED
4068 th->th.th_current_place = i;
4069 if (isa_root) {
4070 th->th.th_new_place = i;
4071 th->th.th_first_place = 0;
4072 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4073 }
4074
4075 if (i == KMP_PLACE_ALL) {
4076 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4077 gtid));
4078 }
4079 else {
4080 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4081 gtid, i));
4082 }
4083# else
4084 if (i == -1) {
4085 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4086 gtid));
4087 }
4088 else {
4089 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4090 gtid, i));
4091 }
4092# endif /* OMP_40_ENABLED */
4093
4094 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4095
4096 if (__kmp_affinity_verbose) {
4097 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4098 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4099 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004100 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4101 buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004102 }
4103
4104# if KMP_OS_WINDOWS
4105 //
4106 // On Windows* OS, the process affinity mask might have changed.
4107 // If the user didn't request affinity and this call fails,
4108 // just continue silently. See CQ171393.
4109 //
4110 if ( __kmp_affinity_type == affinity_none ) {
4111 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4112 }
4113 else
4114# endif
4115 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4116}
4117
4118
4119# if OMP_40_ENABLED
4120
4121void
4122__kmp_affinity_set_place(int gtid)
4123{
4124 int retval;
4125
4126 if (! KMP_AFFINITY_CAPABLE()) {
4127 return;
4128 }
4129
4130 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4131
4132 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4133 gtid, th->th.th_new_place, th->th.th_current_place));
4134
4135 //
Alp Toker8f2d3f02014-02-24 10:40:15 +00004136 // Check that the new place is within this thread's partition.
Jim Cownie5e8470a2013-09-27 10:38:44 +00004137 //
4138 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004139 KMP_ASSERT(th->th.th_new_place >= 0);
4140 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004141 if (th->th.th_first_place <= th->th.th_last_place) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004142 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004143 && (th->th.th_new_place <= th->th.th_last_place));
4144 }
4145 else {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004146 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
Jim Cownie5e8470a2013-09-27 10:38:44 +00004147 || (th->th.th_new_place >= th->th.th_last_place));
4148 }
4149
4150 //
4151    // Copy the thread mask to the kmp_info_t structure,
4152 // and set this thread's affinity.
4153 //
4154 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4155 th->th.th_new_place);
4156 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4157 th->th.th_current_place = th->th.th_new_place;
4158
4159 if (__kmp_affinity_verbose) {
4160 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4161 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4162 th->th.th_affin_mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004163 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4164 gtid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004165 }
4166 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4167}
4168
4169# endif /* OMP_40_ENABLED */
4170
4171
4172int
4173__kmp_aux_set_affinity(void **mask)
4174{
4175 int gtid;
4176 kmp_info_t *th;
4177 int retval;
4178
4179 if (! KMP_AFFINITY_CAPABLE()) {
4180 return -1;
4181 }
4182
4183 gtid = __kmp_entry_gtid();
4184 KA_TRACE(1000, ;{
4185 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4186 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4187 (kmp_affin_mask_t *)(*mask));
4188 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4189 gtid, buf);
4190 });
4191
4192 if (__kmp_env_consistency_check) {
4193 if ((mask == NULL) || (*mask == NULL)) {
4194 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4195 }
4196 else {
4197 unsigned proc;
4198 int num_procs = 0;
4199
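            // The user-supplied mask must be non-empty and a subset of the machine's
            // full mask (and, on Windows* OS, must not span processor groups).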
4200 for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4201 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4202 continue;
4203 }
4204 num_procs++;
4205 if (! KMP_CPU_ISSET(proc, fullMask)) {
4206 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4207 break;
4208 }
4209 }
4210 if (num_procs == 0) {
4211 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4212 }
4213
4214# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4215 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4216 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4217 }
4218# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
4219
4220 }
4221 }
4222
4223 th = __kmp_threads[gtid];
4224 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4225 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4226 if (retval == 0) {
4227 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4228 }
4229
4230# if OMP_40_ENABLED
4231 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4232 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4233 th->th.th_first_place = 0;
4234 th->th.th_last_place = __kmp_affinity_num_masks - 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004235
4236 //
4237    // Turn off 4.0 affinity for the current thread at this parallel level.
4238 //
4239 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
Jim Cownie5e8470a2013-09-27 10:38:44 +00004240# endif
4241
4242 return retval;
4243}
4244
4245
4246int
4247__kmp_aux_get_affinity(void **mask)
4248{
4249 int gtid;
4250 int retval;
4251 kmp_info_t *th;
4252
4253 if (! KMP_AFFINITY_CAPABLE()) {
4254 return -1;
4255 }
4256
4257 gtid = __kmp_entry_gtid();
4258 th = __kmp_threads[gtid];
4259 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4260
4261 KA_TRACE(1000, ;{
4262 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4263 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4264 th->th.th_affin_mask);
4265 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4266 });
4267
4268 if (__kmp_env_consistency_check) {
4269 if ((mask == NULL) || (*mask == NULL)) {
4270 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4271 }
4272 }
4273
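    // On most OSes, query the live mask from the OS; on Windows* OS, return the
    // runtime's stored copy of the thread's mask instead.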
4274# if !KMP_OS_WINDOWS
4275
4276 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4277 KA_TRACE(1000, ;{
4278 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4279 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4280 (kmp_affin_mask_t *)(*mask));
4281 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4282 });
4283 return retval;
4284
4285# else
4286
4287 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4288 return 0;
4289
4290# endif /* KMP_OS_WINDOWS */
4291
4292}
4293
Jim Cownie5e8470a2013-09-27 10:38:44 +00004294int
4295__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4296{
4297 int retval;
4298
4299 if (! KMP_AFFINITY_CAPABLE()) {
4300 return -1;
4301 }
4302
4303 KA_TRACE(1000, ;{
4304 int gtid = __kmp_entry_gtid();
4305 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4306 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4307 (kmp_affin_mask_t *)(*mask));
4308 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4309 proc, gtid, buf);
4310 });
4311
4312 if (__kmp_env_consistency_check) {
4313 if ((mask == NULL) || (*mask == NULL)) {
4314 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4315 }
4316 }
4317
4318 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4319 return -1;
4320 }
4321 if (! KMP_CPU_ISSET(proc, fullMask)) {
4322 return -2;
4323 }
4324
4325 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4326 return 0;
4327}
4328
4329
4330int
4331__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4332{
4333 int retval;
4334
4335 if (! KMP_AFFINITY_CAPABLE()) {
4336 return -1;
4337 }
4338
4339 KA_TRACE(1000, ;{
4340 int gtid = __kmp_entry_gtid();
4341 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4342 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4343 (kmp_affin_mask_t *)(*mask));
4344 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4345 proc, gtid, buf);
4346 });
4347
4348 if (__kmp_env_consistency_check) {
4349 if ((mask == NULL) || (*mask == NULL)) {
4350 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4351 }
4352 }
4353
4354 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4355 return -1;
4356 }
4357 if (! KMP_CPU_ISSET(proc, fullMask)) {
4358 return -2;
4359 }
4360
4361 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4362 return 0;
4363}
4364
4365
4366int
4367__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4368{
4369 int retval;
4370
4371 if (! KMP_AFFINITY_CAPABLE()) {
4372 return -1;
4373 }
4374
4375 KA_TRACE(1000, ;{
4376 int gtid = __kmp_entry_gtid();
4377 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4378 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4379 (kmp_affin_mask_t *)(*mask));
4380 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4381 proc, gtid, buf);
4382 });
4383
4384 if (__kmp_env_consistency_check) {
4385 if ((mask == NULL) || (*mask == NULL)) {
4386 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4387 }
4388 }
4389
4390 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4391 return 0;
4392 }
4393 if (! KMP_CPU_ISSET(proc, fullMask)) {
4394 return 0;
4395 }
4396
4397 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4398}
4399
4400# if KMP_MIC
4401
4402// Dynamic affinity settings - Affinity balanced
4403void __kmp_balanced_affinity( int tid, int nthreads )
4404{
4405 if( __kmp_affinity_uniform_topology() ) {
4406 int coreID;
4407 int threadID;
4408 // Number of hyper threads per core in HT machine
4409 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4410 // Number of cores
4411 int ncores = __kmp_ncores;
4412 // How many threads will be bound to each core
4413 int chunk = nthreads / ncores;
4414        // How many cores will have an additional thread bound to them ("big" cores)
4415 int big_cores = nthreads % ncores;
4416 // Number of threads on the big cores
4417 int big_nth = ( chunk + 1 ) * big_cores;
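        // Worked example (illustrative numbers): 6 threads on 4 cores gives chunk = 1
        // and big_cores = 2, so tids 0-1 share core 0, tids 2-3 share core 1, and
        // tids 4-5 get cores 2 and 3 to themselves.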
4418 if( tid < big_nth ) {
4419 coreID = tid / (chunk + 1 );
4420 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4421 } else { //tid >= big_nth
4422 coreID = ( tid - big_cores ) / chunk;
4423 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4424 }
4425
4426 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4427 "Illegal set affinity operation when not capable");
4428
4429 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4430 KMP_CPU_ZERO(mask);
4431
4432 // Granularity == thread
4433 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4434 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4435 KMP_CPU_SET( osID, mask);
4436 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4437 for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4438 int osID;
4439 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4440 KMP_CPU_SET( osID, mask);
4441 }
4442 }
4443 if (__kmp_affinity_verbose) {
4444 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4445 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004446 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4447 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004448 }
4449 __kmp_set_system_affinity( mask, TRUE );
4450 } else { // Non-uniform topology
4451
4452 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4453 KMP_CPU_ZERO(mask);
4454
4455 // Number of hyper threads per core in HT machine
4456 int nth_per_core = __kmp_nThreadsPerCore;
4457 int core_level;
4458 if( nth_per_core > 1 ) {
4459 core_level = __kmp_aff_depth - 2;
4460 } else {
4461 core_level = __kmp_aff_depth - 1;
4462 }
4463
4464        // Number of cores - maximum value; it does not count trailing cores with 0 processors
4465 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4466
4467        // For better performance, handle the special case nthreads == __kmp_avail_proc separately
4468 if( nthreads == __kmp_avail_proc ) {
4469 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4470 int osID = address2os[ tid ].second;
4471 KMP_CPU_SET( osID, mask);
4472 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4473 int coreID = address2os[ tid ].first.labels[ core_level ];
4474                // Count the osIDs found for the current core; there can be at most nth_per_core of them;
4475                // since address2os is sorted, we can break once cnt == nth_per_core
4476 int cnt = 0;
4477 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4478 int osID = address2os[ i ].second;
4479 int core = address2os[ i ].first.labels[ core_level ];
4480 if( core == coreID ) {
4481 KMP_CPU_SET( osID, mask);
4482 cnt++;
4483 if( cnt == nth_per_core ) {
4484 break;
4485 }
4486 }
4487 }
4488 }
4489 } else if( nthreads <= __kmp_ncores ) {
4490
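            // No more than one thread per core: walk the cores that have at least one
            // usable proc and bind this thread to the tid-th such core.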
4491 int core = 0;
4492 for( int i = 0; i < ncores; i++ ) {
4493 // Check if this core from procarr[] is in the mask
4494 int in_mask = 0;
4495 for( int j = 0; j < nth_per_core; j++ ) {
4496 if( procarr[ i * nth_per_core + j ] != - 1 ) {
4497 in_mask = 1;
4498 break;
4499 }
4500 }
4501 if( in_mask ) {
4502 if( tid == core ) {
4503 for( int j = 0; j < nth_per_core; j++ ) {
4504 int osID = procarr[ i * nth_per_core + j ];
4505 if( osID != -1 ) {
4506 KMP_CPU_SET( osID, mask );
4507 // For granularity=thread it is enough to set the first available osID for this core
4508 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4509 break;
4510 }
4511 }
4512 }
4513 break;
4514 } else {
4515 core++;
4516 }
4517 }
4518 }
4519
4520 } else { // nthreads > __kmp_ncores
4521
4522 // Array to save the number of processors at each core
4523 int nproc_at_core[ ncores ];
4524 // Array to save the number of cores with "x" available processors;
4525 int ncores_with_x_procs[ nth_per_core + 1 ];
4526            // Array to save the number of cores with at least "x" procs (x from 0 up to nth_per_core)
4527 int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4528
4529 for( int i = 0; i <= nth_per_core; i++ ) {
4530 ncores_with_x_procs[ i ] = 0;
4531 ncores_with_x_to_max_procs[ i ] = 0;
4532 }
4533
4534 for( int i = 0; i < ncores; i++ ) {
4535 int cnt = 0;
4536 for( int j = 0; j < nth_per_core; j++ ) {
4537 if( procarr[ i * nth_per_core + j ] != -1 ) {
4538 cnt++;
4539 }
4540 }
4541 nproc_at_core[ i ] = cnt;
4542 ncores_with_x_procs[ cnt ]++;
4543 }
4544
4545 for( int i = 0; i <= nth_per_core; i++ ) {
4546 for( int j = i; j <= nth_per_core; j++ ) {
4547 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4548 }
4549 }
4550
4551 // Max number of processors
4552 int nproc = nth_per_core * ncores;
4553            // An array to keep the number of threads assigned to each context
4554 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4555 for( int i = 0; i < nproc; i++ ) {
4556 newarr[ i ] = 0;
4557 }
4558
4559 int nth = nthreads;
4560 int flag = 0;
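            // Spread the threads over the usable contexts: the first sweep gives each
            // context one thread; once flag is set, later sweeps stack the remaining
            // threads onto already-used contexts, cycling over the cores.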
4561 while( nth > 0 ) {
4562 for( int j = 1; j <= nth_per_core; j++ ) {
4563 int cnt = ncores_with_x_to_max_procs[ j ];
4564 for( int i = 0; i < ncores; i++ ) {
4565 // Skip the core with 0 processors
4566 if( nproc_at_core[ i ] == 0 ) {
4567 continue;
4568 }
4569 for( int k = 0; k < nth_per_core; k++ ) {
4570 if( procarr[ i * nth_per_core + k ] != -1 ) {
4571 if( newarr[ i * nth_per_core + k ] == 0 ) {
4572 newarr[ i * nth_per_core + k ] = 1;
4573 cnt--;
4574 nth--;
4575 break;
4576 } else {
4577 if( flag != 0 ) {
4578 newarr[ i * nth_per_core + k ] ++;
4579 cnt--;
4580 nth--;
4581 break;
4582 }
4583 }
4584 }
4585 }
4586 if( cnt == 0 || nth == 0 ) {
4587 break;
4588 }
4589 }
4590 if( nth == 0 ) {
4591 break;
4592 }
4593 }
4594 flag = 1;
4595 }
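            // newarr[ i ] now holds how many threads land on context i; bind this
            // thread to the context whose cumulative count first exceeds tid.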
4596 int sum = 0;
4597 for( int i = 0; i < nproc; i++ ) {
4598 sum += newarr[ i ];
4599 if( sum > tid ) {
4600 // Granularity == thread
4601 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4602 int osID = procarr[ i ];
4603 KMP_CPU_SET( osID, mask);
4604 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4605 int coreID = i / nth_per_core;
4606 for( int ii = 0; ii < nth_per_core; ii++ ) {
4607 int osID = procarr[ coreID * nth_per_core + ii ];
4608 if( osID != -1 ) {
4609 KMP_CPU_SET( osID, mask);
4610 }
4611 }
4612 }
4613 break;
4614 }
4615 }
4616 __kmp_free( newarr );
4617 }
4618
4619 if (__kmp_affinity_verbose) {
4620 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4621 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004622 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4623 tid, buf);
Jim Cownie5e8470a2013-09-27 10:38:44 +00004624 }
4625 __kmp_set_system_affinity( mask, TRUE );
4626 }
4627}
4628
4629# endif /* KMP_MIC */
4630
Jim Cownie4cc4bb42014-10-07 16:25:50 +00004631#else
4632 // affinity not supported
4633
4634kmp_uint32 mac_skipPerLevel[7];
4635kmp_uint32 mac_depth;
4636kmp_uint8 mac_leaf_kids;
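// Fallback used when affinity is not supported: construct a synthetic hierarchy
// over nproc leaves (branching factor of roughly 4) so that callers such as the
// hierarchical barrier still get a machine tree to walk.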
4637void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4638 static int first = 1;
4639 if (first) {
4640 const kmp_uint32 maxLevels = 7;
4641 kmp_uint32 numPerLevel[maxLevels];
4642
4643 for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4644 numPerLevel[i] = 1;
4645 mac_skipPerLevel[i] = 1;
4646 }
4647
4648 mac_depth = 2;
4649 numPerLevel[0] = nproc;
4650
4651 kmp_uint32 branch = 4;
4652 if (numPerLevel[0] == 1) branch = nproc/4;
4653 if (branch<4) branch=4;
4654 for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
4655 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4656 if (numPerLevel[d] & 1) numPerLevel[d]++;
4657 numPerLevel[d] = numPerLevel[d] >> 1;
4658 if (numPerLevel[d+1] == 1) mac_depth++;
4659 numPerLevel[d+1] = numPerLevel[d+1] << 1;
4660 }
4661 if(numPerLevel[0] == 1) {
4662 branch = branch >> 1;
4663 if (branch<4) branch = 4;
4664 }
4665 }
4666
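        // skip_per_level[i] is the number of leaves spanned by one node at level i
        // (the running product of the fan-outs of the levels below it).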
4667 for (kmp_uint32 i=1; i<mac_depth; ++i)
4668 mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
4669 mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4670 first=0;
4671 }
4672 thr_bar->depth = mac_depth;
4673 thr_bar->base_leaf_kids = mac_leaf_kids;
4674 thr_bar->skip_per_level = mac_skipPerLevel;
4675}
4676
Alp Toker763b9392014-02-28 09:42:41 +00004677#endif // KMP_AFFINITY_SUPPORTED