LLVM OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27  machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31  kmp_uint32 depth;
32  // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
33  if (TCR_1(machine_hierarchy.uninitialized))
34  machine_hierarchy.init(NULL, nproc);
35 
36  // Adjust the hierarchy in case num threads exceeds original
37  if (nproc > machine_hierarchy.base_num_threads)
38  machine_hierarchy.resize(nproc);
39 
40  depth = machine_hierarchy.depth;
41  KMP_DEBUG_ASSERT(depth > 0);
42 
43  thr_bar->depth = depth;
44  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
45  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 //
51 // Print the affinity mask to the character array in a pretty format.
52 //
53 #if KMP_USE_HWLOC
54 char *
55 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
56 {
57  int num_chars_to_write, num_chars_written;
58  char* scan;
59  KMP_ASSERT(buf_len >= 40);
60 
61  // bufsize of 0 just retrieves the needed buffer size.
62  num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
63 
64  // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
65  // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
66  // take into account the '\0' character.
67  if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
68  KMP_SNPRINTF(buf, buf_len, "{<empty>}");
69  } else if(num_chars_to_write < buf_len - 3) {
70  // no problem fitting the mask into buf_len number of characters
71  buf[0] = '{';
72  // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
73  num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
74  buf[num_chars_written+1] = '}';
75  buf[num_chars_written+2] = '\0';
76  } else {
77  // Need to truncate the affinity mask string and add ellipsis.
78  // To do this, we first write out the '{' + str(mask)
79  buf[0] = '{';
80  hwloc_bitmap_list_snprintf(buf+1, buf_len-1, (hwloc_bitmap_t)mask);
81  // Then go to the 7th-to-last character and scan backwards until we are NOT
82  // on a digit, then write "...}\0". This gives a clean ellipsis and avoids
83  // overwriting part of an affinity number, i.e., we avoid something like { 45, 67, 8...}
84  // and get { 45, 67,...} instead.
85  scan = buf + buf_len - 7;
86  while(*scan >= '0' && *scan <= '9' && scan >= buf)
87  scan--;
88  *(scan+1) = '.';
89  *(scan+2) = '.';
90  *(scan+3) = '.';
91  *(scan+4) = '}';
92  *(scan+5) = '\0';
93  }
94  return buf;
95 }
96 #else
97 char *
98 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
99 {
100  KMP_ASSERT(buf_len >= 40);
101  char *scan = buf;
102  char *end = buf + buf_len - 1;
103 
104  //
105  // Find first element / check for empty set.
106  //
107  size_t i;
108  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
109  if (KMP_CPU_ISSET(i, mask)) {
110  break;
111  }
112  }
113  if (i == KMP_CPU_SETSIZE) {
114  KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
115  while (*scan != '\0') scan++;
116  KMP_ASSERT(scan <= end);
117  return buf;
118  }
119 
120  KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
121  while (*scan != '\0') scan++;
122  i++;
123  for (; i < KMP_CPU_SETSIZE; i++) {
124  if (! KMP_CPU_ISSET(i, mask)) {
125  continue;
126  }
127 
128  //
129  // Check for buffer overflow. A string of the form ",<n>" will have
130  // at most 10 characters, plus we want to leave room to print ",...}"
131  // if the set is too large to print for a total of 15 characters.
132  // We already left room for '\0' in setting end.
133  //
134  if (end - scan < 15) {
135  break;
136  }
137  KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
138  while (*scan != '\0') scan++;
139  }
140  if (i < KMP_CPU_SETSIZE) {
141  KMP_SNPRINTF(scan, end-scan+1, ",...");
142  while (*scan != '\0') scan++;
143  }
144  KMP_SNPRINTF(scan, end-scan+1, "}");
145  while (*scan != '\0') scan++;
146  KMP_ASSERT(scan <= end);
147  return buf;
148 }
149 #endif // KMP_USE_HWLOC
150 
151 
152 void
153 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
154 {
155  KMP_CPU_ZERO(mask);
156 
157 # if KMP_GROUP_AFFINITY
158 
159  if (__kmp_num_proc_groups > 1) {
160  int group;
161  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
162  for (group = 0; group < __kmp_num_proc_groups; group++) {
163  int i;
164  int num = __kmp_GetActiveProcessorCount(group);
165  for (i = 0; i < num; i++) {
166  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
167  }
168  }
169  }
170  else
171 
172 # endif /* KMP_GROUP_AFFINITY */
173 
174  {
175  int proc;
176  for (proc = 0; proc < __kmp_xproc; proc++) {
177  KMP_CPU_SET(proc, mask);
178  }
179  }
180 }
181 
182 //
183 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
184 // called to renumber the labels from [0..n] and place them into the child_num
185 // vector of the address object. This is done in case the labels used for
186 // the children at one node of the hierarchy differ from those used for
187 // another node at the same level. Example: suppose the machine has 2 nodes
188 // with 2 packages each. The first node contains packages 601 and 602, and the
189 // second node contains packages 603 and 604. If we try to sort the table
190 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
191 // because we are paying attention to the labels themselves, not the ordinal
192 // child numbers. By using the child numbers in the sort, the result is
193 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
194 //
195 static void
196 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
197  int numAddrs)
198 {
199  KMP_DEBUG_ASSERT(numAddrs > 0);
200  int depth = address2os->first.depth;
201  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
202  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
203  * sizeof(unsigned));
204  int labCt;
205  for (labCt = 0; labCt < depth; labCt++) {
206  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
207  lastLabel[labCt] = address2os[0].first.labels[labCt];
208  }
209  int i;
210  for (i = 1; i < numAddrs; i++) {
211  for (labCt = 0; labCt < depth; labCt++) {
212  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
213  int labCt2;
214  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
215  counts[labCt2] = 0;
216  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
217  }
218  counts[labCt]++;
219  lastLabel[labCt] = address2os[i].first.labels[labCt];
220  break;
221  }
222  }
223  for (labCt = 0; labCt < depth; labCt++) {
224  address2os[i].first.childNums[labCt] = counts[labCt];
225  }
226  for (; labCt < (int)Address::maxDepth; labCt++) {
227  address2os[i].first.childNums[labCt] = 0;
228  }
229  }
230  __kmp_free(lastLabel);
231  __kmp_free(counts);
232 }
233 
234 
235 //
236 // All of the __kmp_affinity_create_*_map() routines should set
237 // __kmp_affinity_masks to a vector of affinity mask objects of length
238 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
239 // return the number of levels in the machine topology tree (zero if
240 // __kmp_affinity_type == affinity_none).
241 //
242 // All of the __kmp_affinity_create_*_map() routines should set *__kmp_affin_fullMask
243 // to the affinity mask for the initialization thread. They need to save and
244 // restore the mask, and it could be needed later, so saving it is just an
245 // optimization to avoid calling kmp_get_system_affinity() again.
246 //
247 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
248 
249 static int nCoresPerPkg, nPackages;
250 static int __kmp_nThreadsPerCore;
251 #ifndef KMP_DFLT_NTH_CORES
252 static int __kmp_ncores;
253 #endif
254 static int *__kmp_pu_os_idx = NULL;
255 
256 //
257 // __kmp_affinity_uniform_topology() doesn't work when called from
258 // places which support arbitrarily many levels in the machine topology
259 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
260 // and __kmp_affinity_create_x2apicid_map().
261 //
262 inline static bool
263 __kmp_affinity_uniform_topology()
264 {
265  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
266 }
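// Illustrative example (hypothetical numbers): a machine modeled as
// 2 packages x 8 cores per package x 2 threads per core is reported as
// uniform only if __kmp_avail_proc == 2 * 8 * 2 == 32; if some procs are
// excluded by the initial affinity mask, the product no longer matches and
// the topology is treated as non-uniform.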
267 
268 
269 //
270 // Print out the detailed machine topology map, i.e. the physical locations
271 // of each OS proc.
272 //
273 static void
274 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
275  int pkgLevel, int coreLevel, int threadLevel)
276 {
277  int proc;
278 
279  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
280  for (proc = 0; proc < len; proc++) {
281  int level;
282  kmp_str_buf_t buf;
283  __kmp_str_buf_init(&buf);
284  for (level = 0; level < depth; level++) {
285  if (level == threadLevel) {
286  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
287  }
288  else if (level == coreLevel) {
289  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
290  }
291  else if (level == pkgLevel) {
292  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
293  }
294  else if (level > pkgLevel) {
295  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
296  level - pkgLevel - 1);
297  }
298  else {
299  __kmp_str_buf_print(&buf, "L%d ", level);
300  }
301  __kmp_str_buf_print(&buf, "%d ",
302  address2os[proc].first.labels[level]);
303  }
304  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
305  buf.str);
306  __kmp_str_buf_free(&buf);
307  }
308 }
309 
310 #if KMP_USE_HWLOC
311 
312 // This function removes the topology levels that are radix 1 and don't offer
313 // further information about the topology. The most common example is when there
314 // is one thread context per core; we don't want the extra thread context
315 // level if it offers no unique labels, so it is removed.
316 // return value: the new depth of address2os
317 static int
318 __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
319  int level;
320  int i;
321  int radix1_detected;
322 
323  for (level = depth-1; level >= 0; --level) {
324  // Always keep the package level
325  if (level == *pkgLevel)
326  continue;
327  // Detect if this level is radix 1
328  radix1_detected = 1;
329  for (i = 1; i < nActiveThreads; ++i) {
330  if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
331  // There are differing label values for this level so it stays
332  radix1_detected = 0;
333  break;
334  }
335  }
336  if (!radix1_detected)
337  continue;
338  // Radix 1 was detected
339  if (level == *threadLevel) {
340  // If only one thread per core, then just decrement
341  // the depth which removes the threadlevel from address2os
342  for (i = 0; i < nActiveThreads; ++i) {
343  address2os[i].first.depth--;
344  }
345  *threadLevel = -1;
346  } else if (level == *coreLevel) {
347  // For core level, we move the thread labels over if they are still
348  // valid (*threadLevel != -1), and also reduce the depth another level
349  for (i = 0; i < nActiveThreads; ++i) {
350  if (*threadLevel != -1) {
351  address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
352  }
353  address2os[i].first.depth--;
354  }
355  *coreLevel = -1;
356  }
357  }
358  return address2os[0].first.depth;
359 }
360 
361 // Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
362 // e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
363 // this will return the number of PU's under the SOCKET object.
364 static int
365 __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
366  int retval = 0;
367  hwloc_obj_t first;
368  for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
369  first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
370  first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
371  {
372  ++retval;
373  }
374  return retval;
375 }
376 
377 static int
378 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
379  kmp_i18n_id_t *const msg_id)
380 {
381  *address2os = NULL;
382  *msg_id = kmp_i18n_null;
383 
384  //
385  // Save the affinity mask for the current thread.
386  //
387  kmp_affin_mask_t *oldMask;
388  KMP_CPU_ALLOC(oldMask);
389  __kmp_get_system_affinity(oldMask, TRUE);
390 
391  int depth = 3;
392  int pkgLevel = 0;
393  int coreLevel = 1;
394  int threadLevel = 2;
395 
396  if (! KMP_AFFINITY_CAPABLE())
397  {
398  //
399  // Hack to try and infer the machine topology using only the data
400  // available from hwloc on the current thread, and __kmp_xproc.
401  //
402  KMP_ASSERT(__kmp_affinity_type == affinity_none);
403 
404  nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
405  __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
406  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
407  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
408  if (__kmp_affinity_verbose) {
409  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
410  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
411  if (__kmp_affinity_uniform_topology()) {
412  KMP_INFORM(Uniform, "KMP_AFFINITY");
413  } else {
414  KMP_INFORM(NonUniform, "KMP_AFFINITY");
415  }
416  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
417  __kmp_nThreadsPerCore, __kmp_ncores);
418  }
419  KMP_CPU_FREE(oldMask);
420  return 0;
421  }
422 
423  //
424  // Allocate the data structure to be returned.
425  //
426  AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
427  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
428 
429  //
430  // When affinity is off, this routine will still be called to set
431  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
432  // nCoresPerPkg, & nPackages. Make sure all these vars are set
433  // correctly, and return if affinity is not enabled.
434  //
435 
436  hwloc_obj_t pu;
437  hwloc_obj_t core;
438  hwloc_obj_t socket;
439  int nActiveThreads = 0;
440  int socket_identifier = 0;
441  // re-calculate globals to count only accessible resources
442  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
443  for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
444  socket != NULL;
445  socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
446  socket_identifier++)
447  {
448  int core_identifier = 0;
449  int num_active_cores = 0;
450  for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
451  core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
452  core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
453  core_identifier++)
454  {
455  int pu_identifier = 0;
456  int num_active_threads = 0;
457  for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
458  pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
459  pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
460  pu_identifier++)
461  {
462  Address addr(3);
463  if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
464  continue; // skip inactive (inaccessible) unit
465  KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
466  socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
467  addr.labels[0] = socket_identifier; // package
468  addr.labels[1] = core_identifier; // core
469  addr.labels[2] = pu_identifier; // pu
470  retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
471  __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
472  nActiveThreads++;
473  ++num_active_threads; // count active threads per core
474  }
475  if (num_active_threads) { // were there any active threads on the core?
476  ++__kmp_ncores; // count total active cores
477  ++num_active_cores; // count active cores per socket
478  if (num_active_threads > __kmp_nThreadsPerCore)
479  __kmp_nThreadsPerCore = num_active_threads; // calc maximum
480  }
481  }
482  if (num_active_cores) { // were there any active cores on the socket?
483  ++nPackages; // count total active packages
484  if (num_active_cores > nCoresPerPkg)
485  nCoresPerPkg = num_active_cores; // calc maximum
486  }
487  }
488 
489  //
490  // If there's only one thread context to bind to, return now.
491  //
492  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
493  KMP_ASSERT(nActiveThreads > 0);
494  if (nActiveThreads == 1) {
495  __kmp_ncores = nPackages = 1;
496  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
497  if (__kmp_affinity_verbose) {
498  char buf[KMP_AFFIN_MASK_PRINT_LEN];
499  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
500 
501  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
502  if (__kmp_affinity_respect_mask) {
503  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
504  } else {
505  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
506  }
507  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
508  KMP_INFORM(Uniform, "KMP_AFFINITY");
509  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
510  __kmp_nThreadsPerCore, __kmp_ncores);
511  }
512 
513  if (__kmp_affinity_type == affinity_none) {
514  __kmp_free(retval);
515  KMP_CPU_FREE(oldMask);
516  return 0;
517  }
518 
519  //
520  // Form an Address object which only includes the package level.
521  //
522  Address addr(1);
523  addr.labels[0] = retval[0].first.labels[pkgLevel];
524  retval[0].first = addr;
525 
526  if (__kmp_affinity_gran_levels < 0) {
527  __kmp_affinity_gran_levels = 0;
528  }
529 
530  if (__kmp_affinity_verbose) {
531  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
532  }
533 
534  *address2os = retval;
535  KMP_CPU_FREE(oldMask);
536  return 1;
537  }
538 
539  //
540  // Sort the table by physical Id.
541  //
542  qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
543 
544  //
545  // Check to see if the machine topology is uniform
546  //
547  unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
548 
549  //
550  // Print the machine topology summary.
551  //
552  if (__kmp_affinity_verbose) {
553  char mask[KMP_AFFIN_MASK_PRINT_LEN];
554  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
555 
556  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
557  if (__kmp_affinity_respect_mask) {
558  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
559  } else {
560  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
561  }
562  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
563  if (uniform) {
564  KMP_INFORM(Uniform, "KMP_AFFINITY");
565  } else {
566  KMP_INFORM(NonUniform, "KMP_AFFINITY");
567  }
568 
569  kmp_str_buf_t buf;
570  __kmp_str_buf_init(&buf);
571 
572  __kmp_str_buf_print(&buf, "%d", nPackages);
573  //for (level = 1; level <= pkgLevel; level++) {
574  // __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
575  // }
576  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
577  __kmp_nThreadsPerCore, __kmp_ncores);
578 
579  __kmp_str_buf_free(&buf);
580  }
581 
582  if (__kmp_affinity_type == affinity_none) {
583  __kmp_free(retval);
584  KMP_CPU_FREE(oldMask);
585  return 0;
586  }
587 
588  //
589  // Find any levels with radix 1, and remove them from the map
590  // (except for the package level).
591  //
592  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
593 
594  if (__kmp_affinity_gran_levels < 0) {
595  //
596  // Set the granularity level based on what levels are modeled
597  // in the machine topology map.
598  //
599  __kmp_affinity_gran_levels = 0;
600  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
601  __kmp_affinity_gran_levels++;
602  }
603  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
604  __kmp_affinity_gran_levels++;
605  }
606  if (__kmp_affinity_gran > affinity_gran_package) {
607  __kmp_affinity_gran_levels++;
608  }
609  }
610 
611  if (__kmp_affinity_verbose) {
612  __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
613  coreLevel, threadLevel);
614  }
615 
616  KMP_CPU_FREE(oldMask);
617  *address2os = retval;
618  return depth;
619 }
620 #endif // KMP_USE_HWLOC
621 
622 //
623 // If we don't know how to retrieve the machine's processor topology, or
624 // encounter an error in doing so, this routine is called to form a "flat"
625 // mapping of os thread id's <-> processor id's.
626 //
627 static int
628 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
629  kmp_i18n_id_t *const msg_id)
630 {
631  *address2os = NULL;
632  *msg_id = kmp_i18n_null;
633 
634  //
635  // Even if __kmp_affinity_type == affinity_none, this routine might still be
636  // called to set __kmp_ncores, as well as
637  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
638  //
639  if (! KMP_AFFINITY_CAPABLE()) {
640  KMP_ASSERT(__kmp_affinity_type == affinity_none);
641  __kmp_ncores = nPackages = __kmp_xproc;
642  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
643  if (__kmp_affinity_verbose) {
644  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
645  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
646  KMP_INFORM(Uniform, "KMP_AFFINITY");
647  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
648  __kmp_nThreadsPerCore, __kmp_ncores);
649  }
650  return 0;
651  }
652 
653  //
654  // When affinity is off, this routine will still be called to set
655  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
656  // nCoresPerPkg, & nPackages. Make sure all these vars are set
657  // correctly, and return now if affinity is not enabled.
658  //
659  __kmp_ncores = nPackages = __kmp_avail_proc;
660  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
661  if (__kmp_affinity_verbose) {
662  char buf[KMP_AFFIN_MASK_PRINT_LEN];
663  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
664 
665  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
666  if (__kmp_affinity_respect_mask) {
667  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
668  } else {
669  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
670  }
671  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
672  KMP_INFORM(Uniform, "KMP_AFFINITY");
673  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
674  __kmp_nThreadsPerCore, __kmp_ncores);
675  }
676  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
677  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
678  if (__kmp_affinity_type == affinity_none) {
679  int avail_ct = 0;
680  unsigned int i;
681  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
682  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
683  continue;
684  __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
685  }
686  return 0;
687  }
688 
689  //
690  // Construct the data structure to be returned.
691  //
692  *address2os = (AddrUnsPair*)
693  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
694  int avail_ct = 0;
695  unsigned int i;
696  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
697  //
698  // Skip this proc if it is not included in the machine model.
699  //
700  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
701  continue;
702  }
703  __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
704  Address addr(1);
705  addr.labels[0] = i;
706  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
707  }
708  if (__kmp_affinity_verbose) {
709  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
710  }
711 
712  if (__kmp_affinity_gran_levels < 0) {
713  //
714  // Only the package level is modeled in the machine topology map,
715  // so the #levels of granularity is either 0 or 1.
716  //
717  if (__kmp_affinity_gran > affinity_gran_package) {
718  __kmp_affinity_gran_levels = 1;
719  }
720  else {
721  __kmp_affinity_gran_levels = 0;
722  }
723  }
724  return 1;
725 }
726 
727 
728 # if KMP_GROUP_AFFINITY
729 
730 //
731 // If multiple Windows* OS processor groups exist, we can create a 2-level
732 // topology map with the groups at level 0 and the individual procs at
733 // level 1.
734 //
735 // This facilitates letting the threads float among all procs in a group,
736 // if granularity=group (the default when there are multiple groups).
737 //
738 static int
739 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
740  kmp_i18n_id_t *const msg_id)
741 {
742  *address2os = NULL;
743  *msg_id = kmp_i18n_null;
744 
745  //
746  // If we don't have multiple processor groups, return now.
747  // The flat mapping will be used.
748  //
749  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
750  // FIXME set *msg_id
751  return -1;
752  }
753 
754  //
755  // Construct the data structure to be returned.
756  //
757  *address2os = (AddrUnsPair*)
758  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
759  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
760  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
761  int avail_ct = 0;
762  int i;
763  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
764  //
765  // Skip this proc if it is not included in the machine model.
766  //
767  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
768  continue;
769  }
770  __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
771  Address addr(2);
772  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
773  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
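 // Illustrative example (assuming a 64-bit DWORD_PTR, so CHAR_BIT * sizeof(DWORD_PTR) == 64):
 // OS proc 70 gets labels {1, 6}, i.e. processor group 1, bit 6 within that
 // group (70 / 64 == 1, 70 % 64 == 6).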
774  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
775 
776  if (__kmp_affinity_verbose) {
777  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
778  addr.labels[1]);
779  }
780  }
781 
782  if (__kmp_affinity_gran_levels < 0) {
783  if (__kmp_affinity_gran == affinity_gran_group) {
784  __kmp_affinity_gran_levels = 1;
785  }
786  else if ((__kmp_affinity_gran == affinity_gran_fine)
787  || (__kmp_affinity_gran == affinity_gran_thread)) {
788  __kmp_affinity_gran_levels = 0;
789  }
790  else {
791  const char *gran_str = NULL;
792  if (__kmp_affinity_gran == affinity_gran_core) {
793  gran_str = "core";
794  }
795  else if (__kmp_affinity_gran == affinity_gran_package) {
796  gran_str = "package";
797  }
798  else if (__kmp_affinity_gran == affinity_gran_node) {
799  gran_str = "node";
800  }
801  else {
802  KMP_ASSERT(0);
803  }
804 
805  // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
806  __kmp_affinity_gran_levels = 0;
807  }
808  }
809  return 2;
810 }
811 
812 # endif /* KMP_GROUP_AFFINITY */
813 
814 
815 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
816 
817 static int
818 __kmp_cpuid_mask_width(int count) {
819  int r = 0;
820 
821  while((1<<r) < count)
822  ++r;
823  return r;
824 }
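// A few illustrative values: __kmp_cpuid_mask_width() returns the smallest r
// such that (1 << r) >= count, i.e. the number of bits needed to encode
// 'count' distinct ids:
//   count = 1 -> 0,  count = 2 -> 1,  count = 3 -> 2,
//   count = 4 -> 2,  count = 6 -> 3,  count = 16 -> 4.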
825 
826 
827 class apicThreadInfo {
828 public:
829  unsigned osId; // param to __kmp_affinity_bind_thread
830  unsigned apicId; // from cpuid after binding
831  unsigned maxCoresPerPkg; // ""
832  unsigned maxThreadsPerPkg; // ""
833  unsigned pkgId; // inferred from above values
834  unsigned coreId; // ""
835  unsigned threadId; // ""
836 };
837 
838 
839 static int
840 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
841 {
842  const apicThreadInfo *aa = (const apicThreadInfo *)a;
843  const apicThreadInfo *bb = (const apicThreadInfo *)b;
844  if (aa->osId < bb->osId) return -1;
845  if (aa->osId > bb->osId) return 1;
846  return 0;
847 }
848 
849 
850 static int
851 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
852 {
853  const apicThreadInfo *aa = (const apicThreadInfo *)a;
854  const apicThreadInfo *bb = (const apicThreadInfo *)b;
855  if (aa->pkgId < bb->pkgId) return -1;
856  if (aa->pkgId > bb->pkgId) return 1;
857  if (aa->coreId < bb->coreId) return -1;
858  if (aa->coreId > bb->coreId) return 1;
859  if (aa->threadId < bb->threadId) return -1;
860  if (aa->threadId > bb->threadId) return 1;
861  return 0;
862 }
863 
864 
865 //
866 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
867 // an algorithm which cycles through the available os threads, setting
868 // the current thread's affinity mask to that thread, and then retrieves
869 // the Apic Id for each thread context using the cpuid instruction.
870 //
871 static int
872 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
873  kmp_i18n_id_t *const msg_id)
874 {
875  kmp_cpuid buf;
876  int rc;
877  *address2os = NULL;
878  *msg_id = kmp_i18n_null;
879 
880  //
881  // Check if cpuid leaf 4 is supported.
882  //
883  __kmp_x86_cpuid(0, 0, &buf);
884  if (buf.eax < 4) {
885  *msg_id = kmp_i18n_str_NoLeaf4Support;
886  return -1;
887  }
888 
889  //
890  // The algorithm used starts by setting the affinity to each available
891  // thread and retrieving info from the cpuid instruction, so if we are
892  // not capable of calling __kmp_get_system_affinity() and
893  // __kmp_set_system_affinity(), then we need to do something else - use
894  // the defaults that we calculated from issuing cpuid without binding
895  // to each proc.
896  //
897  if (! KMP_AFFINITY_CAPABLE()) {
898  //
899  // Hack to try and infer the machine topology using only the data
900  // available from cpuid on the current thread, and __kmp_xproc.
901  //
902  KMP_ASSERT(__kmp_affinity_type == affinity_none);
903 
904  //
905  // Get an upper bound on the number of threads per package using
906  // cpuid(1).
907  //
908  // On some OS/chip combinations where HT is supported by the chip
909  // but is disabled, this value will be 2 on a single core chip.
910  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
911  //
912  __kmp_x86_cpuid(1, 0, &buf);
913  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
914  if (maxThreadsPerPkg == 0) {
915  maxThreadsPerPkg = 1;
916  }
917 
918  //
919  // The num cores per pkg comes from cpuid(4).
920  // 1 must be added to the encoded value.
921  //
922  // The author of cpu_count.cpp treated this as only an upper bound
923  // on the number of cores, but I haven't seen any cases where it
924  // was greater than the actual number of cores, so we will treat
925  // it as exact in this block of code.
926  //
927  // First, we need to check if cpuid(4) is supported on this chip.
928  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
929  // has the value n or greater.
930  //
931  __kmp_x86_cpuid(0, 0, &buf);
932  if (buf.eax >= 4) {
933  __kmp_x86_cpuid(4, 0, &buf);
934  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
935  }
936  else {
937  nCoresPerPkg = 1;
938  }
939 
940  //
941  // There is no way to reliably tell if HT is enabled without issuing
942  // the cpuid instruction from every thread, and correlating the cpuid
943  // info, so if the machine is not affinity capable, we assume that HT
944  // is off. We have seen quite a few machines where maxThreadsPerPkg
945  // is 2, yet the machine does not support HT.
946  //
947  // - Older OSes are usually found on machines with older chips, which
948  // do not support HT.
949  //
950  // - The performance penalty for mistakenly identifying a machine as
951  // HT when it isn't (which results in blocktime being incorrectly set
952  // to 0) is greater than the penalty for mistakenly identifying
953  // a machine as being 1 thread/core when it is really HT enabled
954  // (which results in blocktime being incorrectly set to a positive
955  // value).
956  //
957  __kmp_ncores = __kmp_xproc;
958  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
959  __kmp_nThreadsPerCore = 1;
960  if (__kmp_affinity_verbose) {
961  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
962  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
963  if (__kmp_affinity_uniform_topology()) {
964  KMP_INFORM(Uniform, "KMP_AFFINITY");
965  } else {
966  KMP_INFORM(NonUniform, "KMP_AFFINITY");
967  }
968  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
969  __kmp_nThreadsPerCore, __kmp_ncores);
970  }
971  return 0;
972  }
973 
974  //
975  //
976  // From here on, we can assume that it is safe to call
977  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
978  // even if __kmp_affinity_type = affinity_none.
979  //
980 
981  //
982  // Save the affinity mask for the current thread.
983  //
984  kmp_affin_mask_t *oldMask;
985  KMP_CPU_ALLOC(oldMask);
986  KMP_ASSERT(oldMask != NULL);
987  __kmp_get_system_affinity(oldMask, TRUE);
988 
989  //
990  // Run through each of the available contexts, binding the current thread
991  // to it, and obtaining the pertinent information using the cpuid instr.
992  //
993  // The relevant information is:
994  //
995  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
996  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
997  //
998  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
999  // value of this field determines the width of the core# + thread#
1000  // fields in the Apic Id. It is also an upper bound on the number
1001  // of threads per package, but it has been verified that situations
1002  // happen where it is not exact. In particular, on certain OS/chip
1003  // combinations where Intel(R) Hyper-Threading Technology is supported
1004  // by the chip but has
1005  // been disabled, the value of this field will be 2 (for a single core
1006  // chip). On other OS/chip combinations supporting
1007  // Intel(R) Hyper-Threading Technology, the value of
1008  // this field will be 1 when Intel(R) Hyper-Threading Technology is
1009  // disabled and 2 when it is enabled.
1010  //
1011  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
1012  // value of this field (+1) determines the width of the core# field in
1013  // the Apic Id. The comments in "cpucount.cpp" say that this value is
1014  // an upper bound, but the IA-32 architecture manual says that it is
1015  // exactly the number of cores per package, and I haven't seen any
1016  // case where it wasn't.
1017  //
1018  // From this information, deduce the package Id, core Id, and thread Id,
1019  // and set the corresponding fields in the apicThreadInfo struct.
1020  //
1021  unsigned i;
1022  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1023  __kmp_avail_proc * sizeof(apicThreadInfo));
1024  unsigned nApics = 0;
1025  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1026  //
1027  // Skip this proc if it is not included in the machine model.
1028  //
1029  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1030  continue;
1031  }
1032  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1033 
1034  __kmp_affinity_bind_thread(i);
1035  threadInfo[nApics].osId = i;
1036 
1037  //
1038  // The apic id and max threads per pkg come from cpuid(1).
1039  //
1040  __kmp_x86_cpuid(1, 0, &buf);
1041  if (!((buf.edx >> 9) & 1)) { // check the APIC on-chip flag (cpuid(1) edx bit 9)
1042  __kmp_set_system_affinity(oldMask, TRUE);
1043  __kmp_free(threadInfo);
1044  KMP_CPU_FREE(oldMask);
1045  *msg_id = kmp_i18n_str_ApicNotPresent;
1046  return -1;
1047  }
1048  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1049  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1050  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1051  threadInfo[nApics].maxThreadsPerPkg = 1;
1052  }
1053 
1054  //
1055  // Max cores per pkg comes from cpuid(4).
1056  // 1 must be added to the encoded value.
1057  //
1058  // First, we need to check if cpuid(4) is supported on this chip.
1059  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1060  // has the value n or greater.
1061  //
1062  __kmp_x86_cpuid(0, 0, &buf);
1063  if (buf.eax >= 4) {
1064  __kmp_x86_cpuid(4, 0, &buf);
1065  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1066  }
1067  else {
1068  threadInfo[nApics].maxCoresPerPkg = 1;
1069  }
1070 
1071  //
1072  // Infer the pkgId / coreId / threadId using only the info
1073  // obtained locally.
1074  //
1075  int widthCT = __kmp_cpuid_mask_width(
1076  threadInfo[nApics].maxThreadsPerPkg);
1077  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1078 
1079  int widthC = __kmp_cpuid_mask_width(
1080  threadInfo[nApics].maxCoresPerPkg);
1081  int widthT = widthCT - widthC;
1082  if (widthT < 0) {
1083  //
1084  // I've never seen this one happen, but I suppose it could, if
1085  // the cpuid instruction on a chip was really screwed up.
1086  // Make sure to restore the affinity mask before the tail call.
1087  //
1088  __kmp_set_system_affinity(oldMask, TRUE);
1089  __kmp_free(threadInfo);
1090  KMP_CPU_FREE(oldMask);
1091  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1092  return -1;
1093  }
1094 
1095  int maskC = (1 << widthC) - 1;
1096  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1097  &maskC;
1098 
1099  int maskT = (1 << widthT) - 1;
1100  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
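 // Worked example with hypothetical values: if maxThreadsPerPkg == 16 and
 // maxCoresPerPkg == 8, then widthCT == 4, widthC == 3 and widthT == 1.
 // For apicId 0x2D (binary 10 110 1) this yields pkgId == 2, coreId == 6
 // and threadId == 1.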
1101 
1102  nApics++;
1103  }
1104 
1105  //
1106  // We've collected all the info we need.
1107  // Restore the old affinity mask for this thread.
1108  //
1109  __kmp_set_system_affinity(oldMask, TRUE);
1110 
1111  //
1112  // If there's only one thread context to bind to, form an Address object
1113  // with depth 1 and return immediately (or, if affinity is off, set
1114  // address2os to NULL and return).
1115  //
1116  // If it is configured to omit the package level when there is only a
1117  // single package, the logic at the end of this routine won't work if
1118  // there is only a single thread - it would try to form an Address
1119  // object with depth 0.
1120  //
1121  KMP_ASSERT(nApics > 0);
1122  if (nApics == 1) {
1123  __kmp_ncores = nPackages = 1;
1124  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1125  if (__kmp_affinity_verbose) {
1126  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1127  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1128 
1129  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1130  if (__kmp_affinity_respect_mask) {
1131  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1132  } else {
1133  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1134  }
1135  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1136  KMP_INFORM(Uniform, "KMP_AFFINITY");
1137  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1138  __kmp_nThreadsPerCore, __kmp_ncores);
1139  }
1140 
1141  if (__kmp_affinity_type == affinity_none) {
1142  __kmp_free(threadInfo);
1143  KMP_CPU_FREE(oldMask);
1144  return 0;
1145  }
1146 
1147  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1148  Address addr(1);
1149  addr.labels[0] = threadInfo[0].pkgId;
1150  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1151 
1152  if (__kmp_affinity_gran_levels < 0) {
1153  __kmp_affinity_gran_levels = 0;
1154  }
1155 
1156  if (__kmp_affinity_verbose) {
1157  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1158  }
1159 
1160  __kmp_free(threadInfo);
1161  KMP_CPU_FREE(oldMask);
1162  return 1;
1163  }
1164 
1165  //
1166  // Sort the threadInfo table by physical Id.
1167  //
1168  qsort(threadInfo, nApics, sizeof(*threadInfo),
1169  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1170 
1171  //
1172  // The table is now sorted by pkgId / coreId / threadId, but we really
1173  // don't know the radix of any of the fields. pkgId's may be sparsely
1174  // assigned among the chips on a system. Although coreId's are usually
1175  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1176  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1177  //
1178  // For that matter, we don't know what coresPerPkg and threadsPerCore
1179  // (or the total # packages) are at this point - we want to determine
1180  // that now. We only have an upper bound on the first two figures.
1181  //
1182  // We also perform a consistency check at this point: the values returned
1183  // by the cpuid instruction for any thread bound to a given package had
1184  // better be the same for maxThreadsPerPkg and maxCoresPerPkg.
1185  //
1186  nPackages = 1;
1187  nCoresPerPkg = 1;
1188  __kmp_nThreadsPerCore = 1;
1189  unsigned nCores = 1;
1190 
1191  unsigned pkgCt = 1; // to determine radii
1192  unsigned lastPkgId = threadInfo[0].pkgId;
1193  unsigned coreCt = 1;
1194  unsigned lastCoreId = threadInfo[0].coreId;
1195  unsigned threadCt = 1;
1196  unsigned lastThreadId = threadInfo[0].threadId;
1197 
1198  // intra-pkg consistency checks
1199  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1200  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1201 
1202  for (i = 1; i < nApics; i++) {
1203  if (threadInfo[i].pkgId != lastPkgId) {
1204  nCores++;
1205  pkgCt++;
1206  lastPkgId = threadInfo[i].pkgId;
1207  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1208  coreCt = 1;
1209  lastCoreId = threadInfo[i].coreId;
1210  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1211  threadCt = 1;
1212  lastThreadId = threadInfo[i].threadId;
1213 
1214  //
1215  // This is a different package, so go on to the next iteration
1216  // without doing any consistency checks. Reset the consistency
1217  // check vars, though.
1218  //
1219  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1220  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1221  continue;
1222  }
1223 
1224  if (threadInfo[i].coreId != lastCoreId) {
1225  nCores++;
1226  coreCt++;
1227  lastCoreId = threadInfo[i].coreId;
1228  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1229  threadCt = 1;
1230  lastThreadId = threadInfo[i].threadId;
1231  }
1232  else if (threadInfo[i].threadId != lastThreadId) {
1233  threadCt++;
1234  lastThreadId = threadInfo[i].threadId;
1235  }
1236  else {
1237  __kmp_free(threadInfo);
1238  KMP_CPU_FREE(oldMask);
1239  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1240  return -1;
1241  }
1242 
1243  //
1244  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1245  // fields agree between all the threads bound to a given package.
1246  //
1247  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1248  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1249  __kmp_free(threadInfo);
1250  KMP_CPU_FREE(oldMask);
1251  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1252  return -1;
1253  }
1254  }
1255  nPackages = pkgCt;
1256  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1257  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1258 
1259  //
1260  // When affinity is off, this routine will still be called to set
1261  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1262  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1263  // correctly, and return now if affinity is not enabled.
1264  //
1265  __kmp_ncores = nCores;
1266  if (__kmp_affinity_verbose) {
1267  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1268  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1269 
1270  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1271  if (__kmp_affinity_respect_mask) {
1272  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1273  } else {
1274  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1275  }
1276  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1277  if (__kmp_affinity_uniform_topology()) {
1278  KMP_INFORM(Uniform, "KMP_AFFINITY");
1279  } else {
1280  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1281  }
1282  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1283  __kmp_nThreadsPerCore, __kmp_ncores);
1284 
1285  }
1286  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1287  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1288  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1289  for (i = 0; i < nApics; ++i) {
1290  __kmp_pu_os_idx[i] = threadInfo[i].osId;
1291  }
1292  if (__kmp_affinity_type == affinity_none) {
1293  __kmp_free(threadInfo);
1294  KMP_CPU_FREE(oldMask);
1295  return 0;
1296  }
1297 
1298  //
1299  // Now that we've determined the number of packages, the number of cores
1300  // per package, and the number of threads per core, we can construct the
1301  // data structure that is to be returned.
1302  //
1303  int pkgLevel = 0;
1304  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1305  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1306  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
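 // For example, on a hypothetical machine reporting 4 cores per package and
 // 2 threads per core, pkgLevel == 0, coreLevel == 1, threadLevel == 2 and
 // depth == 3; with only 1 thread per core, threadLevel == -1 and depth == 2.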
1307 
1308  KMP_ASSERT(depth > 0);
1309  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1310 
1311  for (i = 0; i < nApics; ++i) {
1312  Address addr(depth);
1313  unsigned os = threadInfo[i].osId;
1314  int d = 0;
1315 
1316  if (pkgLevel >= 0) {
1317  addr.labels[d++] = threadInfo[i].pkgId;
1318  }
1319  if (coreLevel >= 0) {
1320  addr.labels[d++] = threadInfo[i].coreId;
1321  }
1322  if (threadLevel >= 0) {
1323  addr.labels[d++] = threadInfo[i].threadId;
1324  }
1325  (*address2os)[i] = AddrUnsPair(addr, os);
1326  }
1327 
1328  if (__kmp_affinity_gran_levels < 0) {
1329  //
1330  // Set the granularity level based on what levels are modeled
1331  // in the machine topology map.
1332  //
1333  __kmp_affinity_gran_levels = 0;
1334  if ((threadLevel >= 0)
1335  && (__kmp_affinity_gran > affinity_gran_thread)) {
1336  __kmp_affinity_gran_levels++;
1337  }
1338  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1339  __kmp_affinity_gran_levels++;
1340  }
1341  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1342  __kmp_affinity_gran_levels++;
1343  }
1344  }
1345 
1346  if (__kmp_affinity_verbose) {
1347  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1348  coreLevel, threadLevel);
1349  }
1350 
1351  __kmp_free(threadInfo);
1352  KMP_CPU_FREE(oldMask);
1353  return depth;
1354 }
1355 
1356 
1357 //
1358 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1359 // architectures support a newer interface for specifying the x2APIC Ids,
1360 // based on cpuid leaf 11.
1361 //
1362 static int
1363 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1364  kmp_i18n_id_t *const msg_id)
1365 {
1366  kmp_cpuid buf;
1367 
1368  *address2os = NULL;
1369  *msg_id = kmp_i18n_null;
1370 
1371  //
1372  // Check to see if cpuid leaf 11 is supported.
1373  //
1374  __kmp_x86_cpuid(0, 0, &buf);
1375  if (buf.eax < 11) {
1376  *msg_id = kmp_i18n_str_NoLeaf11Support;
1377  return -1;
1378  }
1379  __kmp_x86_cpuid(11, 0, &buf);
1380  if (buf.ebx == 0) {
1381  *msg_id = kmp_i18n_str_NoLeaf11Support;
1382  return -1;
1383  }
1384 
1385  //
1386  // Find the number of levels in the machine topology. While we're at it,
1387  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1388  // try to get more accurate values later by explicitly counting them,
1389  // but get reasonable defaults now, in case we return early.
1390  //
1391  int level;
1392  int threadLevel = -1;
1393  int coreLevel = -1;
1394  int pkgLevel = -1;
1395  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1396 
1397  for (level = 0;; level++) {
1398  if (level > 31) {
1399  //
1400  // FIXME: Hack for DPD200163180
1401  //
1402  // If level is big then something went wrong -> exiting
1403  //
1404  // There could actually be 32 valid levels in the machine topology,
1405  // but so far, the only machine we have seen which does not exit
1406  // this loop before iteration 32 has fubar x2APIC settings.
1407  //
1408  // For now, just reject this case based upon loop trip count.
1409  //
1410  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1411  return -1;
1412  }
1413  __kmp_x86_cpuid(11, level, &buf);
1414  if (buf.ebx == 0) {
1415  if (pkgLevel < 0) {
1416  //
1417  // Will infer nPackages from __kmp_xproc
1418  //
1419  pkgLevel = level;
1420  level++;
1421  }
1422  break;
1423  }
1424  int kind = (buf.ecx >> 8) & 0xff;
1425  if (kind == 1) {
1426  //
1427  // SMT level
1428  //
1429  threadLevel = level;
1430  coreLevel = -1;
1431  pkgLevel = -1;
1432  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1433  if (__kmp_nThreadsPerCore == 0) {
1434  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1435  return -1;
1436  }
1437  }
1438  else if (kind == 2) {
1439  //
1440  // core level
1441  //
1442  coreLevel = level;
1443  pkgLevel = -1;
1444  nCoresPerPkg = buf.ebx & 0xff;
1445  if (nCoresPerPkg == 0) {
1446  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1447  return -1;
1448  }
1449  }
1450  else {
1451  if (level <= 0) {
1452  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1453  return -1;
1454  }
1455  if (pkgLevel >= 0) {
1456  continue;
1457  }
1458  pkgLevel = level;
1459  nPackages = buf.ebx & 0xff;
1460  if (nPackages == 0) {
1461  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1462  return -1;
1463  }
1464  }
1465  }
1466  int depth = level;
1467 
1468  //
1469  // In the above loop, "level" was counted from the finest level (usually
1470  // thread) to the coarsest. The caller expects that we will place the
1471  // labels in (*address2os)[].first.labels[] in the inverse order, so
1472  // we need to invert the vars saying which level means what.
1473  //
1474  if (threadLevel >= 0) {
1475  threadLevel = depth - threadLevel - 1;
1476  }
1477  if (coreLevel >= 0) {
1478  coreLevel = depth - coreLevel - 1;
1479  }
1480  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1481  pkgLevel = depth - pkgLevel - 1;
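 // For example, in the common case where cpuid leaf 11 reports the SMT level
 // at sub-leaf 0 and the core level at sub-leaf 1 (so depth == 3 once the
 // package level is added), the inversion gives threadLevel == 2,
 // coreLevel == 1 and pkgLevel == 0.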
1482 
1483  //
1484  // The algorithm used starts by setting the affinity to each available
1485  // thread and retrieving info from the cpuid instruction, so if we are
1486  // not capable of calling __kmp_get_system_affinity() and
1487  // __kmp_set_system_affinity(), then we need to do something else - use
1488  // the defaults that we calculated from issuing cpuid without binding
1489  // to each proc.
1490  //
1491  if (! KMP_AFFINITY_CAPABLE())
1492  {
1493  //
1494  // Hack to try and infer the machine topology using only the data
1495  // available from cpuid on the current thread, and __kmp_xproc.
1496  //
1497  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1498 
1499  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1500  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1501  if (__kmp_affinity_verbose) {
1502  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1503  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1504  if (__kmp_affinity_uniform_topology()) {
1505  KMP_INFORM(Uniform, "KMP_AFFINITY");
1506  } else {
1507  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1508  }
1509  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1510  __kmp_nThreadsPerCore, __kmp_ncores);
1511  }
1512  return 0;
1513  }
1514 
1515  //
1516  //
1517  // From here on, we can assume that it is safe to call
1518  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1519  // even if __kmp_affinity_type = affinity_none.
1520  //
1521 
1522  //
1523  // Save the affinity mask for the current thread.
1524  //
1525  kmp_affin_mask_t *oldMask;
1526  KMP_CPU_ALLOC(oldMask);
1527  __kmp_get_system_affinity(oldMask, TRUE);
1528 
1529  //
1530  // Allocate the data structure to be returned.
1531  //
1532  AddrUnsPair *retval = (AddrUnsPair *)
1533  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1534 
1535  //
1536  // Run through each of the available contexts, binding the current thread
1537  // to it, and obtaining the pertinent information using the cpuid instr.
1538  //
1539  unsigned int proc;
1540  int nApics = 0;
1541  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1542  //
1543  // Skip this proc if it is not included in the machine model.
1544  //
1545  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1546  continue;
1547  }
1548  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1549 
1550  __kmp_affinity_bind_thread(proc);
1551 
1552  //
1553  // Extract the labels for each level in the machine topology map
1554  // from the Apic ID.
1555  //
1556  Address addr(depth);
1557  int prev_shift = 0;
1558 
1559  for (level = 0; level < depth; level++) {
1560  __kmp_x86_cpuid(11, level, &buf);
1561  unsigned apicId = buf.edx;
1562  if (buf.ebx == 0) {
1563  if (level != depth - 1) {
1564  KMP_CPU_FREE(oldMask);
1565  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1566  return -1;
1567  }
1568  addr.labels[depth - level - 1] = apicId >> prev_shift;
1569  level++;
1570  break;
1571  }
1572  int shift = buf.eax & 0x1f;
1573  int mask = (1 << shift) - 1;
1574  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1575  prev_shift = shift;
1576  }
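 // Illustrative example (hypothetical shift values, depth == 3): if
 // cpuid(11, 0) reports shift == 1 (SMT level) and cpuid(11, 1) reports
 // shift == 5 (core level), then labels[2] = apicId & 0x1,
 // labels[1] = (apicId & 0x1f) >> 1, and the remaining package bits end up
 // in labels[0] = apicId >> 5.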
1577  if (level != depth) {
1578  KMP_CPU_FREE(oldMask);
1579  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1580  return -1;
1581  }
1582 
1583  retval[nApics] = AddrUnsPair(addr, proc);
1584  nApics++;
1585  }
1586 
1587  //
1588  // We've collected all the info we need.
1589  // Restore the old affinity mask for this thread.
1590  //
1591  __kmp_set_system_affinity(oldMask, TRUE);
1592 
1593  //
1594  // If there's only one thread context to bind to, return now.
1595  //
1596  KMP_ASSERT(nApics > 0);
1597  if (nApics == 1) {
1598  __kmp_ncores = nPackages = 1;
1599  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1600  if (__kmp_affinity_verbose) {
1601  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1602  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1603 
1604  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1605  if (__kmp_affinity_respect_mask) {
1606  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1607  } else {
1608  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1609  }
1610  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1611  KMP_INFORM(Uniform, "KMP_AFFINITY");
1612  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1613  __kmp_nThreadsPerCore, __kmp_ncores);
1614  }
1615 
1616  if (__kmp_affinity_type == affinity_none) {
1617  __kmp_free(retval);
1618  KMP_CPU_FREE(oldMask);
1619  return 0;
1620  }
1621 
1622  //
1623  // Form an Address object which only includes the package level.
1624  //
1625  Address addr(1);
1626  addr.labels[0] = retval[0].first.labels[pkgLevel];
1627  retval[0].first = addr;
1628 
1629  if (__kmp_affinity_gran_levels < 0) {
1630  __kmp_affinity_gran_levels = 0;
1631  }
1632 
1633  if (__kmp_affinity_verbose) {
1634  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1635  }
1636 
1637  *address2os = retval;
1638  KMP_CPU_FREE(oldMask);
1639  return 1;
1640  }
1641 
1642  //
1643  // Sort the table by physical Id.
1644  //
1645  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1646 
1647  //
1648  // Find the radix at each of the levels.
1649  //
1650  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1651  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1652  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1653  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1654  for (level = 0; level < depth; level++) {
1655  totals[level] = 1;
1656  maxCt[level] = 1;
1657  counts[level] = 1;
1658  last[level] = retval[0].first.labels[level];
1659  }
1660 
1661  //
1662  // From here on, the iteration variable "level" runs from the coarsest
1663  // level (package) to the finest (thread), i.e. we iterate forward through
1664  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1665  // backwards.
1666  //
1667  for (proc = 1; (int)proc < nApics; proc++) {
1668  int level;
1669  for (level = 0; level < depth; level++) {
1670  if (retval[proc].first.labels[level] != last[level]) {
1671  int j;
1672  for (j = level + 1; j < depth; j++) {
1673  totals[j]++;
1674  counts[j] = 1;
1675  // The commented-out line below would cause incorrect topology information
1676  // to be printed when the maximum value for some level (maxCt[level]) is
1677  // encountered earlier than a smaller value while walking through the array.
1678  // For example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1]
1679  // would end up as 2, whereas it must be 4.
1680  // TODO!!! Check if it can be commented safely
1681  //maxCt[j] = 1;
1682  last[j] = retval[proc].first.labels[j];
1683  }
1684  totals[level]++;
1685  counts[level]++;
1686  if (counts[level] > maxCt[level]) {
1687  maxCt[level] = counts[level];
1688  }
1689  last[level] = retval[proc].first.labels[level];
1690  break;
1691  }
1692  else if (level == depth - 1) {
1693  __kmp_free(last);
1694  __kmp_free(maxCt);
1695  __kmp_free(counts);
1696  __kmp_free(totals);
1697  __kmp_free(retval);
1698  KMP_CPU_FREE(oldMask);
1699  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1700  return -1;
1701  }
1702  }
1703  }
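 // Illustrative example (hypothetical machine with all three levels present):
 // for 2 packages x 2 cores x 2 threads, the loop above finishes with
 // totals = { 2, 4, 8 } (packages, cores, threads) and maxCt = { 2, 2, 2 },
 // so nPackages = 2, __kmp_ncores = 4, nCoresPerPkg = 2 and
 // __kmp_nThreadsPerCore = 2 in the code below.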
1704 
1705  //
1706  // When affinity is off, this routine will still be called to set
1707  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1708  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1709  // correctly, and return if affinity is not enabled.
1710  //
1711  if (threadLevel >= 0) {
1712  __kmp_nThreadsPerCore = maxCt[threadLevel];
1713  }
1714  else {
1715  __kmp_nThreadsPerCore = 1;
1716  }
1717  nPackages = totals[pkgLevel];
1718 
1719  if (coreLevel >= 0) {
1720  __kmp_ncores = totals[coreLevel];
1721  nCoresPerPkg = maxCt[coreLevel];
1722  }
1723  else {
1724  __kmp_ncores = nPackages;
1725  nCoresPerPkg = 1;
1726  }
1727 
1728  //
1729  // Check to see if the machine topology is uniform
1730  //
1731  unsigned prod = maxCt[0];
1732  for (level = 1; level < depth; level++) {
1733  prod *= maxCt[level];
1734  }
1735  bool uniform = (prod == totals[level - 1]);
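 // For the hypothetical 2 x 2 x 2 example above, prod = 2 * 2 * 2 = 8, which
 // equals totals[depth - 1], so the topology is uniform. If one package had
 // only a single core, the product of the maxima (8) would exceed the leaf
 // total (6) and the topology would be reported as non-uniform.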
1736 
1737  //
1738  // Print the machine topology summary.
1739  //
1740  if (__kmp_affinity_verbose) {
1741  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1742  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1743 
1744  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1745  if (__kmp_affinity_respect_mask) {
1746  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1747  } else {
1748  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1749  }
1750  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1751  if (uniform) {
1752  KMP_INFORM(Uniform, "KMP_AFFINITY");
1753  } else {
1754  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1755  }
1756 
1757  kmp_str_buf_t buf;
1758  __kmp_str_buf_init(&buf);
1759 
1760  __kmp_str_buf_print(&buf, "%d", totals[0]);
1761  for (level = 1; level <= pkgLevel; level++) {
1762  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1763  }
1764  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1765  __kmp_nThreadsPerCore, __kmp_ncores);
1766 
1767  __kmp_str_buf_free(&buf);
1768  }
1769  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1770  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1771  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1772  for (proc = 0; (int)proc < nApics; ++proc) {
1773  __kmp_pu_os_idx[proc] = retval[proc].second;
1774  }
1775  if (__kmp_affinity_type == affinity_none) {
1776  __kmp_free(last);
1777  __kmp_free(maxCt);
1778  __kmp_free(counts);
1779  __kmp_free(totals);
1780  __kmp_free(retval);
1781  KMP_CPU_FREE(oldMask);
1782  return 0;
1783  }
1784 
1785  //
1786  // Find any levels with radix 1, and remove them from the map
1787  // (except for the package level).
1788  //
1789  int new_depth = 0;
1790  for (level = 0; level < depth; level++) {
1791  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1792  continue;
1793  }
1794  new_depth++;
1795  }
1796 
1797  //
1798  // If we are removing any levels, allocate a new vector to return,
1799  // and copy the relevant information to it.
1800  //
1801  if (new_depth != depth) {
1802  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1803  sizeof(AddrUnsPair) * nApics);
1804  for (proc = 0; (int)proc < nApics; proc++) {
1805  Address addr(new_depth);
1806  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1807  }
1808  int new_level = 0;
1809  int newPkgLevel = -1;
1810  int newCoreLevel = -1;
1811  int newThreadLevel = -1;
1812  int i;
1813  for (level = 0; level < depth; level++) {
1814  if ((maxCt[level] == 1)
1815  && (level != pkgLevel)) {
1816  //
1817  // Remove this level. Never remove the package level
1818  //
1819  continue;
1820  }
1821  if (level == pkgLevel) {
1822  newPkgLevel = level;
1823  }
1824  if (level == coreLevel) {
1825  newCoreLevel = level;
1826  }
1827  if (level == threadLevel) {
1828  newThreadLevel = level;
1829  }
1830  for (proc = 0; (int)proc < nApics; proc++) {
1831  new_retval[proc].first.labels[new_level]
1832  = retval[proc].first.labels[level];
1833  }
1834  new_level++;
1835  }
1836 
1837  __kmp_free(retval);
1838  retval = new_retval;
1839  depth = new_depth;
1840  pkgLevel = newPkgLevel;
1841  coreLevel = newCoreLevel;
1842  threadLevel = newThreadLevel;
1843  }
1844 
1845  if (__kmp_affinity_gran_levels < 0) {
1846  //
1847  // Set the granularity level based on what levels are modeled
1848  // in the machine topology map.
1849  //
1850  __kmp_affinity_gran_levels = 0;
1851  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1852  __kmp_affinity_gran_levels++;
1853  }
1854  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1855  __kmp_affinity_gran_levels++;
1856  }
1857  if (__kmp_affinity_gran > affinity_gran_package) {
1858  __kmp_affinity_gran_levels++;
1859  }
1860  }
1861 
1862  if (__kmp_affinity_verbose) {
1863  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1864  coreLevel, threadLevel);
1865  }
1866 
1867  __kmp_free(last);
1868  __kmp_free(maxCt);
1869  __kmp_free(counts);
1870  __kmp_free(totals);
1871  KMP_CPU_FREE(oldMask);
1872  *address2os = retval;
1873  return depth;
1874 }
1875 
1876 
1877 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1878 
1879 
1880 #define osIdIndex 0
1881 #define threadIdIndex 1
1882 #define coreIdIndex 2
1883 #define pkgIdIndex 3
1884 #define nodeIdIndex 4
1885 
1886 typedef unsigned *ProcCpuInfo;
1887 static unsigned maxIndex = pkgIdIndex;
1888 
1889 
1890 static int
1891 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1892 {
1893  const unsigned *aa = (const unsigned *)a;
1894  const unsigned *bb = (const unsigned *)b;
1895  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1896  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1897  return 0;
1898 };
1899 
1900 
1901 static int
1902 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1903 {
1904  unsigned i;
1905  const unsigned *aa = *((const unsigned **)a);
1906  const unsigned *bb = *((const unsigned **)b);
1907  for (i = maxIndex; ; i--) {
1908  if (aa[i] < bb[i]) return -1;
1909  if (aa[i] > bb[i]) return 1;
1910  if (i == osIdIndex) break;
1911  }
1912  return 0;
1913 }
1914 
1915 
1916 //
1917 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1918 // affinity map.
1919 //
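// For illustration, a /proc/cpuinfo record typically looks like the sketch
// below (only the fields used here are shown; a blank line terminates each
// record):
//
//   processor    : 0
//   physical id  : 0
//   core id      : 0
//
// The parser below relies only on the "processor", "physical id", "core id",
// "thread id" and "node_<n> id" fields; all other lines are ignored.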
1920 static int
1921 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1922  kmp_i18n_id_t *const msg_id, FILE *f)
1923 {
1924  *address2os = NULL;
1925  *msg_id = kmp_i18n_null;
1926 
1927  //
1928  // Scan the file, and count the number of "processor" (osId) fields,
1929  // and find the highest value of <n> for a node_<n> field.
1930  //
1931  char buf[256];
1932  unsigned num_records = 0;
1933  while (! feof(f)) {
1934  buf[sizeof(buf) - 1] = 1;
1935  if (! fgets(buf, sizeof(buf), f)) {
1936  //
1937  // Read errors presumably because of EOF
1938  //
1939  break;
1940  }
1941 
1942  char s1[] = "processor";
1943  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1944  num_records++;
1945  continue;
1946  }
1947 
1948  //
1949  // FIXME - this will match "node_<n> <garbage>"
1950  //
1951  unsigned level;
1952  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1953  if (nodeIdIndex + level >= maxIndex) {
1954  maxIndex = nodeIdIndex + level;
1955  }
1956  continue;
1957  }
1958  }
1959 
1960  //
1961  // Check for empty file / no valid processor records, or too many.
1962  // The number of records can't exceed the number of valid bits in the
1963  // affinity mask.
1964  //
1965  if (num_records == 0) {
1966  *line = 0;
1967  *msg_id = kmp_i18n_str_NoProcRecords;
1968  return -1;
1969  }
1970  if (num_records > (unsigned)__kmp_xproc) {
1971  *line = 0;
1972  *msg_id = kmp_i18n_str_TooManyProcRecords;
1973  return -1;
1974  }
1975 
1976  //
1977  // Set the file pointer back to the beginning, so that we can scan the
1978  // file again, this time performing a full parse of the data.
1979  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1980  // Adding an extra element at the end allows us to remove a lot of extra
1981  // checks for termination conditions.
1982  //
1983  if (fseek(f, 0, SEEK_SET) != 0) {
1984  *line = 0;
1985  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1986  return -1;
1987  }
1988 
1989  //
1990  // Allocate the array of records to store the proc info in. The dummy
1991  // element at the end makes the logic in filling them out easier to code.
1992  //
1993  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1994  * sizeof(unsigned *));
1995  unsigned i;
1996  for (i = 0; i <= num_records; i++) {
1997  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1998  * sizeof(unsigned));
1999  }
2000 
2001 #define CLEANUP_THREAD_INFO \
2002  for (i = 0; i <= num_records; i++) { \
2003  __kmp_free(threadInfo[i]); \
2004  } \
2005  __kmp_free(threadInfo);
2006 
2007  //
2008  // A value of UINT_MAX means that we didn't find the field
2009  //
2010  unsigned __index;
2011 
2012 #define INIT_PROC_INFO(p) \
2013  for (__index = 0; __index <= maxIndex; __index++) { \
2014  (p)[__index] = UINT_MAX; \
2015  }
2016 
2017  for (i = 0; i <= num_records; i++) {
2018  INIT_PROC_INFO(threadInfo[i]);
2019  }
2020 
2021  unsigned num_avail = 0;
2022  *line = 0;
2023  while (! feof(f)) {
2024  //
2025  // Create an inner scoping level, so that all the goto targets at the
2026  // end of the loop appear in an outer scoping level. This avoids
2027  // warnings about jumping past an initialization to a target in the
2028  // same block.
2029  //
2030  {
2031  buf[sizeof(buf) - 1] = 1;
2032  bool long_line = false;
2033  if (! fgets(buf, sizeof(buf), f)) {
2034  //
2035  // Read errors presumably because of EOF
2036  //
2037  // If there is valid data in threadInfo[num_avail], then fake
2038  // a blank line to ensure that the last address gets parsed.
2039  //
2040  bool valid = false;
2041  for (i = 0; i <= maxIndex; i++) {
2042  if (threadInfo[num_avail][i] != UINT_MAX) {
2043  valid = true;
2044  }
2045  }
2046  if (! valid) {
2047  break;
2048  }
2049  buf[0] = 0;
2050  } else if (!buf[sizeof(buf) - 1]) {
2051  //
2052  // The line is longer than the buffer. Set a flag and don't
2053  // emit an error if we were going to ignore the line, anyway.
2054  //
2055  long_line = true;
2056 
2057 #define CHECK_LINE \
2058  if (long_line) { \
2059  CLEANUP_THREAD_INFO; \
2060  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2061  return -1; \
2062  }
2063  }
2064  (*line)++;
2065 
2066  char s1[] = "processor";
2067  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2068  CHECK_LINE;
2069  char *p = strchr(buf + sizeof(s1) - 1, ':');
2070  unsigned val;
2071  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2072  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2073  threadInfo[num_avail][osIdIndex] = val;
2074 #if KMP_OS_LINUX && USE_SYSFS_INFO
2075  char path[256];
2076  KMP_SNPRINTF(path, sizeof(path),
2077  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2078  threadInfo[num_avail][osIdIndex]);
2079  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2080 
2081  KMP_SNPRINTF(path, sizeof(path),
2082  "/sys/devices/system/cpu/cpu%u/topology/core_id",
2083  threadInfo[num_avail][osIdIndex]);
2084  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2085  continue;
2086 #else
2087  }
2088  char s2[] = "physical id";
2089  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2090  CHECK_LINE;
2091  char *p = strchr(buf + sizeof(s2) - 1, ':');
2092  unsigned val;
2093  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2094  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2095  threadInfo[num_avail][pkgIdIndex] = val;
2096  continue;
2097  }
2098  char s3[] = "core id";
2099  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2100  CHECK_LINE;
2101  char *p = strchr(buf + sizeof(s3) - 1, ':');
2102  unsigned val;
2103  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2104  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2105  threadInfo[num_avail][coreIdIndex] = val;
2106  continue;
2107 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2108  }
2109  char s4[] = "thread id";
2110  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2111  CHECK_LINE;
2112  char *p = strchr(buf + sizeof(s4) - 1, ':');
2113  unsigned val;
2114  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2115  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2116  threadInfo[num_avail][threadIdIndex] = val;
2117  continue;
2118  }
2119  unsigned level;
2120  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2121  CHECK_LINE;
2122  char *p = strchr(buf + sizeof(s4) - 1, ':');
2123  unsigned val;
2124  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2125  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2126  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2127  threadInfo[num_avail][nodeIdIndex + level] = val;
2128  continue;
2129  }
2130 
2131  //
2132  // We didn't recognize the leading token on the line.
2133  // There are lots of leading tokens that we don't recognize -
2134  // if the line isn't empty, go on to the next line.
2135  //
2136  if ((*buf != 0) && (*buf != '\n')) {
2137  //
2138  // If the line is longer than the buffer, read characters
2139  // until we find a newline.
2140  //
2141  if (long_line) {
2142  int ch;
2143  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2144  }
2145  continue;
2146  }
2147 
2148  //
2149  // A newline has signalled the end of the processor record.
2150  // Check that there aren't too many procs specified.
2151  //
2152  if ((int)num_avail == __kmp_xproc) {
2153  CLEANUP_THREAD_INFO;
2154  *msg_id = kmp_i18n_str_TooManyEntries;
2155  return -1;
2156  }
2157 
2158  //
2159  // Check for missing fields. The osId field must be there, and we
2160  // currently require that the physical id field is specified, also.
2161  //
2162  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2163  CLEANUP_THREAD_INFO;
2164  *msg_id = kmp_i18n_str_MissingProcField;
2165  return -1;
2166  }
2167  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2168  CLEANUP_THREAD_INFO;
2169  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2170  return -1;
2171  }
2172 
2173  //
2174  // Skip this proc if it is not included in the machine model.
2175  //
2176  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) {
2177  INIT_PROC_INFO(threadInfo[num_avail]);
2178  continue;
2179  }
2180 
2181  //
2182  // We have a successful parse of this proc's info.
2183  // Increment the counter, and prepare for the next proc.
2184  //
2185  num_avail++;
2186  KMP_ASSERT(num_avail <= num_records);
2187  INIT_PROC_INFO(threadInfo[num_avail]);
2188  }
2189  continue;
2190 
2191  no_val:
2192  CLEANUP_THREAD_INFO;
2193  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2194  return -1;
2195 
2196  dup_field:
2197  CLEANUP_THREAD_INFO;
2198  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2199  return -1;
2200  }
2201  *line = 0;
2202 
2203 # if KMP_MIC && REDUCE_TEAM_SIZE
2204  unsigned teamSize = 0;
2205 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2206 
2207  // check for num_records == __kmp_xproc ???
2208 
2209  //
2210  // If there's only one thread context to bind to, form an Address object
2211  // with depth 1 and return immediately (or, if affinity is off, set
2212  // address2os to NULL and return).
2213  //
2214  // If it is configured to omit the package level when there is only a
2215  // single package, the logic at the end of this routine won't work if
2216  // there is only a single thread - it would try to form an Address
2217  // object with depth 0.
2218  //
2219  KMP_ASSERT(num_avail > 0);
2220  KMP_ASSERT(num_avail <= num_records);
2221  if (num_avail == 1) {
2222  __kmp_ncores = 1;
2223  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2224  if (__kmp_affinity_verbose) {
2225  if (! KMP_AFFINITY_CAPABLE()) {
2226  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2227  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2228  KMP_INFORM(Uniform, "KMP_AFFINITY");
2229  }
2230  else {
2231  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2232  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2233  __kmp_affin_fullMask);
2234  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2235  if (__kmp_affinity_respect_mask) {
2236  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2237  } else {
2238  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2239  }
2240  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2241  KMP_INFORM(Uniform, "KMP_AFFINITY");
2242  }
2243  int index;
2244  kmp_str_buf_t buf;
2245  __kmp_str_buf_init(&buf);
2246  __kmp_str_buf_print(&buf, "1");
2247  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2248  __kmp_str_buf_print(&buf, " x 1");
2249  }
2250  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2251  __kmp_str_buf_free(&buf);
2252  }
2253 
2254  if (__kmp_affinity_type == affinity_none) {
2255  CLEANUP_THREAD_INFO;
2256  return 0;
2257  }
2258 
2259  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2260  Address addr(1);
2261  addr.labels[0] = threadInfo[0][pkgIdIndex];
2262  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2263 
2264  if (__kmp_affinity_gran_levels < 0) {
2265  __kmp_affinity_gran_levels = 0;
2266  }
2267 
2268  if (__kmp_affinity_verbose) {
2269  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2270  }
2271 
2272  CLEANUP_THREAD_INFO;
2273  return 1;
2274  }
2275 
2276  //
2277  // Sort the threadInfo table by physical Id.
2278  //
2279  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2280  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2281 
2282  //
2283  // The table is now sorted by pkgId / coreId / threadId, but we really
2284  // don't know the radix of any of the fields. pkgId's may be sparsely
2285  // assigned among the chips on a system. Although coreId's are usually
2286  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2287  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2288  //
2289  // For that matter, we don't know what coresPerPkg and threadsPerCore
2290  // (or the total # packages) are at this point - we want to determine
2291  // that now. We only have an upper bound on the first two figures.
2292  //
2293  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2294  * sizeof(unsigned));
2295  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2296  * sizeof(unsigned));
2297  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2298  * sizeof(unsigned));
2299  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2300  * sizeof(unsigned));
2301 
2302  bool assign_thread_ids = false;
2303  unsigned threadIdCt;
2304  unsigned index;
2305 
2306  restart_radix_check:
2307  threadIdCt = 0;
2308 
2309  //
2310  // Initialize the counter arrays with data from threadInfo[0].
2311  //
2312  if (assign_thread_ids) {
2313  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2314  threadInfo[0][threadIdIndex] = threadIdCt++;
2315  }
2316  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2317  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2318  }
2319  }
2320  for (index = 0; index <= maxIndex; index++) {
2321  counts[index] = 1;
2322  maxCt[index] = 1;
2323  totals[index] = 1;
2324  lastId[index] = threadInfo[0][index];
2325  }
2326 
2327  //
2328  // Run through the rest of the OS procs.
2329  //
2330  for (i = 1; i < num_avail; i++) {
2331  //
2332  // Find the most significant index whose id differs
2333  // from the id for the previous OS proc.
2334  //
2335  for (index = maxIndex; index >= threadIdIndex; index--) {
2336  if (assign_thread_ids && (index == threadIdIndex)) {
2337  //
2338  // Auto-assign the thread id field if it wasn't specified.
2339  //
2340  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2341  threadInfo[i][threadIdIndex] = threadIdCt++;
2342  }
2343 
2344  //
2345  // Apparently the thread id field was specified for some
2346  // entries and not others. Start the thread id counter
2347  // off at the next higher thread id.
2348  //
2349  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2350  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2351  }
2352  }
2353  if (threadInfo[i][index] != lastId[index]) {
2354  //
2355  // Run through all indices which are less significant,
2356  // and reset the counts to 1.
2357  //
2358  // At all levels up to and including index, we need to
2359  // increment the totals and record the last id.
2360  //
2361  unsigned index2;
2362  for (index2 = threadIdIndex; index2 < index; index2++) {
2363  totals[index2]++;
2364  if (counts[index2] > maxCt[index2]) {
2365  maxCt[index2] = counts[index2];
2366  }
2367  counts[index2] = 1;
2368  lastId[index2] = threadInfo[i][index2];
2369  }
2370  counts[index]++;
2371  totals[index]++;
2372  lastId[index] = threadInfo[i][index];
2373 
2374  if (assign_thread_ids && (index > threadIdIndex)) {
2375 
2376 # if KMP_MIC && REDUCE_TEAM_SIZE
2377  //
2378  // The default team size is the total #threads in the machine
2379  // minus 1 thread for every core that has 3 or more threads.
2380  //
2381  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2382 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2383 
2384  //
2385  // Restart the thread counter, as we are on a new core.
2386  //
2387  threadIdCt = 0;
2388 
2389  //
2390  // Auto-assign the thread id field if it wasn't specified.
2391  //
2392  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2393  threadInfo[i][threadIdIndex] = threadIdCt++;
2394  }
2395 
2396  //
2397  // Apparently the thread id field was specified for some
2398  // entries and not others. Start the thread id counter
2399  // off at the next higher thread id.
2400  //
2401  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2402  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2403  }
2404  }
2405  break;
2406  }
2407  }
2408  if (index < threadIdIndex) {
2409  //
2410  // If thread ids were specified, it is an error if they are not
2411  // unique. Also, check that we haven't already restarted the
2412  // loop (to be safe - shouldn't need to).
2413  //
2414  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2415  || assign_thread_ids) {
2416  __kmp_free(lastId);
2417  __kmp_free(totals);
2418  __kmp_free(maxCt);
2419  __kmp_free(counts);
2420  CLEANUP_THREAD_INFO;
2421  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2422  return -1;
2423  }
2424 
2425  //
2426  // If the thread ids were not specified and we see entries that
2427  // are duplicates, start the loop over and
2428  // assign the thread ids manually.
2429  //
2430  assign_thread_ids = true;
2431  goto restart_radix_check;
2432  }
2433  }
2434 
2435 # if KMP_MIC && REDUCE_TEAM_SIZE
2436  //
2437  // The default team size is the total #threads in the machine
2438  // minus 1 thread for every core that has 3 or more threads.
2439  //
2440  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2441 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2442 
2443  for (index = threadIdIndex; index <= maxIndex; index++) {
2444  if (counts[index] > maxCt[index]) {
2445  maxCt[index] = counts[index];
2446  }
2447  }
2448 
2449  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2450  nCoresPerPkg = maxCt[coreIdIndex];
2451  nPackages = totals[pkgIdIndex];
2452 
2453  //
2454  // Check to see if the machine topology is uniform
2455  //
2456  unsigned prod = totals[maxIndex];
2457  for (index = threadIdIndex; index < maxIndex; index++) {
2458  prod *= maxCt[index];
2459  }
2460  bool uniform = (prod == totals[threadIdIndex]);
2461 
2462  //
2463  // When affinity is off, this routine will still be called to set
2464  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2465  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2466  // correctly, and return now if affinity is not enabled.
2467  //
2468  __kmp_ncores = totals[coreIdIndex];
2469 
2470  if (__kmp_affinity_verbose) {
2471  if (! KMP_AFFINITY_CAPABLE()) {
2472  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2473  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2474  if (uniform) {
2475  KMP_INFORM(Uniform, "KMP_AFFINITY");
2476  } else {
2477  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2478  }
2479  }
2480  else {
2481  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2482  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
2483  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2484  if (__kmp_affinity_respect_mask) {
2485  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2486  } else {
2487  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2488  }
2489  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2490  if (uniform) {
2491  KMP_INFORM(Uniform, "KMP_AFFINITY");
2492  } else {
2493  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2494  }
2495  }
2496  kmp_str_buf_t buf;
2497  __kmp_str_buf_init(&buf);
2498 
2499  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2500  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2501  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2502  }
2503  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2504  maxCt[threadIdIndex], __kmp_ncores);
2505 
2506  __kmp_str_buf_free(&buf);
2507  }
2508 
2509 # if KMP_MIC && REDUCE_TEAM_SIZE
2510  //
2511  // Set the default team size.
2512  //
2513  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2514  __kmp_dflt_team_nth = teamSize;
2515  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2516  __kmp_dflt_team_nth));
2517  }
2518 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2519 
2520  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2521  KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
2522  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2523  for (i = 0; i < num_avail; ++i) { // fill the os indices
2524  __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2525  }
2526 
2527  if (__kmp_affinity_type == affinity_none) {
2528  __kmp_free(lastId);
2529  __kmp_free(totals);
2530  __kmp_free(maxCt);
2531  __kmp_free(counts);
2532  CLEANUP_THREAD_INFO;
2533  return 0;
2534  }
2535 
2536  //
2537  // Count the number of levels which have more nodes at that level than
2538  // at the parent's level (with an implicit root node above the
2539  // top level). This is equivalent to saying that there is at least
2540  // one node at this level which has a sibling. These levels are in the
2541  // map, and the package level is always in the map.
2542  //
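 // Illustrative example (hypothetical machine): 2 packages x 4 cores x 1 thread
 // gives totals[threadIdIndex] == totals[coreIdIndex] == 8 and
 // totals[pkgIdIndex] == 2, so the thread level is dropped (no core has a
 // sibling thread) while the core and package levels stay in the map,
 // giving depth == 2.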
2543  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2544  int level = 0;
2545  for (index = threadIdIndex; index < maxIndex; index++) {
2546  KMP_ASSERT(totals[index] >= totals[index + 1]);
2547  inMap[index] = (totals[index] > totals[index + 1]);
2548  }
2549  inMap[maxIndex] = (totals[maxIndex] > 1);
2550  inMap[pkgIdIndex] = true;
2551 
2552  int depth = 0;
2553  for (index = threadIdIndex; index <= maxIndex; index++) {
2554  if (inMap[index]) {
2555  depth++;
2556  }
2557  }
2558  KMP_ASSERT(depth > 0);
2559 
2560  //
2561  // Construct the data structure that is to be returned.
2562  //
2563  *address2os = (AddrUnsPair*)
2564  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2565  int pkgLevel = -1;
2566  int coreLevel = -1;
2567  int threadLevel = -1;
2568 
2569  for (i = 0; i < num_avail; ++i) {
2570  Address addr(depth);
2571  unsigned os = threadInfo[i][osIdIndex];
2572  int src_index;
2573  int dst_index = 0;
2574 
2575  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2576  if (! inMap[src_index]) {
2577  continue;
2578  }
2579  addr.labels[dst_index] = threadInfo[i][src_index];
2580  if (src_index == pkgIdIndex) {
2581  pkgLevel = dst_index;
2582  }
2583  else if (src_index == coreIdIndex) {
2584  coreLevel = dst_index;
2585  }
2586  else if (src_index == threadIdIndex) {
2587  threadLevel = dst_index;
2588  }
2589  dst_index++;
2590  }
2591  (*address2os)[i] = AddrUnsPair(addr, os);
2592  }
2593 
2594  if (__kmp_affinity_gran_levels < 0) {
2595  //
2596  // Set the granularity level based on what levels are modeled
2597  // in the machine topology map.
2598  //
2599  unsigned src_index;
2600  __kmp_affinity_gran_levels = 0;
2601  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2602  if (! inMap[src_index]) {
2603  continue;
2604  }
2605  switch (src_index) {
2606  case threadIdIndex:
2607  if (__kmp_affinity_gran > affinity_gran_thread) {
2608  __kmp_affinity_gran_levels++;
2609  }
2610 
2611  break;
2612  case coreIdIndex:
2613  if (__kmp_affinity_gran > affinity_gran_core) {
2614  __kmp_affinity_gran_levels++;
2615  }
2616  break;
2617 
2618  case pkgIdIndex:
2619  if (__kmp_affinity_gran > affinity_gran_package) {
2620  __kmp_affinity_gran_levels++;
2621  }
2622  break;
2623  }
2624  }
2625  }
2626 
2627  if (__kmp_affinity_verbose) {
2628  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2629  coreLevel, threadLevel);
2630  }
2631 
2632  __kmp_free(inMap);
2633  __kmp_free(lastId);
2634  __kmp_free(totals);
2635  __kmp_free(maxCt);
2636  __kmp_free(counts);
2637  CLEANUP_THREAD_INFO;
2638  return depth;
2639 }
2640 
2641 
2642 //
2643 // Create and return a table of affinity masks, indexed by OS thread ID.
2644 // This routine handles OR'ing together all the affinity masks of threads
2645 // that are sufficiently close, if granularity > fine.
2646 //
2647 static kmp_affin_mask_t *
2648 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2649  AddrUnsPair *address2os, unsigned numAddrs)
2650 {
2651  //
2652  // First form a table of affinity masks in order of OS thread id.
2653  //
2654  unsigned depth;
2655  unsigned maxOsId;
2656  unsigned i;
2657 
2658  KMP_ASSERT(numAddrs > 0);
2659  depth = address2os[0].first.depth;
2660 
2661  maxOsId = 0;
2662  for (i = 0; i < numAddrs; i++) {
2663  unsigned osId = address2os[i].second;
2664  if (osId > maxOsId) {
2665  maxOsId = osId;
2666  }
2667  }
2668  kmp_affin_mask_t *osId2Mask;
2669  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
2670 
2671  //
2672  // Sort the address2os table according to physical order. Doing so
2673  // will put all threads on the same core/package/node in consecutive
2674  // locations.
2675  //
2676  qsort(address2os, numAddrs, sizeof(*address2os),
2677  __kmp_affinity_cmp_Address_labels);
2678 
2679  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2680  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2681  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2682  }
2683  if (__kmp_affinity_gran_levels >= (int)depth) {
2684  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2685  && (__kmp_affinity_type != affinity_none))) {
2686  KMP_WARNING(AffThreadsMayMigrate);
2687  }
2688  }
2689 
2690  //
2691  // Run through the table, forming the masks for all threads on each
2692  // core. Threads on the same core will have identical "Address"
2693  // objects, not considering the last level, which must be the thread
2694  // id. All threads on a core will appear consecutively.
2695  //
2696  unsigned unique = 0;
2697  unsigned j = 0; // index of 1st thread on core
2698  unsigned leader = 0;
2699  Address *leaderAddr = &(address2os[0].first);
2700  kmp_affin_mask_t *sum;
2701  KMP_CPU_ALLOC_ON_STACK(sum);
2702  KMP_CPU_ZERO(sum);
2703  KMP_CPU_SET(address2os[0].second, sum);
2704  for (i = 1; i < numAddrs; i++) {
2705  //
2706  // If this thread is sufficiently close to the leader (within the
2707  // granularity setting), then set the bit for this os thread in the
2708  // affinity mask for this group, and go on to the next thread.
2709  //
2710  if (leaderAddr->isClose(address2os[i].first,
2711  __kmp_affinity_gran_levels)) {
2712  KMP_CPU_SET(address2os[i].second, sum);
2713  continue;
2714  }
2715 
2716  //
2717  // For every thread in this group, copy the mask to the thread's
2718  // entry in the osId2Mask table. Mark the first address as a
2719  // leader.
2720  //
2721  for (; j < i; j++) {
2722  unsigned osId = address2os[j].second;
2723  KMP_DEBUG_ASSERT(osId <= maxOsId);
2724  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2725  KMP_CPU_COPY(mask, sum);
2726  address2os[j].first.leader = (j == leader);
2727  }
2728  unique++;
2729 
2730  //
2731  // Start a new mask.
2732  //
2733  leader = i;
2734  leaderAddr = &(address2os[i].first);
2735  KMP_CPU_ZERO(sum);
2736  KMP_CPU_SET(address2os[i].second, sum);
2737  }
2738 
2739  //
2740  // For every thread in last group, copy the mask to the thread's
2741  // entry in the osId2Mask table.
2742  //
2743  for (; j < i; j++) {
2744  unsigned osId = address2os[j].second;
2745  KMP_DEBUG_ASSERT(osId <= maxOsId);
2746  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2747  KMP_CPU_COPY(mask, sum);
2748  address2os[j].first.leader = (j == leader);
2749  }
2750  unique++;
2751  KMP_CPU_FREE_FROM_STACK(sum);
2752 
2753  *maxIndex = maxOsId;
2754  *numUnique = unique;
2755  return osId2Mask;
2756 }
2757 
2758 
2759 //
2760 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2761 // as file-static than to try to pass them through the calling sequence of
2762 // the recursive-descent OMP_PLACES parser.
2763 //
2764 static kmp_affin_mask_t *newMasks;
2765 static int numNewMasks;
2766 static int nextNewMask;
2767 
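// ADD_MASK appends a copy of _mask to the newMasks vector, doubling the
// vector's capacity when it is full. ADD_MASK_OSID does the same for the
// mask belonging to a single OS proc id, but warns and skips ids that are
// out of range or not part of the machine model.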
2768 #define ADD_MASK(_mask) \
2769  { \
2770  if (nextNewMask >= numNewMasks) { \
2771  int i; \
2772  numNewMasks *= 2; \
2773  kmp_affin_mask_t* temp; \
2774  KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
2775  for(i=0;i<numNewMasks/2;i++) { \
2776  kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \
2777  kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \
2778  KMP_CPU_COPY(dest, src); \
2779  } \
2780  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \
2781  newMasks = temp; \
2782  } \
2783  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2784  nextNewMask++; \
2785  }
2786 
2787 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2788  { \
2789  if (((_osId) > _maxOsId) || \
2790  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2791  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2792  && (__kmp_affinity_type != affinity_none))) { \
2793  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2794  } \
2795  } \
2796  else { \
2797  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2798  } \
2799  }
2800 
2801 
2802 //
2803 // Re-parse the proclist (for the explicit affinity type), and form the list
2804 // of affinity newMasks indexed by gtid.
2805 //
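// For illustration, a proclist such as "3,0-2,{7,8,9},10-20:2" yields one
// mask for proc 3, one mask apiece for procs 0, 1 and 2, a single combined
// mask containing procs 7, 8 and 9 (braces OR the ids together), and one
// mask apiece for procs 10, 12, ..., 20 (a range with stride 2).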
2806 static void
2807 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2808  unsigned int *out_numMasks, const char *proclist,
2809  kmp_affin_mask_t *osId2Mask, int maxOsId)
2810 {
2811  int i;
2812  const char *scan = proclist;
2813  const char *next = proclist;
2814 
2815  //
2816  // We use malloc() for the temporary mask vector,
2817  // so that we can use realloc() to extend it.
2818  //
2819  numNewMasks = 2;
2820  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2821  nextNewMask = 0;
2822  kmp_affin_mask_t *sumMask;
2823  KMP_CPU_ALLOC(sumMask);
2824  int setSize = 0;
2825 
2826  for (;;) {
2827  int start, end, stride;
2828 
2829  SKIP_WS(scan);
2830  next = scan;
2831  if (*next == '\0') {
2832  break;
2833  }
2834 
2835  if (*next == '{') {
2836  int num;
2837  setSize = 0;
2838  next++; // skip '{'
2839  SKIP_WS(next);
2840  scan = next;
2841 
2842  //
2843  // Read the first integer in the set.
2844  //
2845  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2846  "bad proclist");
2847  SKIP_DIGITS(next);
2848  num = __kmp_str_to_int(scan, *next);
2849  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2850 
2851  //
2852  // Copy the mask for that osId to the sum (union) mask.
2853  //
2854  if ((num > maxOsId) ||
2855  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2856  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2857  && (__kmp_affinity_type != affinity_none))) {
2858  KMP_WARNING(AffIgnoreInvalidProcID, num);
2859  }
2860  KMP_CPU_ZERO(sumMask);
2861  }
2862  else {
2863  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2864  setSize = 1;
2865  }
2866 
2867  for (;;) {
2868  //
2869  // Check for end of set.
2870  //
2871  SKIP_WS(next);
2872  if (*next == '}') {
2873  next++; // skip '}'
2874  break;
2875  }
2876 
2877  //
2878  // Skip optional comma.
2879  //
2880  if (*next == ',') {
2881  next++;
2882  }
2883  SKIP_WS(next);
2884 
2885  //
2886  // Read the next integer in the set.
2887  //
2888  scan = next;
2889  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2890  "bad explicit proc list");
2891 
2892  SKIP_DIGITS(next);
2893  num = __kmp_str_to_int(scan, *next);
2894  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2895 
2896  //
2897  // Add the mask for that osId to the sum mask.
2898  //
2899  if ((num > maxOsId) ||
2900  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2901  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2902  && (__kmp_affinity_type != affinity_none))) {
2903  KMP_WARNING(AffIgnoreInvalidProcID, num);
2904  }
2905  }
2906  else {
2907  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2908  setSize++;
2909  }
2910  }
2911  if (setSize > 0) {
2912  ADD_MASK(sumMask);
2913  }
2914 
2915  SKIP_WS(next);
2916  if (*next == ',') {
2917  next++;
2918  }
2919  scan = next;
2920  continue;
2921  }
2922 
2923  //
2924  // Read the first integer.
2925  //
2926  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2927  SKIP_DIGITS(next);
2928  start = __kmp_str_to_int(scan, *next);
2929  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2930  SKIP_WS(next);
2931 
2932  //
2933  // If this isn't a range, then add a mask to the list and go on.
2934  //
2935  if (*next != '-') {
2936  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2937 
2938  //
2939  // Skip optional comma.
2940  //
2941  if (*next == ',') {
2942  next++;
2943  }
2944  scan = next;
2945  continue;
2946  }
2947 
2948  //
2949  // This is a range. Skip over the '-' and read in the 2nd int.
2950  //
2951  next++; // skip '-'
2952  SKIP_WS(next);
2953  scan = next;
2954  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2955  SKIP_DIGITS(next);
2956  end = __kmp_str_to_int(scan, *next);
2957  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2958 
2959  //
2960  // Check for a stride parameter
2961  //
2962  stride = 1;
2963  SKIP_WS(next);
2964  if (*next == ':') {
2965  //
2966  // A stride is specified. Skip over the ':' and read the 3rd int.
2967  //
2968  int sign = +1;
2969  next++; // skip ':'
2970  SKIP_WS(next);
2971  scan = next;
2972  if (*next == '-') {
2973  sign = -1;
2974  next++;
2975  SKIP_WS(next);
2976  scan = next;
2977  }
2978  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2979  "bad explicit proc list");
2980  SKIP_DIGITS(next);
2981  stride = __kmp_str_to_int(scan, *next);
2982  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2983  stride *= sign;
2984  }
2985 
2986  //
2987  // Do some range checks.
2988  //
2989  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2990  if (stride > 0) {
2991  KMP_ASSERT2(start <= end, "bad explicit proc list");
2992  }
2993  else {
2994  KMP_ASSERT2(start >= end, "bad explicit proc list");
2995  }
2996  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2997 
2998  //
2999  // Add the mask for each OS proc # to the list.
3000  //
3001  if (stride > 0) {
3002  do {
3003  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3004  start += stride;
3005  } while (start <= end);
3006  }
3007  else {
3008  do {
3009  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3010  start += stride;
3011  } while (start >= end);
3012  }
3013 
3014  //
3015  // Skip optional comma.
3016  //
3017  SKIP_WS(next);
3018  if (*next == ',') {
3019  next++;
3020  }
3021  scan = next;
3022  }
3023 
3024  *out_numMasks = nextNewMask;
3025  if (nextNewMask == 0) {
3026  *out_masks = NULL;
3027  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3028  return;
3029  }
3030  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3031  for(i = 0; i < nextNewMask; i++) {
3032  kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
3033  kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3034  KMP_CPU_COPY(dest, src);
3035  }
3036  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3037  KMP_CPU_FREE(sumMask);
3038 }
3039 
3040 
3041 # if OMP_40_ENABLED
3042 
3043 /*-----------------------------------------------------------------------------
3044 
3045 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3046 places. Again, here is the grammar:
3047 
3048 place_list := place
3049 place_list := place , place_list
3050 place := num
3051 place := place : num
3052 place := place : num : signed
3053 place := { subplace_list }
3054 place := ! place // (lowest priority)
3055 subplace_list := subplace
3056 subplace_list := subplace , subplace_list
3057 subplace := num
3058 subplace := num : num
3059 subplace := num : num : signed
3060 signed := num
3061 signed := + signed
3062 signed := - signed
3063 
3064 -----------------------------------------------------------------------------*/
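// For illustration, "{0,1,2,3},{4,5,6,7}" describes two places of four OS
// procs each; "{0:4},{4:4}" is the same list written with the
// <lower-bound>:<length> subplace form; and "{0:4}:2:4" expands the initial
// place into 2 places separated by a stride of 4 procs, i.e. {0-3} and {4-7}.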
3065 
3066 static void
3067 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3068  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3069 {
3070  const char *next;
3071 
3072  for (;;) {
3073  int start, count, stride, i;
3074 
3075  //
3076  // Read in the starting proc id
3077  //
3078  SKIP_WS(*scan);
3079  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3080  "bad explicit places list");
3081  next = *scan;
3082  SKIP_DIGITS(next);
3083  start = __kmp_str_to_int(*scan, *next);
3084  KMP_ASSERT(start >= 0);
3085  *scan = next;
3086 
3087  //
3088  // valid follow sets are ',' ':' and '}'
3089  //
3090  SKIP_WS(*scan);
3091  if (**scan == '}' || **scan == ',') {
3092  if ((start > maxOsId) ||
3093  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3094  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3095  && (__kmp_affinity_type != affinity_none))) {
3096  KMP_WARNING(AffIgnoreInvalidProcID, start);
3097  }
3098  }
3099  else {
3100  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3101  (*setSize)++;
3102  }
3103  if (**scan == '}') {
3104  break;
3105  }
3106  (*scan)++; // skip ','
3107  continue;
3108  }
3109  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3110  (*scan)++; // skip ':'
3111 
3112  //
3113  // Read count parameter
3114  //
3115  SKIP_WS(*scan);
3116  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3117  "bad explicit places list");
3118  next = *scan;
3119  SKIP_DIGITS(next);
3120  count = __kmp_str_to_int(*scan, *next);
3121  KMP_ASSERT(count >= 0);
3122  *scan = next;
3123 
3124  //
3125  // valid follow sets are ',' ':' and '}'
3126  //
3127  SKIP_WS(*scan);
3128  if (**scan == '}' || **scan == ',') {
3129  for (i = 0; i < count; i++) {
3130  if ((start > maxOsId) ||
3131  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3132  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3133  && (__kmp_affinity_type != affinity_none))) {
3134  KMP_WARNING(AffIgnoreInvalidProcID, start);
3135  }
3136  break; // don't proliferate warnings for large count
3137  }
3138  else {
3139  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3140  start++;
3141  (*setSize)++;
3142  }
3143  }
3144  if (**scan == '}') {
3145  break;
3146  }
3147  (*scan)++; // skip ','
3148  continue;
3149  }
3150  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3151  (*scan)++; // skip ':'
3152 
3153  //
3154  // Read stride parameter
3155  //
3156  int sign = +1;
3157  for (;;) {
3158  SKIP_WS(*scan);
3159  if (**scan == '+') {
3160  (*scan)++; // skip '+'
3161  continue;
3162  }
3163  if (**scan == '-') {
3164  sign *= -1;
3165  (*scan)++; // skip '-'
3166  continue;
3167  }
3168  break;
3169  }
3170  SKIP_WS(*scan);
3171  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3172  "bad explicit places list");
3173  next = *scan;
3174  SKIP_DIGITS(next);
3175  stride = __kmp_str_to_int(*scan, *next);
3176  KMP_ASSERT(stride >= 0);
3177  *scan = next;
3178  stride *= sign;
3179 
3180  //
3181  // valid follow sets are ',' and '}'
3182  //
3183  SKIP_WS(*scan);
3184  if (**scan == '}' || **scan == ',') {
3185  for (i = 0; i < count; i++) {
3186  if ((start > maxOsId) ||
3187  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3188  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3189  && (__kmp_affinity_type != affinity_none))) {
3190  KMP_WARNING(AffIgnoreInvalidProcID, start);
3191  }
3192  break; // don't proliferate warnings for large count
3193  }
3194  else {
3195  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3196  start += stride;
3197  (*setSize)++;
3198  }
3199  }
3200  if (**scan == '}') {
3201  break;
3202  }
3203  (*scan)++; // skip ','
3204  continue;
3205  }
3206 
3207  KMP_ASSERT2(0, "bad explicit places list");
3208  }
3209 }
3210 
3211 
3212 static void
3213 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3214  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3215 {
3216  const char *next;
3217 
3218  //
3219  // valid follow sets are '{' '!' and num
3220  //
3221  SKIP_WS(*scan);
3222  if (**scan == '{') {
3223  (*scan)++; // skip '{'
3224  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3225  setSize);
3226  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3227  (*scan)++; // skip '}'
3228  }
3229  else if (**scan == '!') {
3230  (*scan)++; // skip '!'
3231  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3232  KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3233  }
3234  else if ((**scan >= '0') && (**scan <= '9')) {
3235  next = *scan;
3236  SKIP_DIGITS(next);
3237  int num = __kmp_str_to_int(*scan, *next);
3238  KMP_ASSERT(num >= 0);
3239  if ((num > maxOsId) ||
3240  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3241  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3242  && (__kmp_affinity_type != affinity_none))) {
3243  KMP_WARNING(AffIgnoreInvalidProcID, num);
3244  }
3245  }
3246  else {
3247  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3248  (*setSize)++;
3249  }
3250  *scan = next; // skip num
3251  }
3252  else {
3253  KMP_ASSERT2(0, "bad explicit places list");
3254  }
3255 }
3256 
3257 
3258 //static void
3259 void
3260 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3261  unsigned int *out_numMasks, const char *placelist,
3262  kmp_affin_mask_t *osId2Mask, int maxOsId)
3263 {
3264  int i,j,count,stride,sign;
3265  const char *scan = placelist;
3266  const char *next = placelist;
3267 
3268  numNewMasks = 2;
3269  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3270  nextNewMask = 0;
3271 
3272  // tempMask is modified based on the previous or initial
3273  // place to form the current place
3274  // previousMask contains the previous place
3275  kmp_affin_mask_t *tempMask;
3276  kmp_affin_mask_t *previousMask;
3277  KMP_CPU_ALLOC(tempMask);
3278  KMP_CPU_ZERO(tempMask);
3279  KMP_CPU_ALLOC(previousMask);
3280  KMP_CPU_ZERO(previousMask);
3281  int setSize = 0;
3282 
3283  for (;;) {
3284  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3285 
3286  //
3287  // valid follow sets are ',' ':' and EOL
3288  //
3289  SKIP_WS(scan);
3290  if (*scan == '\0' || *scan == ',') {
3291  if (setSize > 0) {
3292  ADD_MASK(tempMask);
3293  }
3294  KMP_CPU_ZERO(tempMask);
3295  setSize = 0;
3296  if (*scan == '\0') {
3297  break;
3298  }
3299  scan++; // skip ','
3300  continue;
3301  }
3302 
3303  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3304  scan++; // skip ':'
3305 
3306  //
3307  // Read count parameter
3308  //
3309  SKIP_WS(scan);
3310  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3311  "bad explicit places list");
3312  next = scan;
3313  SKIP_DIGITS(next);
3314  count = __kmp_str_to_int(scan, *next);
3315  KMP_ASSERT(count >= 0);
3316  scan = next;
3317 
3318  //
3319  // valid follow sets are ',' ':' and EOL
3320  //
3321  SKIP_WS(scan);
3322  if (*scan == '\0' || *scan == ',') {
3323  stride = +1;
3324  }
3325  else {
3326  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3327  scan++; // skip ':'
3328 
3329  //
3330  // Read stride parameter
3331  //
3332  sign = +1;
3333  for (;;) {
3334  SKIP_WS(scan);
3335  if (*scan == '+') {
3336  scan++; // skip '+'
3337  continue;
3338  }
3339  if (*scan == '-') {
3340  sign *= -1;
3341  scan++; // skip '-'
3342  continue;
3343  }
3344  break;
3345  }
3346  SKIP_WS(scan);
3347  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3348  "bad explicit places list");
3349  next = scan;
3350  SKIP_DIGITS(next);
3351  stride = __kmp_str_to_int(scan, *next);
3352  KMP_DEBUG_ASSERT(stride >= 0);
3353  scan = next;
3354  stride *= sign;
3355  }
3356 
3357  // Add places determined by initial_place : count : stride
3358  for (i = 0; i < count; i++) {
3359  if (setSize == 0) {
3360  break;
3361  }
3362  // Add the current place, then build the next place (tempMask) from that
3363  KMP_CPU_COPY(previousMask, tempMask);
3364  ADD_MASK(previousMask);
3365  KMP_CPU_ZERO(tempMask);
3366  setSize = 0;
3367  KMP_CPU_SET_ITERATE(j, previousMask) {
3368  if (! KMP_CPU_ISSET(j, previousMask)) {
3369  continue;
3370  }
3371  if ((j+stride > maxOsId) || (j+stride < 0) ||
3372  (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3373  (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
3374  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3375  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3376  KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
3377  }
3378  continue;
3379  }
3380  KMP_CPU_SET(j+stride, tempMask);
3381  setSize++;
3382  }
3383  }
3384  KMP_CPU_ZERO(tempMask);
3385  setSize = 0;
3386 
3387  //
3388  // valid follow sets are ',' and EOL
3389  //
3390  SKIP_WS(scan);
3391  if (*scan == '\0') {
3392  break;
3393  }
3394  if (*scan == ',') {
3395  scan++; // skip ','
3396  continue;
3397  }
3398 
3399  KMP_ASSERT2(0, "bad explicit places list");
3400  }
3401 
3402  *out_numMasks = nextNewMask;
3403  if (nextNewMask == 0) {
3404  *out_masks = NULL;
3405  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3406  return;
3407  }
3408  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3409  KMP_CPU_FREE(tempMask);
3410  KMP_CPU_FREE(previousMask);
3411  for(i = 0; i < nextNewMask; i++) {
3412  kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
3413  kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3414  KMP_CPU_COPY(dest, src);
3415  }
3416  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3417 }
3418 
3419 # endif /* OMP_40_ENABLED */
3420 
3421 #undef ADD_MASK
3422 #undef ADD_MASK_OSID
3423 
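//
// Prune the topology map according to the requested hardware subset
// (__kmp_place_num_sockets, __kmp_place_num_cores,
// __kmp_place_num_threads_per_core and the corresponding offsets, as set by
// the KMP_PLACE_THREADS / KMP_HW_SUBSET controls). OS procs outside the
// requested subset are cleared from the full mask and dropped from the
// address2os table, and the global topology counters are adjusted to the
// trimmed machine. Only uniform, three-level topologies are handled here.
//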
3424 static void
3425 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3426 {
3427  int i, j, k, n_old = 0, n_new = 0, proc_num = 0;
3428  if (__kmp_place_num_sockets == 0 &&
3429  __kmp_place_num_cores == 0 &&
3430  __kmp_place_num_threads_per_core == 0 )
3431  goto _exit; // no topology limiting actions requested, exit
3432  if (__kmp_place_num_sockets == 0)
3433  __kmp_place_num_sockets = nPackages; // use all available sockets
3434  if (__kmp_place_num_cores == 0)
3435  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3436  if (__kmp_place_num_threads_per_core == 0 ||
3437  __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3438  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3439 
3440  if ( !__kmp_affinity_uniform_topology() ) {
3441  KMP_WARNING( AffHWSubsetNonUniform );
3442  goto _exit; // don't support non-uniform topology
3443  }
3444  if ( depth > 3 ) {
3445  KMP_WARNING( AffHWSubsetNonThreeLevel );
3446  goto _exit; // don't support not-3-level topology
3447  }
3448  if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3449  KMP_WARNING(AffHWSubsetManySockets);
3450  goto _exit;
3451  }
3452  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3453  KMP_WARNING( AffHWSubsetManyCores );
3454  goto _exit;
3455  }
3456 
3457  AddrUnsPair *newAddr;
3458  if (pAddr) // pAddr is NULL in case of affinity_none
3459  newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3460  __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3461 
3462  for (i = 0; i < nPackages; ++i) {
3463  if (i < __kmp_place_socket_offset ||
3464  i >= __kmp_place_socket_offset + __kmp_place_num_sockets) {
3465  n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3466  if (__kmp_pu_os_idx != NULL) {
3467  for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket
3468  for (k = 0; k < __kmp_nThreadsPerCore; ++k) {
3469  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3470  ++proc_num;
3471  }
3472  }
3473  }
3474  } else {
3475  for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket
3476  if (j < __kmp_place_core_offset ||
3477  j >= __kmp_place_core_offset + __kmp_place_num_cores) {
3478  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3479  if (__kmp_pu_os_idx != NULL) {
3480  for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core
3481  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3482  ++proc_num;
3483  }
3484  }
3485  } else {
3486  for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3487  if (k < __kmp_place_num_threads_per_core) {
3488  if (pAddr)
3489  newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3490  n_new++;
3491  } else {
3492  if (__kmp_pu_os_idx != NULL)
3493  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3494  }
3495  n_old++;
3496  ++proc_num;
3497  }
3498  }
3499  }
3500  }
3501  }
3502  KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3503  KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3504  __kmp_place_num_threads_per_core);
3505 
3506  nPackages = __kmp_place_num_sockets; // correct nPackages
3507  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3508  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3509  __kmp_avail_proc = n_new; // correct avail_proc
3510  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3511 
3512  if (pAddr) {
3513  __kmp_free( *pAddr );
3514  *pAddr = newAddr; // replace old topology with new one
3515  }
3516 _exit:
3517  if (__kmp_pu_os_idx != NULL) {
3518  __kmp_free(__kmp_pu_os_idx);
3519  __kmp_pu_os_idx = NULL;
3520  }
3521 }
3522 
3523 
3524 static AddrUnsPair *address2os = NULL;
3525 static int * procarr = NULL;
3526 static int __kmp_aff_depth = 0;
3527 
3528 #define KMP_EXIT_AFF_NONE \
3529  KMP_ASSERT(__kmp_affinity_type == affinity_none); \
3530  KMP_ASSERT(address2os == NULL); \
3531  __kmp_apply_thread_places(NULL, 0); \
3532  return;
3533 
3534 static void
3535 __kmp_aux_affinity_initialize(void)
3536 {
3537  if (__kmp_affinity_masks != NULL) {
3538  KMP_ASSERT(__kmp_affin_fullMask != NULL);
3539  return;
3540  }
3541 
3542  //
3543  // Create the "full" mask - this defines all of the processors that we
3544  // consider to be in the machine model. If respect is set, then it is
3545  // the initialization thread's affinity mask. Otherwise, it is all
3546  // processors that we know about on the machine.
3547  //
3548  if (__kmp_affin_fullMask == NULL) {
3549  KMP_CPU_ALLOC(__kmp_affin_fullMask);
3550  }
3551  if (KMP_AFFINITY_CAPABLE()) {
3552  if (__kmp_affinity_respect_mask) {
3553  __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3554 
3555  //
3556  // Count the number of available processors.
3557  //
3558  unsigned i;
3559  __kmp_avail_proc = 0;
3560  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3561  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3562  continue;
3563  }
3564  __kmp_avail_proc++;
3565  }
3566  if (__kmp_avail_proc > __kmp_xproc) {
3567  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3568  && (__kmp_affinity_type != affinity_none))) {
3569  KMP_WARNING(ErrorInitializeAffinity);
3570  }
3571  __kmp_affinity_type = affinity_none;
3572  KMP_AFFINITY_DISABLE();
3573  return;
3574  }
3575  }
3576  else {
3577  __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3578  __kmp_avail_proc = __kmp_xproc;
3579  }
3580  }
3581 
3582  int depth = -1;
3583  kmp_i18n_id_t msg_id = kmp_i18n_null;
3584 
3585  //
3586  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3587  // KMP_TOPOLOGY_METHOD=cpuinfo
3588  //
3589  if ((__kmp_cpuinfo_file != NULL) &&
3590  (__kmp_affinity_top_method == affinity_top_method_all)) {
3591  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3592  }
3593 
3594  if (__kmp_affinity_top_method == affinity_top_method_all) {
3595  //
3596  // In the default code path, errors are not fatal - we just try using
3597  // another method. We only emit a warning message if affinity is on,
3598  // or the verbose flag is set, and the nowarnings flag was not set.
3599  //
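 // Discovery methods are tried in order until one succeeds: hwloc (if
 // enabled), x2APIC leaf 11, legacy APIC leaf 4 (x86 only), /proc/cpuinfo
 // (Linux only), Windows processor groups (if more than one group is
 // present), and finally a flat OS-proc enumeration.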
3600  const char *file_name = NULL;
3601  int line = 0;
3602 # if KMP_USE_HWLOC
3603  if (depth < 0) {
3604  if (__kmp_affinity_verbose) {
3605  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3606  }
3607  if(!__kmp_hwloc_error) {
3608  depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3609  if (depth == 0) {
3610  KMP_EXIT_AFF_NONE;
3611  } else if(depth < 0 && __kmp_affinity_verbose) {
3612  KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3613  }
3614  } else if(__kmp_affinity_verbose) {
3615  KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3616  }
3617  }
3618 # endif
3619 
3620 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3621 
3622  if (depth < 0) {
3623  if (__kmp_affinity_verbose) {
3624  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3625  }
3626 
3627  file_name = NULL;
3628  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3629  if (depth == 0) {
3630  KMP_EXIT_AFF_NONE;
3631  }
3632 
3633  if (depth < 0) {
3634  if (__kmp_affinity_verbose) {
3635  if (msg_id != kmp_i18n_null) {
3636  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3637  KMP_I18N_STR(DecodingLegacyAPIC));
3638  }
3639  else {
3640  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3641  }
3642  }
3643 
3644  file_name = NULL;
3645  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3646  if (depth == 0) {
3647  KMP_EXIT_AFF_NONE;
3648  }
3649  }
3650  }
3651 
3652 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3653 
3654 # if KMP_OS_LINUX
3655 
3656  if (depth < 0) {
3657  if (__kmp_affinity_verbose) {
3658  if (msg_id != kmp_i18n_null) {
3659  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3660  }
3661  else {
3662  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3663  }
3664  }
3665 
3666  FILE *f = fopen("/proc/cpuinfo", "r");
3667  if (f == NULL) {
3668  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3669  }
3670  else {
3671  file_name = "/proc/cpuinfo";
3672  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3673  fclose(f);
3674  if (depth == 0) {
3675  KMP_EXIT_AFF_NONE;
3676  }
3677  }
3678  }
3679 
3680 # endif /* KMP_OS_LINUX */
3681 
3682 # if KMP_GROUP_AFFINITY
3683 
3684  if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3685  if (__kmp_affinity_verbose) {
3686  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3687  }
3688 
3689  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3690  KMP_ASSERT(depth != 0);
3691  }
3692 
3693 # endif /* KMP_GROUP_AFFINITY */
3694 
3695  if (depth < 0) {
3696  if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3697  if (file_name == NULL) {
3698  KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3699  }
3700  else if (line == 0) {
3701  KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3702  }
3703  else {
3704  KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3705  }
3706  }
3707  // FIXME - print msg if msg_id = kmp_i18n_null ???
3708 
3709  file_name = "";
3710  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3711  if (depth == 0) {
3712  KMP_EXIT_AFF_NONE;
3713  }
3714  KMP_ASSERT(depth > 0);
3715  KMP_ASSERT(address2os != NULL);
3716  }
3717  }
3718 
3719  //
3720  // If the user has specified that a particular topology discovery method
3721  // is to be used, then we abort if that method fails. The exception is
3722  // group affinity, which might have been implicitly set.
3723  //
3724 
3725 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3726 
3727  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3728  if (__kmp_affinity_verbose) {
3729  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3730  KMP_I18N_STR(Decodingx2APIC));
3731  }
3732 
3733  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3734  if (depth == 0) {
3735  KMP_EXIT_AFF_NONE;
3736  }
3737  if (depth < 0) {
3738  KMP_ASSERT(msg_id != kmp_i18n_null);
3739  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3740  }
3741  }
3742  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3743  if (__kmp_affinity_verbose) {
3744  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3745  KMP_I18N_STR(DecodingLegacyAPIC));
3746  }
3747 
3748  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3749  if (depth == 0) {
3750  KMP_EXIT_AFF_NONE;
3751  }
3752  if (depth < 0) {
3753  KMP_ASSERT(msg_id != kmp_i18n_null);
3754  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3755  }
3756  }
3757 
3758 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3759 
3760  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3761  const char *filename;
3762  if (__kmp_cpuinfo_file != NULL) {
3763  filename = __kmp_cpuinfo_file;
3764  }
3765  else {
3766  filename = "/proc/cpuinfo";
3767  }
3768 
3769  if (__kmp_affinity_verbose) {
3770  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3771  }
3772 
3773  FILE *f = fopen(filename, "r");
3774  if (f == NULL) {
3775  int code = errno;
3776  if (__kmp_cpuinfo_file != NULL) {
3777  __kmp_msg(
3778  kmp_ms_fatal,
3779  KMP_MSG(CantOpenFileForReading, filename),
3780  KMP_ERR(code),
3781  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3782  __kmp_msg_null
3783  );
3784  }
3785  else {
3786  __kmp_msg(
3787  kmp_ms_fatal,
3788  KMP_MSG(CantOpenFileForReading, filename),
3789  KMP_ERR(code),
3790  __kmp_msg_null
3791  );
3792  }
3793  }
3794  int line = 0;
3795  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3796  fclose(f);
3797  if (depth < 0) {
3798  KMP_ASSERT(msg_id != kmp_i18n_null);
3799  if (line > 0) {
3800  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3801  }
3802  else {
3803  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3804  }
3805  }
3806  if (__kmp_affinity_type == affinity_none) {
3807  KMP_ASSERT(depth == 0);
3808  KMP_EXIT_AFF_NONE;
3809  }
3810  }
3811 
3812 # if KMP_GROUP_AFFINITY
3813 
3814  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3815  if (__kmp_affinity_verbose) {
3816  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3817  }
3818 
3819  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3820  KMP_ASSERT(depth != 0);
3821  if (depth < 0) {
3822  KMP_ASSERT(msg_id != kmp_i18n_null);
3823  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3824  }
3825  }
3826 
3827 # endif /* KMP_GROUP_AFFINITY */
3828 
3829  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3830  if (__kmp_affinity_verbose) {
3831  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3832  }
3833 
3834  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3835  if (depth == 0) {
3836  KMP_EXIT_AFF_NONE;
3837  }
3838  // should not fail
3839  KMP_ASSERT(depth > 0);
3840  KMP_ASSERT(address2os != NULL);
3841  }
3842 
3843 # if KMP_USE_HWLOC
3844  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3845  if (__kmp_affinity_verbose) {
3846  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3847  }
3848  depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3849  if (depth == 0) {
3850  KMP_EXIT_AFF_NONE;
3851  }
3852  }
3853 # endif // KMP_USE_HWLOC
3854 
3855  if (address2os == NULL) {
3856  if (KMP_AFFINITY_CAPABLE()
3857  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3858  && (__kmp_affinity_type != affinity_none)))) {
3859  KMP_WARNING(ErrorInitializeAffinity);
3860  }
3861  __kmp_affinity_type = affinity_none;
3862  KMP_AFFINITY_DISABLE();
3863  return;
3864  }
3865 
3866  __kmp_apply_thread_places(&address2os, depth);
3867 
3868  //
3869  // Create the table of masks, indexed by thread Id.
3870  //
3871  unsigned maxIndex;
3872  unsigned numUnique;
3873  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3874  address2os, __kmp_avail_proc);
3875  if (__kmp_affinity_gran_levels == 0) {
3876  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3877  }
3878 
3879  //
3880  // Set the childNums vector in all Address objects. This must be done
3881  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3882  // which takes into account the setting of __kmp_affinity_compact.
3883  //
3884  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3885 
3886  switch (__kmp_affinity_type) {
3887 
3888  case affinity_explicit:
3889  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3890 # if OMP_40_ENABLED
3891  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3892 # endif
3893  {
3894  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3895  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3896  maxIndex);
3897  }
3898 # if OMP_40_ENABLED
3899  else {
3900  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3901  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3902  maxIndex);
3903  }
3904 # endif
3905  if (__kmp_affinity_num_masks == 0) {
3906  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3907  && (__kmp_affinity_type != affinity_none))) {
3908  KMP_WARNING(AffNoValidProcID);
3909  }
3910  __kmp_affinity_type = affinity_none;
3911  return;
3912  }
3913  break;
3914 
3915  //
3916  // The other affinity types rely on sorting the Addresses according
3917  // to some permutation of the machine topology tree. Set
3918  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3919  // then jump to a common code fragment to do the sort and create
3920  // the array of affinity masks.
3921  //
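 // For the logical, physical, scatter and compact cases below, only the
 // compaction level and offset are chosen here: "logical" and "physical"
 // compact at the thread or core level, "scatter" inverts the level
 // (depth - 1 - compact) so consecutive threads are spread across the
 // topology, and "compact" clamps the level to depth - 1 so consecutive
 // threads stay close together.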
3922 
3923  case affinity_logical:
3924  __kmp_affinity_compact = 0;
3925  if (__kmp_affinity_offset) {
3926  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3927  % __kmp_avail_proc;
3928  }
3929  goto sortAddresses;
3930 
3931  case affinity_physical:
3932  if (__kmp_nThreadsPerCore > 1) {
3933  __kmp_affinity_compact = 1;
3934  if (__kmp_affinity_compact >= depth) {
3935  __kmp_affinity_compact = 0;
3936  }
3937  } else {
3938  __kmp_affinity_compact = 0;
3939  }
3940  if (__kmp_affinity_offset) {
3941  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3942  % __kmp_avail_proc;
3943  }
3944  goto sortAddresses;
3945 
3946  case affinity_scatter:
3947  if (__kmp_affinity_compact >= depth) {
3948  __kmp_affinity_compact = 0;
3949  }
3950  else {
3951  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3952  }
3953  goto sortAddresses;
3954 
3955  case affinity_compact:
3956  if (__kmp_affinity_compact >= depth) {
3957  __kmp_affinity_compact = depth - 1;
3958  }
3959  goto sortAddresses;
3960 
3961  case affinity_balanced:
3962  // Balanced works only for the case of a single package
3963  if( nPackages > 1 ) {
3964  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3965  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3966  }
3967  __kmp_affinity_type = affinity_none;
3968  return;
3969  } else if( __kmp_affinity_uniform_topology() ) {
3970  break;
3971  } else { // Non-uniform topology
3972 
3973  // Save the depth for further usage
3974  __kmp_aff_depth = depth;
3975 
3976  // Number of hyper-threads per core on an HT machine
3977  int nth_per_core = __kmp_nThreadsPerCore;
3978 
3979  int core_level;
3980  if( nth_per_core > 1 ) {
3981  core_level = depth - 2;
3982  } else {
3983  core_level = depth - 1;
3984  }
3985  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3986  int nproc = nth_per_core * ncores;
3987 
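 // procarr maps (core, thread-context) slots to OS proc ids:
 // procarr[core * nth_per_core + thread] is the OS proc id, or -1 if that
 // slot has no available processor on this non-uniform topology.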
3988  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3989  for( int i = 0; i < nproc; i++ ) {
3990  procarr[ i ] = -1;
3991  }
3992 
3993  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3994  int proc = address2os[ i ].second;
3995  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3996  // If there is only one thread per core then depth == 2: level 0 - package,
3997  // level 1 - core.
3998  int level = depth - 1;
3999 
4000  // __kmp_nth_per_core == 1
4001  int thread = 0;
4002  int core = address2os[ i ].first.labels[ level ];
4003  // If the thread level exists, that is we have more than one thread context per core
4004  if( nth_per_core > 1 ) {
4005  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
4006  core = address2os[ i ].first.labels[ level - 1 ];
4007  }
4008  procarr[ core * nth_per_core + thread ] = proc;
4009  }
4010 
4011  break;
4012  }
4013 
4014  sortAddresses:
4015  //
4016  // Allocate the gtid->affinity mask table.
4017  //
4018  if (__kmp_affinity_dups) {
4019  __kmp_affinity_num_masks = __kmp_avail_proc;
4020  }
4021  else {
4022  __kmp_affinity_num_masks = numUnique;
4023  }
4024 
4025 # if OMP_40_ENABLED
4026  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
4027  && ( __kmp_affinity_num_places > 0 )
4028  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
4029  __kmp_affinity_num_masks = __kmp_affinity_num_places;
4030  }
4031 # endif
4032 
4033  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4034 
4035  //
4036  // Sort the address2os table according to the current setting of
4037  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4038  //
4039  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4040  __kmp_affinity_cmp_Address_child_num);
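 // Walk the sorted table and copy one OS-proc mask per place; when
 // duplicates are not allowed, only the leader of each group of
 // equivalent addresses contributes a mask.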
4041  {
4042  int i;
4043  unsigned j;
4044  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4045  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
4046  continue;
4047  }
4048  unsigned osId = address2os[i].second;
4049  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4050  kmp_affin_mask_t *dest
4051  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4052  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4053  KMP_CPU_COPY(dest, src);
4054  if (++j >= __kmp_affinity_num_masks) {
4055  break;
4056  }
4057  }
4058  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4059  }
4060  break;
4061 
4062  default:
4063  KMP_ASSERT2(0, "Unexpected affinity setting");
4064  }
4065 
4066  __kmp_free(osId2Mask);
4067  machine_hierarchy.init(address2os, __kmp_avail_proc);
4068 }
4069 #undef KMP_EXIT_AFF_NONE
4070 
4071 
4072 void
4073 __kmp_affinity_initialize(void)
4074 {
4075  //
4076  // Much of the code above was written assuming that if a machine was not
4077  // affinity capable, then __kmp_affinity_type == affinity_none. We now
4078  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4079  //
4080  // There are too many checks for __kmp_affinity_type == affinity_none
4081  // in this code. Instead of trying to change them all, check if
4082  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4083  // affinity_none, call the real initialization routine, then restore
4084  // __kmp_affinity_type to affinity_disabled.
4085  //
4086  int disabled = (__kmp_affinity_type == affinity_disabled);
4087  if (! KMP_AFFINITY_CAPABLE()) {
4088  KMP_ASSERT(disabled);
4089  }
4090  if (disabled) {
4091  __kmp_affinity_type = affinity_none;
4092  }
4093  __kmp_aux_affinity_initialize();
4094  if (disabled) {
4095  __kmp_affinity_type = affinity_disabled;
4096  }
4097 }
4098 
4099 
4100 void
4101 __kmp_affinity_uninitialize(void)
4102 {
4103  if (__kmp_affinity_masks != NULL) {
4104  KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4105  __kmp_affinity_masks = NULL;
4106  }
4107  if (__kmp_affin_fullMask != NULL) {
4108  KMP_CPU_FREE(__kmp_affin_fullMask);
4109  __kmp_affin_fullMask = NULL;
4110  }
4111  __kmp_affinity_num_masks = 0;
4112 # if OMP_40_ENABLED
4113  __kmp_affinity_num_places = 0;
4114 # endif
4115  if (__kmp_affinity_proclist != NULL) {
4116  __kmp_free(__kmp_affinity_proclist);
4117  __kmp_affinity_proclist = NULL;
4118  }
4119  if( address2os != NULL ) {
4120  __kmp_free( address2os );
4121  address2os = NULL;
4122  }
4123  if( procarr != NULL ) {
4124  __kmp_free( procarr );
4125  procarr = NULL;
4126  }
4127 # if KMP_USE_HWLOC
4128  if (__kmp_hwloc_topology != NULL) {
4129  hwloc_topology_destroy(__kmp_hwloc_topology);
4130  __kmp_hwloc_topology = NULL;
4131  }
4132 # endif
4133 }
4134 
4135 
4136 void
4137 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4138 {
4139  if (! KMP_AFFINITY_CAPABLE()) {
4140  return;
4141  }
4142 
4143  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4144  if (th->th.th_affin_mask == NULL) {
4145  KMP_CPU_ALLOC(th->th.th_affin_mask);
4146  }
4147  else {
4148  KMP_CPU_ZERO(th->th.th_affin_mask);
4149  }
4150 
4151  //
4152  // Copy the thread mask to the kmp_info_t structure.
4153  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4154  // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
4155  // is set, then the full mask is the same as the mask of the initialization
4156  // thread.
4157  //
4158  kmp_affin_mask_t *mask;
4159  int i;
4160 
4161 # if OMP_40_ENABLED
4162  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4163 # endif
4164  {
4165  if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
4166  ) {
4167 # if KMP_GROUP_AFFINITY
4168  if (__kmp_num_proc_groups > 1) {
4169  return;
4170  }
4171 # endif
4172  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4173  i = KMP_PLACE_ALL;
4174  mask = __kmp_affin_fullMask;
4175  }
4176  else {
4177  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4178  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4179  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4180  }
4181  }
4182 # if OMP_40_ENABLED
4183  else {
4184  if ((! isa_root)
4185  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4186 # if KMP_GROUP_AFFINITY
4187  if (__kmp_num_proc_groups > 1) {
4188  return;
4189  }
4190 # endif
4191  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4192  i = KMP_PLACE_ALL;
4193  mask = __kmp_affin_fullMask;
4194  }
4195  else {
4196  //
4197  // int i = some hash function or just a counter that doesn't
4198  // always start at 0. Use gtid for now.
4199  //
4200  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4201  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4202  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4203  }
4204  }
4205 # endif
4206 
4207 # if OMP_40_ENABLED
4208  th->th.th_current_place = i;
4209  if (isa_root) {
4210  th->th.th_new_place = i;
4211  th->th.th_first_place = 0;
4212  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4213  }
4214 
4215  if (i == KMP_PLACE_ALL) {
4216  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4217  gtid));
4218  }
4219  else {
4220  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4221  gtid, i));
4222  }
4223 # else
4224  if (i == -1) {
4225  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4226  gtid));
4227  }
4228  else {
4229  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4230  gtid, i));
4231  }
4232 # endif /* OMP_40_ENABLED */
4233 
4234  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4235 
4236  if (__kmp_affinity_verbose) {
4237  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4238  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4239  th->th.th_affin_mask);
4240  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4241  buf);
4242  }
4243 
4244 # if KMP_OS_WINDOWS
4245  //
4246  // On Windows* OS, the process affinity mask might have changed.
4247  // If the user didn't request affinity and this call fails,
4248  // just continue silently. See CQ171393.
4249  //
4250  if ( __kmp_affinity_type == affinity_none ) {
4251  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4252  }
4253  else
4254 # endif
4255  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4256 }
4257 
4258 
4259 # if OMP_40_ENABLED
4260 
4261 void
4262 __kmp_affinity_set_place(int gtid)
4263 {
4264  int retval;
4265 
4266  if (! KMP_AFFINITY_CAPABLE()) {
4267  return;
4268  }
4269 
4270  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4271 
4272  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4273  gtid, th->th.th_new_place, th->th.th_current_place));
4274 
4275  //
4276  // Check that the new place is within this thread's partition.
4277  //
4278  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4279  KMP_ASSERT(th->th.th_new_place >= 0);
4280  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4281  if (th->th.th_first_place <= th->th.th_last_place) {
4282  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4283  && (th->th.th_new_place <= th->th.th_last_place));
4284  }
4285  else {
4286  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4287  || (th->th.th_new_place >= th->th.th_last_place));
4288  }
4289 
4290  //
4291  // Copy the thread mask to the kmp_info_t structure,
4292  // and set this thread's affinity.
4293  //
4294  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4295  th->th.th_new_place);
4296  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4297  th->th.th_current_place = th->th.th_new_place;
4298 
4299  if (__kmp_affinity_verbose) {
4300  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4301  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4302  th->th.th_affin_mask);
4303  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4304  gtid, buf);
4305  }
4306  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4307 }
4308 
4309 # endif /* OMP_40_ENABLED */
4310 
4311 
4312 int
4313 __kmp_aux_set_affinity(void **mask)
4314 {
4315  int gtid;
4316  kmp_info_t *th;
4317  int retval;
4318 
4319  if (! KMP_AFFINITY_CAPABLE()) {
4320  return -1;
4321  }
4322 
4323  gtid = __kmp_entry_gtid();
4324  KA_TRACE(1000, ;{
4325  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4326  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4327  (kmp_affin_mask_t *)(*mask));
4328  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4329  gtid, buf);
4330  });
4331 
4332  if (__kmp_env_consistency_check) {
4333  if ((mask == NULL) || (*mask == NULL)) {
4334  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4335  }
4336  else {
4337  unsigned proc;
4338  int num_procs = 0;
4339 
4340  KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
4341  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4342  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4343  }
4344  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4345  continue;
4346  }
4347  num_procs++;
4348  }
4349  if (num_procs == 0) {
4350  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4351  }
4352 
4353 # if KMP_GROUP_AFFINITY
4354  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4355  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4356  }
4357 # endif /* KMP_GROUP_AFFINITY */
4358 
4359  }
4360  }
4361 
4362  th = __kmp_threads[gtid];
4363  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4364  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4365  if (retval == 0) {
4366  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4367  }
4368 
4369 # if OMP_40_ENABLED
4370  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4371  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4372  th->th.th_first_place = 0;
4373  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4374 
4375  //
4376  // Turn off 4.0 affinity for the current thread at this parallel level.
4377  //
4378  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4379 # endif
4380 
4381  return retval;
4382 }
4383 
4384 
4385 int
4386 __kmp_aux_get_affinity(void **mask)
4387 {
4388  int gtid;
4389  int retval;
4390  kmp_info_t *th;
4391 
4392  if (! KMP_AFFINITY_CAPABLE()) {
4393  return -1;
4394  }
4395 
4396  gtid = __kmp_entry_gtid();
4397  th = __kmp_threads[gtid];
4398  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4399 
4400  KA_TRACE(1000, ;{
4401  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4402  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4403  th->th.th_affin_mask);
4404  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4405  });
4406 
4407  if (__kmp_env_consistency_check) {
4408  if ((mask == NULL) || (*mask == NULL)) {
4409  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4410  }
4411  }
4412 
4413 # if !KMP_OS_WINDOWS
4414 
4415  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4416  KA_TRACE(1000, ;{
4417  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4418  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4419  (kmp_affin_mask_t *)(*mask));
4420  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4421  });
4422  return retval;
4423 
4424 # else
4425 
4426  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4427  return 0;
4428 
4429 # endif /* KMP_OS_WINDOWS */
4430 
4431 }
4432 
4433 int
4434 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4435 {
4436  int retval;
4437 
4438  if (! KMP_AFFINITY_CAPABLE()) {
4439  return -1;
4440  }
4441 
4442  KA_TRACE(1000, ;{
4443  int gtid = __kmp_entry_gtid();
4444  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4445  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4446  (kmp_affin_mask_t *)(*mask));
4447  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4448  proc, gtid, buf);
4449  });
4450 
4451  if (__kmp_env_consistency_check) {
4452  if ((mask == NULL) || (*mask == NULL)) {
4453  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4454  }
4455  }
4456 
4457  if ((proc < 0)
4458 # if !KMP_USE_HWLOC
4459  || ((unsigned)proc >= KMP_CPU_SETSIZE)
4460 # endif
4461  ) {
4462  return -1;
4463  }
4464  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4465  return -2;
4466  }
4467 
4468  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4469  return 0;
4470 }
4471 
4472 
4473 int
4474 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4475 {
4476  int retval;
4477 
4478  if (! KMP_AFFINITY_CAPABLE()) {
4479  return -1;
4480  }
4481 
4482  KA_TRACE(1000, ;{
4483  int gtid = __kmp_entry_gtid();
4484  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4485  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4486  (kmp_affin_mask_t *)(*mask));
4487  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4488  proc, gtid, buf);
4489  });
4490 
4491  if (__kmp_env_consistency_check) {
4492  if ((mask == NULL) || (*mask == NULL)) {
4493  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4494  }
4495  }
4496 
4497  if ((proc < 0)
4498 # if !KMP_USE_HWLOC
4499  || ((unsigned)proc >= KMP_CPU_SETSIZE)
4500 # endif
4501  ) {
4502  return -1;
4503  }
4504  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4505  return -2;
4506  }
4507 
4508  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4509  return 0;
4510 }
4511 
4512 
4513 int
4514 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4515 {
4516  int retval;
4517 
4518  if (! KMP_AFFINITY_CAPABLE()) {
4519  return -1;
4520  }
4521 
4522  KA_TRACE(1000, ;{
4523  int gtid = __kmp_entry_gtid();
4524  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4525  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4526  (kmp_affin_mask_t *)(*mask));
4527  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4528  proc, gtid, buf);
4529  });
4530 
4531  if (__kmp_env_consistency_check) {
4532  if ((mask == NULL) || (*mask == NULL)) {
4533  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4534  }
4535  }
4536 
4537  if ((proc < 0)
4538 # if !KMP_USE_HWLOC
4539  || ((unsigned)proc >= KMP_CPU_SETSIZE)
4540 # endif
4541  ) {
4542  return -1;
4543  }
4544  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4545  return 0;
4546  }
4547 
4548  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4549 }
4550 
4551 
4552 // Dynamic affinity settings - Affinity balanced
4553 void __kmp_balanced_affinity( int tid, int nthreads )
4554 {
4555  if( __kmp_affinity_uniform_topology() ) {
4556  int coreID;
4557  int threadID;
4558  // Number of hyper-threads per core on an HT machine
4559  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4560  // Number of cores
4561  int ncores = __kmp_ncores;
4562  // How many threads will be bound to each core
4563  int chunk = nthreads / ncores;
4564  // How many cores will have an additional thread bound to them - the "big cores"
4565  int big_cores = nthreads % ncores;
4566  // Number of threads on the big cores
4567  int big_nth = ( chunk + 1 ) * big_cores;
4568  if( tid < big_nth ) {
4569  coreID = tid / (chunk + 1 );
4570  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4571  } else { //tid >= big_nth
4572  coreID = ( tid - big_cores ) / chunk;
4573  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4574  }
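 // Example: nthreads == 10 on ncores == 4 gives chunk == 2, big_cores == 2
 // and big_nth == 6, so tids 0-5 land on cores 0-1 (three threads each)
 // and tids 6-9 land on cores 2-3 (two threads each).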
4575 
4576  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4577  "Illegal set affinity operation when not capable");
4578 
4579  kmp_affin_mask_t *mask;
4580  KMP_CPU_ALLOC_ON_STACK(mask);
4581  KMP_CPU_ZERO(mask);
4582 
4583  // Granularity == thread
4584  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4585  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4586  KMP_CPU_SET( osID, mask);
4587  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4588  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4589  int osID;
4590  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4591  KMP_CPU_SET( osID, mask);
4592  }
4593  }
4594  if (__kmp_affinity_verbose) {
4595  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4596  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4597  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4598  tid, buf);
4599  }
4600  __kmp_set_system_affinity( mask, TRUE );
4601  KMP_CPU_FREE_FROM_STACK(mask);
4602  } else { // Non-uniform topology
4603 
4604  kmp_affin_mask_t *mask;
4605  KMP_CPU_ALLOC_ON_STACK(mask);
4606  KMP_CPU_ZERO(mask);
4607 
4608  // Number of hyper-threads per core on an HT machine
4609  int nth_per_core = __kmp_nThreadsPerCore;
4610  int core_level;
4611  if( nth_per_core > 1 ) {
4612  core_level = __kmp_aff_depth - 2;
4613  } else {
4614  core_level = __kmp_aff_depth - 1;
4615  }
4616 
4617  // Number of cores - maximum value; it does not count trailing cores with 0 processors
4618  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4619 
4620  // For performance gain consider the special case nthreads == __kmp_avail_proc
4621  if( nthreads == __kmp_avail_proc ) {
4622  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4623  int osID = address2os[ tid ].second;
4624  KMP_CPU_SET( osID, mask);
4625  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4626  int coreID = address2os[ tid ].first.labels[ core_level ];
4627  // Count the osIDs found for the current core; there can be at most nth_per_core of them;
4628  // since address2os is sorted we can break when cnt == nth_per_core
4629  int cnt = 0;
4630  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4631  int osID = address2os[ i ].second;
4632  int core = address2os[ i ].first.labels[ core_level ];
4633  if( core == coreID ) {
4634  KMP_CPU_SET( osID, mask);
4635  cnt++;
4636  if( cnt == nth_per_core ) {
4637  break;
4638  }
4639  }
4640  }
4641  }
4642  } else if( nthreads <= __kmp_ncores ) {
4643 
4644  int core = 0;
4645  for( int i = 0; i < ncores; i++ ) {
4646  // Check whether this core has any available procs in procarr[]
4647  int in_mask = 0;
4648  for( int j = 0; j < nth_per_core; j++ ) {
4649  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4650  in_mask = 1;
4651  break;
4652  }
4653  }
4654  if( in_mask ) {
4655  if( tid == core ) {
4656  for( int j = 0; j < nth_per_core; j++ ) {
4657  int osID = procarr[ i * nth_per_core + j ];
4658  if( osID != -1 ) {
4659  KMP_CPU_SET( osID, mask );
4660  // For granularity=thread it is enough to set the first available osID for this core
4661  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4662  break;
4663  }
4664  }
4665  }
4666  break;
4667  } else {
4668  core++;
4669  }
4670  }
4671  }
4672 
4673  } else { // nthreads > __kmp_ncores
4674 
4675  // Array to save the number of processors at each core
4676  int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4677  // Array to save the number of cores with "x" available processors;
4678  int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4679  // Array to save the number of cores with at least "x" available processors (i.e. "x" to nth_per_core procs)
4680  int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4681 
4682  for( int i = 0; i <= nth_per_core; i++ ) {
4683  ncores_with_x_procs[ i ] = 0;
4684  ncores_with_x_to_max_procs[ i ] = 0;
4685  }
4686 
4687  for( int i = 0; i < ncores; i++ ) {
4688  int cnt = 0;
4689  for( int j = 0; j < nth_per_core; j++ ) {
4690  if( procarr[ i * nth_per_core + j ] != -1 ) {
4691  cnt++;
4692  }
4693  }
4694  nproc_at_core[ i ] = cnt;
4695  ncores_with_x_procs[ cnt ]++;
4696  }
4697 
4698  for( int i = 0; i <= nth_per_core; i++ ) {
4699  for( int j = i; j <= nth_per_core; j++ ) {
4700  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4701  }
4702  }
4703 
4704  // Max number of processors
4705  int nproc = nth_per_core * ncores;
4706  // An array to keep the number of threads assigned to each thread context
4707  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4708  for( int i = 0; i < nproc; i++ ) {
4709  newarr[ i ] = 0;
4710  }
4711 
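 // Distribute the threads over the available contexts as evenly as
 // possible: on the first sweep (flag == 0) each context receives at most
 // one thread; later sweeps (flag != 0) stack additional threads onto
 // contexts that are already in use.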
4712  int nth = nthreads;
4713  int flag = 0;
4714  while( nth > 0 ) {
4715  for( int j = 1; j <= nth_per_core; j++ ) {
4716  int cnt = ncores_with_x_to_max_procs[ j ];
4717  for( int i = 0; i < ncores; i++ ) {
4718  // Skip the core with 0 processors
4719  if( nproc_at_core[ i ] == 0 ) {
4720  continue;
4721  }
4722  for( int k = 0; k < nth_per_core; k++ ) {
4723  if( procarr[ i * nth_per_core + k ] != -1 ) {
4724  if( newarr[ i * nth_per_core + k ] == 0 ) {
4725  newarr[ i * nth_per_core + k ] = 1;
4726  cnt--;
4727  nth--;
4728  break;
4729  } else {
4730  if( flag != 0 ) {
4731  newarr[ i * nth_per_core + k ] ++;
4732  cnt--;
4733  nth--;
4734  break;
4735  }
4736  }
4737  }
4738  }
4739  if( cnt == 0 || nth == 0 ) {
4740  break;
4741  }
4742  }
4743  if( nth == 0 ) {
4744  break;
4745  }
4746  }
4747  flag = 1;
4748  }
4749  int sum = 0;
4750  for( int i = 0; i < nproc; i++ ) {
4751  sum += newarr[ i ];
4752  if( sum > tid ) {
4753  // Granularity == thread
4754  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4755  int osID = procarr[ i ];
4756  KMP_CPU_SET( osID, mask);
4757  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4758  int coreID = i / nth_per_core;
4759  for( int ii = 0; ii < nth_per_core; ii++ ) {
4760  int osID = procarr[ coreID * nth_per_core + ii ];
4761  if( osID != -1 ) {
4762  KMP_CPU_SET( osID, mask);
4763  }
4764  }
4765  }
4766  break;
4767  }
4768  }
4769  __kmp_free( newarr );
4770  }
4771 
4772  if (__kmp_affinity_verbose) {
4773  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4774  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4775  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4776  tid, buf);
4777  }
4778  __kmp_set_system_affinity( mask, TRUE );
4779  KMP_CPU_FREE_FROM_STACK(mask);
4780  }
4781 }
4782 
4783 #if KMP_OS_LINUX
4784 // We don't need this entry for Windows because
4785 // there is the GetProcessAffinityMask() API
4786 //
4787 // The intended usage is indicated by these steps (sketched below):
4788 // 1) The user gets the current affinity mask
4789 // 2) Then sets the affinity by calling this function
4790 // 3) Error check the return value
4791 // 4) Use non-OpenMP parallelization
4792 // 5) Reset the affinity to what was stored in step 1)
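//
// A minimal sketch of that sequence (assuming Linux with glibc; error
// handling and the non-OpenMP parallel work itself are elided):
//
//   cpu_set_t saved;
//   sched_getaffinity(0, sizeof(saved), &saved);      // step 1: save the current mask
//   if (kmp_set_thread_affinity_mask_initial() != 0)  // steps 2-3: widen to the full mask
//       /* handle the error */;
//   /* step 4: run the non-OpenMP parallel work here */
//   sched_setaffinity(0, sizeof(saved), &saved);      // step 5: restore the saved mask
//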
4793 #ifdef __cplusplus
4794 extern "C"
4795 #endif
4796 int
4797 kmp_set_thread_affinity_mask_initial()
4798 // the function returns 0 on success,
4799 // -1 if we cannot bind thread
4800 // >0 (errno) if an error happened during binding
4801 {
4802  int gtid = __kmp_get_gtid();
4803  if (gtid < 0) {
4804  // Do not touch non-omp threads
4805  KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4806  "non-omp thread, returning\n"));
4807  return -1;
4808  }
4809  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
4810  KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4811  "affinity not initialized, returning\n"));
4812  return -1;
4813  }
4814  KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4815  "set full mask for thread %d\n", gtid));
4816  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
4817  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
4818 }
4819 #endif
4820 
4821 #endif // KMP_AFFINITY_SUPPORTED