Add POSIX implementation for platform/numa.h functions, relying
on hwloc.

PiperOrigin-RevId: 235742876
diff --git a/.bazelrc b/.bazelrc
index 17285af..1741091 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -67,6 +67,7 @@
 build:gdr --define=with_gdr_support=true
 build:ngraph --define=with_ngraph_support=true
 build:verbs --define=with_verbs_support=true
+build:numa --define=with_numa_support=true
 
 # Options to disable default on features
 build:noaws --define=no_aws_support=true
diff --git a/configure.py b/configure.py
index 3eb09a1..673825c 100644
--- a/configure.py
+++ b/configure.py
@@ -1751,6 +1751,7 @@
   config_info_line('gdr', 'Build with GDR support.')
   config_info_line('verbs', 'Build with libverbs support.')
   config_info_line('ngraph', 'Build with Intel nGraph support.')
+  config_info_line('numa', 'Build with NUMA support.')
   config_info_line(
       'dynamic_kernels',
       '(Experimental) Build kernels into separate shared objects.')
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index f53982f..e1d988a 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -304,6 +304,12 @@
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_numa_support",
+    define_values = {"with_numa_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between framework_shared_object and a bunch of other configurations
 # due to limitations in nested select() statements.
 config_setting(
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 64aed37..8f5de68 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -128,6 +128,9 @@
     "tf_additional_libdevice_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
+    "tf_additional_numa_deps",
+    "tf_additional_numa_lib_defines",
+    "tf_additional_numa_copts",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_test_deps",
@@ -388,15 +391,15 @@
         ":platform_port_hdrs",
         ":platform_port_internal_hdrs",
     ],
-    copts = tf_copts(),
+    copts = tf_copts() + tf_additional_numa_copts(),
     visibility = ["//tensorflow/core:__subpackages__"],
     deps = [
         ":lib_platform",
         ":platform_base",
-        "//tensorflow/core/platform/default/build_config:port",
         "@com_google_absl//absl/base",
+        "//tensorflow/core/platform/default/build_config:port",
         "@snappy",
-    ],
+    ] + tf_additional_numa_deps(),
 )
 
 filegroup(
@@ -2278,11 +2281,14 @@
 ]
 
 # Replicated for lib_internal and lib_internal_impl.
-LIB_INTERNAL_DEFINES = (tf_additional_lib_defines() + [
-                            "TF_USE_SNAPPY",
-                        ] + tf_additional_verbs_lib_defines() +
-                        tf_additional_mpi_lib_defines() +
-                        tf_additional_gdr_lib_defines())
+LIB_INTERNAL_DEFINES = (
+    tf_additional_lib_defines() + [
+        "TF_USE_SNAPPY",
+    ] + tf_additional_verbs_lib_defines() +
+    tf_additional_mpi_lib_defines() +
+    tf_additional_gdr_lib_defines() +
+    tf_additional_numa_lib_defines()
+)
 
 cc_library(
     name = "lib_internal",
@@ -2355,19 +2361,20 @@
     copts = tf_copts(),
     defines = LIB_INTERNAL_DEFINES,
     deps = tf_additional_lib_deps() + [
-        ":lib_hash_crc32c_accelerate_internal",
-        ":lib_proto_parsing",
-        ":abi",
-        ":core_stringpiece",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "//third_party/eigen3",
-        "//tensorflow/core/platform/default/build_config:platformlib",
-        "@snappy",
-        "@zlib_archive//:zlib",
-        "@double_conversion//:double-conversion",
-        "@protobuf_archive//:protobuf",
-    ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
+               ":lib_hash_crc32c_accelerate_internal",
+               ":lib_proto_parsing",
+               ":abi",
+               ":core_stringpiece",
+               "@com_google_absl//absl/memory",
+               "@com_google_absl//absl/strings",
+               "//third_party/eigen3",
+               "//tensorflow/core/platform/default/build_config:platformlib",
+               "@snappy",
+               "@zlib_archive//:zlib",
+               "@double_conversion//:double-conversion",
+               "@protobuf_archive//:protobuf",
+           ] + tf_protos_all_impl() + tf_protos_grappler_impl() +
+           tf_additional_numa_deps(),
 )
 
 # File compiled with extra flags to get cpu-specific acceleration.
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index f9ac4ff..f6f449a 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -725,6 +725,12 @@
         "//conditions:default": [],
     })
 
+def tf_additional_numa_lib_defines():
+    return select({
+        "//tensorflow:with_numa_support": ["TENSORFLOW_USE_NUMA"],
+        "//conditions:default": [],
+    })
+
 def tf_py_clif_cc(name, visibility = None, **kwargs):
     pass
 
@@ -757,3 +763,30 @@
             "//third_party/mkl:intel_binary_blob",
         ],
     )
+
+# Returns the hwloc dependency required by the NUMA implementation in
+# tensorflow/core/platform/posix/port.cc, on platforms where it is used.
+def tf_additional_numa_deps():
+    return select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:darwin": [],
+        "//conditions:default": [
+            "@hwloc",
+        ],
+    })
+
+# Returns copts that enable the hwloc-based NUMA code path and put the
+# vendored hwloc headers on the include path, on platforms where it is used.
+def tf_additional_numa_copts():
+    return select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:darwin": [],
+        "//conditions:default": [
+            "-Ithird_party/hwloc/hwloc-master/include",
+            "-DTENSORFLOW_USE_NUMA",
+        ],
+    })
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 807e008..1561632 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -45,6 +45,10 @@
 #include <thread>
 #endif
 
+#ifdef TENSORFLOW_USE_NUMA
+#include "hwloc.h"  // TF:hwloc
+#endif
+
 namespace tensorflow {
 namespace port {
 
@@ -115,16 +119,94 @@
   return (ht_per_core > 0) ? ht_per_core : 1;
 }
 
-bool NUMAEnabled() {
-  // Not yet implemented: coming soon.
-  return false;
+#ifdef TENSORFLOW_USE_NUMA
+namespace {
+static hwloc_topology_t hwloc_topology_handle;
+
+bool HaveHWLocTopology() {
+  // One time initialization
+  static bool init = []() {
+    if (hwloc_topology_init(&hwloc_topology_handle)) {
+      LOG(ERROR) << "Call to hwloc_topology_init() failed";
+      return false;
+    }
+    if (hwloc_topology_load(hwloc_topology_handle)) {
+      LOG(ERROR) << "Call to hwloc_topology_load() failed";
+      return false;
+    }
+    return true;
+  }();
+  return init;
 }
 
-int NUMANumNodes() { return 1; }
+// Return the first hwloc object of the given type whose os_index
+// matches 'index'.
+hwloc_obj_t GetHWLocTypeIndex(hwloc_obj_type_t tp, int index) {
+  hwloc_obj_t obj = nullptr;
+  if (index >= 0) {
+    while ((obj = hwloc_get_next_obj_by_type(hwloc_topology_handle, tp, obj)) !=
+           nullptr) {
+      if (obj->os_index == static_cast<unsigned>(index)) break;
+    }
+  }
+  return obj;
+}
+}  // namespace
+#endif  // TENSORFLOW_USE_NUMA
 
-void NUMASetThreadNodeAffinity(int node) {}
+bool NUMAEnabled() { return (NUMANumNodes() > 1); }
 
-int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
+int NUMANumNodes() {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    int num_numanodes =
+        hwloc_get_nbobjs_by_type(hwloc_topology_handle, HWLOC_OBJ_NUMANODE);
+    return std::max(1, num_numanodes);
+  } else {
+    return 1;
+  }
+#else
+  return 1;
+#endif  // TENSORFLOW_USE_NUMA
+}
+
+void NUMASetThreadNodeAffinity(int node) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    // Find the corresponding NUMA node topology object.
+    hwloc_obj_t obj = GetHWLocTypeIndex(HWLOC_OBJ_NUMANODE, node);
+    if (obj) {
+      hwloc_set_cpubind(hwloc_topology_handle, obj->cpuset,
+                        HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
+    } else {
+      LOG(ERROR) << "Could not find hwloc NUMA node " << node;
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
+}
+
+int NUMAGetThreadNodeAffinity() {
+  int node_index = kNUMANoAffinity;
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_cpuset_t thread_cpuset = hwloc_bitmap_alloc();
+    const int err = hwloc_get_cpubind(hwloc_topology_handle, thread_cpuset,
+                                      HWLOC_CPUBIND_THREAD);
+    hwloc_obj_t obj = nullptr;
+    // Return the first NUMA node whose cpuset is a (non-proper) superset of
+    // that of the current thread. Skip the scan if cpubind failed.
+    while (err == 0 && (obj = hwloc_get_next_obj_by_type(
+                hwloc_topology_handle, HWLOC_OBJ_NUMANODE, obj)) != nullptr) {
+      if (hwloc_bitmap_isincluded(thread_cpuset, obj->cpuset)) {
+        node_index = obj->os_index;
+        break;
+      }
+    }
+    hwloc_bitmap_free(thread_cpuset);
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  return node_index;
+}
 
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
@@ -154,12 +236,54 @@
 void Free(void* ptr) { free(ptr); }
 
 void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_obj_t numa_node = GetHWLocTypeIndex(HWLOC_OBJ_NUMANODE, node);
+    if (numa_node) {
+      return hwloc_alloc_membind(hwloc_topology_handle, size,
+                                 numa_node->nodeset, HWLOC_MEMBIND_BIND,
+                                 HWLOC_MEMBIND_BYNODESET);
+    } else {
+      LOG(ERROR) << "Failed to find hwloc NUMA node " << node;
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
   return AlignedMalloc(size, minimum_alignment);
 }
 
-void NUMAFree(void* ptr, size_t size) { Free(ptr); }
+void NUMAFree(void* ptr, size_t size) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_free(hwloc_topology_handle, ptr, size);
+    return;
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  Free(ptr);
+}
 
-int NUMAGetMemAffinity(const void* addr) { return kNUMANoAffinity; }
+int NUMAGetMemAffinity(const void* addr) {
+  int node = kNUMANoAffinity;
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology() && addr) {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (!hwloc_get_area_memlocation(hwloc_topology_handle, addr, /*len=*/1,
+                                    nodeset, HWLOC_MEMBIND_BYNODESET)) {
+      hwloc_obj_t obj = nullptr;
+      while ((obj = hwloc_get_next_obj_by_type(
+                  hwloc_topology_handle, HWLOC_OBJ_NUMANODE, obj)) != nullptr) {
+        if (hwloc_bitmap_isincluded(nodeset, obj->nodeset)) {
+          node = obj->os_index;
+          break;
+        }
+      }
+      hwloc_bitmap_free(nodeset);
+    } else {
+      LOG(ERROR) << "Failed call to hwloc_get_area_memlocation.";
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  return node;
+}
 
 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
   // No-op.
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 0c81ebe..525c05b 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -130,6 +130,7 @@
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:LICENSE",
         "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
@@ -199,6 +200,7 @@
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:LICENSE",
         "@icu//:icu4j/main/shared/licenses/LICENSE",
         "@jpeg//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 90dfca2..88f13a0 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -171,6 +171,7 @@
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:LICENSE",
         "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
         "@keras_applications_archive//:LICENSE",