util: completely rewrite AMD Zen L3 cache pinning so it is done correctly

Instead of assuming that CPUs sharing an L3 cache have consecutive indices,
query the cache topology properly: read the L3 core count via CPUID leaf
0x8000001D, determine each CPU's APIC ID by temporarily pinning the current
thread to it, and build an explicit affinity mask per L3 cache
(util_cpu_caps.L3_affinity_mask) plus a CPU-to-L3 lookup table.

Acked-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7054>
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index f9d22c3..d8dcad5 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -2002,8 +2002,9 @@
 
    if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) {
       /* Pin the gallium thread as requested. */
-      util_pin_thread_to_L3(tc->queue.threads[0], value,
-                            util_cpu_caps.cores_per_L3);
+      util_set_thread_affinity(tc->queue.threads[0],
+                               util_cpu_caps.L3_affinity_mask[value],
+                               NULL, UTIL_MAX_CPUS);
 
       /* Execute this immediately (without enqueuing).
        * It's required to be thread-safe.
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index d554ea5..8a0aedf 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -311,8 +311,9 @@
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
 
-   util_pin_thread_to_L3(ws->cs_queue.threads[0], cache,
-                         util_cpu_caps.cores_per_L3);
+   util_set_thread_affinity(ws->cs_queue.threads[0],
+                            util_cpu_caps.L3_affinity_mask[cache],
+                            NULL, UTIL_MAX_CPUS);
 }
 
 static uint32_t kms_handle_hash(const void *key)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index b9a092d..569d273 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -798,8 +798,9 @@
    struct radeon_drm_winsys *rws = (struct radeon_drm_winsys*)ws;
 
    if (util_queue_is_initialized(&rws->cs_queue)) {
-      util_pin_thread_to_L3(rws->cs_queue.threads[0], cache,
-            util_cpu_caps.cores_per_L3);
+      util_set_thread_affinity(rws->cs_queue.threads[0],
+                               util_cpu_caps.L3_affinity_mask[cache],
+                               NULL, UTIL_MAX_CPUS);
    }
 }
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 0e22d0c..676db28 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -139,7 +139,7 @@
                 ++st->pin_thread_counter % 512 == 0)) {
       int cpu = util_get_current_cpu();
       if (cpu >= 0) {
-         unsigned L3_cache = cpu / util_cpu_caps.cores_per_L3;
+         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
 
          pipe->set_context_param(pipe,
                                  PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c
index ab06495..af3663a 100644
--- a/src/util/u_cpu_detect.c
+++ b/src/util/u_cpu_detect.c
@@ -37,8 +37,12 @@
 
 #include "util/u_debug.h"
 #include "u_cpu_detect.h"
+#include "u_math.h"
 #include "c11/threads.h"
 
+#include <stdio.h>
+#include <inttypes.h>
+
 #if defined(PIPE_ARCH_PPC)
 #if defined(PIPE_OS_APPLE)
 #include <sys/sysctl.h>
@@ -83,9 +87,7 @@
 #endif
 
 
-#ifdef DEBUG
 DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
-#endif
 
 
 struct util_cpu_caps util_cpu_caps;
@@ -432,21 +434,104 @@
 static void
 get_cpu_topology(void)
 {
-   /* Default. This is correct if L3 is not present or there is only one. */
+   /* Default. This is OK if L3 is not present or there is only one. */
    util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
+   util_cpu_caps.num_L3_caches = 1;
 
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    /* AMD Zen */
    if (util_cpu_caps.x86_cpu_type == 0x17) {
       uint32_t regs[4];
 
-      /* Query the L3 cache topology information. */
+      /* Query the L3 cache count. */
       cpuid_count(0x8000001D, 3, regs);
       unsigned cache_level = (regs[0] >> 5) & 0x7;
-      unsigned cores_per_cache = ((regs[0] >> 14) & 0xfff) + 1;
+      unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
 
-      if (cache_level == 3)
-         util_cpu_caps.cores_per_L3 = cores_per_cache;
+      if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
+         return;
+
+      uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t apic_id[UTIL_MAX_CPUS];
+      bool saved = false;
+
+      /* Query APIC IDs from each CPU core.
+       *
+       * An APIC ID is a logical ID of the CPU with respect to the cache
+       * hierarchy, meaning that consecutive APIC IDs are neighbours in
+       * the hierarchy, e.g. sharing the same cache.
+       *
+       * For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
+       * which means that both CPU 0 and 12 are next to each other.
+       * (e.g. they are 2 threads belonging to 1 SMT2 core)
+       *
+       * We need to find out which CPUs share the same L3 cache and they can
+       * be all over the place.
+       *
+       * Querying the APIC ID can only be done by pinning the current thread
+       * to each core. The original affinity mask is saved.
+       */
+      for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
+           i++) {
+         uint32_t cpu_bit = 1u << (i % 32);
+
+         mask[i / 32] = cpu_bit;
+
+         if (util_set_current_thread_affinity(mask,
+                                              !saved ? saved_mask : NULL,
+                                              UTIL_MAX_CPUS)) {
+            saved = true;
+            allowed_mask[i / 32] |= cpu_bit;
+
+            /* Query the APIC ID of the current core. */
+            cpuid(0x00000001, regs);
+            apic_id[i] = regs[1] >> 24;
+         }
+         mask[i / 32] = 0;
+      }
+
+      if (saved) {
+
+         /* We succeeded in using at least one CPU. */
+         util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
+         util_cpu_caps.cores_per_L3 = cores_per_L3;
+         util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
+                                                 util_cpu_caps.num_L3_caches);
+
+         for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
+              i++) {
+            uint32_t cpu_bit = 1u << (i % 32);
+
+            if (allowed_mask[i / 32] & cpu_bit) {
+               /* Each APIC ID bit represents a topology level, so we need
+                * to round up to the next power of two.
+                */
+               unsigned L3_index = apic_id[i] /
+                                   util_next_power_of_two(cores_per_L3);
+
+               util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
+               util_cpu_caps.cpu_to_L3[i] = L3_index;
+            }
+         }
+
+         if (debug_get_option_dump_cpu()) {
+            fprintf(stderr, "CPU <-> L3 cache mapping:\n");
+            for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
+               fprintf(stderr, "  - L3 %u mask = ", i);
+               for (int j = util_cpu_caps.nr_cpus - 1; j >= 0; j -= 32)
+                  fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
+               fprintf(stderr, "\n");
+            }
+         }
+
+         /* Restore the original affinity mask. */
+         util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS);
+      } else {
+         if (debug_get_option_dump_cpu())
+            fprintf(stderr, "Cannot set thread affinity for any thread.\n");
+      }
    }
 #endif
 }
@@ -606,7 +691,6 @@
 
    get_cpu_topology();
 
-#ifdef DEBUG
    if (debug_get_option_dump_cpu()) {
       debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
 
@@ -643,7 +727,6 @@
       debug_printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
       debug_printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
    }
-#endif
 }
 
 static once_flag cpu_once_flag = ONCE_FLAG_INIT;
diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h
index a09aca8..2e47ee6 100644
--- a/src/util/u_cpu_detect.h
+++ b/src/util/u_cpu_detect.h
@@ -37,12 +37,14 @@
 
 
 #include "pipe/p_config.h"
+#include "util/u_thread.h"
 
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
+typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
 
 struct util_cpu_caps {
    int nr_cpus;
@@ -50,7 +52,6 @@
    /* Feature flags */
    int x86_cpu_type;
    unsigned cacheline;
-   unsigned cores_per_L3;
 
    unsigned has_intel:1;
    unsigned has_tsc:1;
@@ -84,6 +85,13 @@
    unsigned has_avx512bw:1;
    unsigned has_avx512vl:1;
    unsigned has_avx512vbmi:1;
+
+   unsigned num_L3_caches;
+   unsigned cores_per_L3;
+
+   uint16_t cpu_to_L3[UTIL_MAX_CPUS];
+   /* Affinity masks for each L3 cache. */
+   util_affinity_mask *L3_affinity_mask;
 };
 
 extern struct util_cpu_caps
diff --git a/src/util/u_thread.h b/src/util/u_thread.h
index 93d8b0f..bdfb05e 100644
--- a/src/util/u_thread.h
+++ b/src/util/u_thread.h
@@ -62,6 +62,7 @@
 
 /* For util_set_thread_affinity to size the mask. */
 #define UTIL_MAX_CPUS               1024  /* this should be enough */
+#define UTIL_MAX_L3_CACHES          UTIL_MAX_CPUS
 
 static inline int
 util_get_current_cpu(void)
@@ -198,33 +199,6 @@
 #endif
 }
 
-/**
- * An AMD Zen CPU consists of multiple modules where each module has its own L3
- * cache. Inter-thread communication such as locks and atomics between modules
- * is very expensive. It's desirable to pin a group of closely cooperating
- * threads to one group of cores sharing L3.
- *
- * \param thread        thread
- * \param L3_index      index of the L3 cache
- * \param cores_per_L3  number of CPU cores shared by one L3
- */
-static inline bool
-util_pin_thread_to_L3(thrd_t thread, unsigned L3_index, unsigned cores_per_L3)
-{
-   unsigned num_mask_bits = DIV_ROUND_UP((L3_index + 1) * cores_per_L3, 32);
-   uint32_t mask[UTIL_MAX_CPUS / 32];
-
-   assert((L3_index + 1) * cores_per_L3 <= UTIL_MAX_CPUS);
-
-   for (unsigned i = 0; i < cores_per_L3; i++) {
-      unsigned core = L3_index * cores_per_L3 + i;
-
-      mask[core / 32] |= 1u << (core % 32);
-   }
-
-   return util_set_thread_affinity(thread, mask, NULL, num_mask_bits);
-}
-
 
 /*
  * Thread statistics.