XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 1 | // Copyright 2019 Google LLC |
| 2 | // |
| 3 | // This source code is licensed under the BSD-style license found in the |
| 4 | // LICENSE file in the root directory of this source tree. |
| 5 | |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 6 | #include <pthread.h> |
| 7 | #include <sched.h> |
Marat Dukhan | 452662b | 2019-10-03 00:14:39 -0700 | [diff] [blame] | 8 | #ifdef __ANDROID__ |
| 9 | #include <malloc.h> |
| 10 | #endif |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 11 | #if defined(__SSE__) || defined(__x86_64__) |
| 12 | #include <xmmintrin.h> |
| 13 | #endif |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 14 | |
| 15 | #include <cstdio> |
| 16 | #include <cstdlib> |
| 17 | #include <cstring> |
| 18 | |
| 19 | #include <cpuinfo.h> |
| 20 | |
| 21 | #include "bench/utils.h" |
| 22 | |
| 23 | |
// Buffer that WipeCache() passes to PrefetchToL1(). It is allocated and filled by
// InitWipeBuffer() and is nullptr until that initializer has run.
static void* wipe_buffer = nullptr;
// Size of wipe_buffer in bytes, as chosen by InitWipeBuffer().
static size_t wipe_buffer_size = 0;

// pthread_once guard used by WipeCache() so that InitWipeBuffer() is executed only once.
static pthread_once_t wipe_buffer_guard = PTHREAD_ONCE_INIT;
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 28 | |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 29 | static void InitWipeBuffer() { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 30 | // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache). |
| 31 | wipe_buffer_size = 128 * 1024 * 1024; |
| 32 | if (cpuinfo_initialize()) { |
Marat Dukhan | d62f3cc | 2019-10-01 12:37:52 -0700 | [diff] [blame] | 33 | wipe_buffer_size = benchmark::utils::GetMaxCacheSize(); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 34 | } |
| 35 | #if defined(__ANDROID__) |
| 36 | // memalign is obsolete, but it is the only option on Android until API level 17. |
| 37 | wipe_buffer = memalign(128, wipe_buffer_size); |
| 38 | #else |
| 39 | (void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size); |
| 40 | #endif |
| 41 | if (wipe_buffer != nullptr) { |
| 42 | memset(wipe_buffer, 0xA5, wipe_buffer_size); |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | namespace benchmark { |
| 47 | namespace utils { |
| 48 | |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 49 | uint32_t PrefetchToL1(const void* ptr, size_t size) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 50 | uint32_t step = 16; |
| 51 | if (cpuinfo_initialize()) { |
| 52 | step = cpuinfo_get_l1d_cache(0)->line_size; |
| 53 | } |
| 54 | const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr); |
| 55 | // Compute and return sum of data to prevent compiler from removing data reads. |
| 56 | uint32_t sum = 0; |
| 57 | while (size >= step) { |
| 58 | sum += uint32_t(*u8_ptr); |
| 59 | u8_ptr += step; |
| 60 | size -= step; |
| 61 | } |
| 62 | return sum; |
| 63 | } |
| 64 | |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 65 | uint32_t WipeCache() { |
| 66 | pthread_once(&wipe_buffer_guard, &InitWipeBuffer); |
| 67 | return PrefetchToL1(wipe_buffer, wipe_buffer_size); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 68 | } |
| 69 | |
// Sets the denormal-handling control bits of the floating-point control register for the
// architecture being compiled for. Compiles to an empty function on architectures not listed
// in the #if chain below.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  // MXCSR: 0x8000 is the flush-to-zero bit, 0x0040 is the denormals-are-zero bit.
  _mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  // AArch32 with hardware floating point: read FPSCR, set bit 24 (FZ), write it back.
  uint32_t fpscr;
  __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORR %[fpscr], #0x1000000\n"
      "VMSR fpscr, %[fpscr]\n"
    : [fpscr] "=r" (fpscr));
#elif defined(__aarch64__)
  // AArch64: read FPCR, set bit 24 (FZ) and bit 19 (FZ16), write it back.
  uint64_t fpcr;
  __asm__ __volatile__(
      "MRS %[fpcr], fpcr\n"
      "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
      "ORR %w[fpcr], %w[fpcr], 0x80000\n"
      "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (fpcr));
#endif
}
| 90 | |
// Returns the current clock rate, in Hz, of the CPU the calling thread is running on.
//
// On Linux the value is read from
//   /sys/devices/system/cpu/cpu<N>/cpufreq/scaling_cur_freq
// (reported in kHz, hence the multiplication by 1000).
//
// Returns 0 when the frequency cannot be determined: on non-Linux platforms, when the current
// CPU cannot be identified, or when the sysfs file is missing, unreadable, or does not contain a
// positive integer.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu >= 0) {
    char cpuinfo_name[512];
    snprintf(cpuinfo_name, sizeof(cpuinfo_name),
             "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

    FILE* f = fopen(cpuinfo_name, "r");
    if (f != nullptr) {
      int freq = 0;
      // fscanf returns the number of converted items, or EOF (a non-zero value) on input
      // failure, so it must be compared against the expected count rather than tested as a bool.
      const int converted = fscanf(f, "%d", &freq);
      fclose(f);
      // Reject non-positive values: a negative int would wrap to a huge uint64_t.
      if (converted == 1 && freq > 0) {
        return uint64_t(freq) * 1000;
      }
    }
  }
#endif  // __linux__
  return 0;
}
| 111 | |
Marat Dukhan | d62f3cc | 2019-10-01 12:37:52 -0700 | [diff] [blame] | 112 | size_t GetMaxCacheSize() { |
| 113 | if (!cpuinfo_initialize()) { |
| 114 | #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 |
| 115 | // DynamIQ max: 4 MB |
| 116 | return 4 * 1024 * 1024; |
| 117 | #else |
| 118 | // Intel eDRAM max: 128 MB |
| 119 | return 128 * 1024 * 1024; |
| 120 | #endif |
| 121 | } |
| 122 | const cpuinfo_processor* processor = cpuinfo_get_processor(0); |
| 123 | #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 |
| 124 | // There is no precise way to detect cache size on ARM/ARM64, and cache size reported by cpuinfo |
| 125 | // may underestimate the actual cache size. Thus, we use microarchitecture-specific maximum. |
| 126 | switch (processor->core->uarch) { |
| 127 | case cpuinfo_uarch_xscale: |
| 128 | case cpuinfo_uarch_arm11: |
| 129 | case cpuinfo_uarch_scorpion: |
| 130 | case cpuinfo_uarch_krait: |
| 131 | case cpuinfo_uarch_kryo: |
Marat Dukhan | 1f5d9bc | 2020-01-02 09:11:16 -0800 | [diff] [blame] | 132 | case cpuinfo_uarch_exynos_m1: |
| 133 | case cpuinfo_uarch_exynos_m2: |
| 134 | case cpuinfo_uarch_exynos_m3: |
Marat Dukhan | d62f3cc | 2019-10-01 12:37:52 -0700 | [diff] [blame] | 135 | // cpuinfo-detected cache size always correct. |
| 136 | break; |
| 137 | case cpuinfo_uarch_cortex_a5: |
| 138 | // Max observed (NXP Vybrid SoC) |
| 139 | return 512 * 1024; |
| 140 | case cpuinfo_uarch_cortex_a7: |
| 141 | // Cortex-A7 MPCore Technical Reference Manual: |
| 142 | // 7.1. About the L2 Memory system |
| 143 | // The L2 memory system consists of an: |
| 144 | // - Optional tightly-coupled L2 cache that includes: |
| 145 | // - Configurable L2 cache size of 128KB, 256KB, 512KB, and 1MB. |
| 146 | return 1024 * 1024; |
| 147 | case cpuinfo_uarch_cortex_a8: |
| 148 | // Cortex-A8 Technical Reference Manual: |
| 149 | // 8.1. About the L2 memory system |
| 150 | // The key features of the L2 memory system include: |
| 151 | // - configurable cache size of 0KB, 128KB, 256KB, 512KB, and 1MB |
| 152 | return 1024 * 1024; |
| 153 | case cpuinfo_uarch_cortex_a9: |
| 154 | // Max observed (e.g. Exynos 4212) |
| 155 | return 1024 * 1024; |
| 156 | case cpuinfo_uarch_cortex_a12: |
| 157 | case cpuinfo_uarch_cortex_a17: |
| 158 | // ARM Cortex-A17 MPCore Processor Technical Reference Manual: |
| 159 | // 7.1. About the L2 Memory system |
| 160 | // The key features of the L2 memory system include: |
| 161 | // - An integrated L2 cache: |
| 162 | // - The cache size is implemented as either 256KB, 512KB, 1MB, 2MB, 4MB or 8MB. |
| 163 | return 8 * 1024 * 1024; |
| 164 | case cpuinfo_uarch_cortex_a15: |
| 165 | // ARM Cortex-A15 MPCore Processor Technical Reference Manual: |
| 166 | // 7.1. About the L2 memory system |
| 167 | // The features of the L2 memory system include: |
| 168 | // - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB. |
| 169 | return 4 * 1024 * 1024; |
| 170 | case cpuinfo_uarch_cortex_a35: |
| 171 | // ARM Cortex‑A35 Processor Technical Reference Manual: |
| 172 | // 7.1 About the L2 memory system |
| 173 | // L2 cache |
| 174 | // - Further features of the L2 cache are: |
| 175 | // - Configurable size of 128KB, 256KB, 512KB, and 1MB. |
| 176 | return 1024 * 1024; |
| 177 | case cpuinfo_uarch_cortex_a53: |
| 178 | // ARM Cortex-A53 MPCore Processor Technical Reference Manual: |
| 179 | // 7.1. About the L2 memory system |
| 180 | // The L2 memory system consists of an: |
| 181 | // - Optional tightly-coupled L2 cache that includes: |
| 182 | // - Configurable L2 cache size of 128KB, 256KB, 512KB, 1MB and 2MB. |
| 183 | return 2 * 1024 * 1024; |
| 184 | case cpuinfo_uarch_cortex_a57: |
| 185 | // ARM Cortex-A57 MPCore Processor Technical Reference Manual: |
| 186 | // 7.1 About the L2 memory system |
| 187 | // The features of the L2 memory system include: |
| 188 | // - Configurable L2 cache size of 512KB, 1MB, and 2MB. |
| 189 | return 2 * 1024 * 1024; |
| 190 | case cpuinfo_uarch_cortex_a72: |
| 191 | // ARM Cortex-A72 MPCore Processor Technical Reference Manual: |
| 192 | // 7.1 About the L2 memory system |
| 193 | // The features of the L2 memory system include: |
| 194 | // - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB. |
| 195 | return 4 * 1024 * 1024; |
| 196 | case cpuinfo_uarch_cortex_a73: |
| 197 | // ARM Cortex‑A73 MPCore Processor Technical Reference Manual |
| 198 | // 7.1 About the L2 memory system |
| 199 | // The L2 memory system consists of: |
| 200 | // - A tightly-integrated L2 cache with: |
| 201 | // - A configurable size of 256KB, 512KB, 1MB, 2MB, 4MB, or 8MB. |
| 202 | return 8 * 1024 * 1024; |
| 203 | default: |
| 204 | // ARM DynamIQ Shared Unit Technical Reference Manual |
| 205 | // 1.3 Implementation options |
| 206 | // L3_CACHE_SIZE |
| 207 | // - 256KB |
| 208 | // - 512KB |
| 209 | // - 1024KB |
| 210 | // - 1536KB |
| 211 | // - 2048KB |
| 212 | // - 3072KB |
| 213 | // - 4096KB |
| 214 | return 4 * 1024 * 1024; |
| 215 | } |
| 216 | #endif |
| 217 | if (processor->cache.l4 != NULL) { |
| 218 | return processor->cache.l4->size; |
| 219 | } else if (processor->cache.l3 != NULL) { |
| 220 | return processor->cache.l3->size; |
| 221 | } else if (processor->cache.l2 != NULL) { |
| 222 | return processor->cache.l2->size; |
| 223 | } else if (processor->cache.l1d != NULL) { |
| 224 | return processor->cache.l1d->size; |
| 225 | } else { |
| 226 | return 0; |
| 227 | } |
| 228 | } |
| 229 | |
Marat Dukhan | bad48fe | 2019-11-04 10:35:22 -0800 | [diff] [blame] | 230 | void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) { |
| 231 | benchmark->ArgName("T"); |
| 232 | |
| 233 | // Disabled thread pool (execution on the caller thread only). |
| 234 | benchmark->Arg(1); |
| 235 | |
| 236 | if (cpuinfo_initialize()) { |
| 237 | // All cores except the little ones. |
| 238 | uint32_t max_cores = cpuinfo_get_cores_count(); |
| 239 | if (cpuinfo_get_clusters_count() > 1) { |
| 240 | max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count; |
| 241 | } |
| 242 | for (uint32_t t = 2; t <= max_cores; t++) { |
| 243 | benchmark->Arg(t); |
| 244 | } |
| 245 | |
| 246 | // All cores (if more than one cluster). |
| 247 | if (cpuinfo_get_cores_count() > max_cores) { |
| 248 | benchmark->Arg(cpuinfo_get_cores_count()); |
| 249 | } |
| 250 | |
| 251 | // All cores + hyperthreads (only if hyperthreading supported). |
| 252 | if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) { |
| 253 | benchmark->Arg(cpuinfo_get_processors_count()); |
| 254 | } |
| 255 | } |
| 256 | } |
| 257 | |
Marat Dukhan | c8466f5 | 2019-11-25 18:01:10 -0800 | [diff] [blame] | 258 | |
| 259 | bool CheckNEON(benchmark::State& state) { |
| 260 | if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) { |
| 261 | state.SkipWithError("no NEON extension"); |
| 262 | return false; |
| 263 | } |
| 264 | return true; |
| 265 | } |
| 266 | |
| 267 | bool CheckNEONFMA(benchmark::State& state) { |
| 268 | if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) { |
| 269 | state.SkipWithError("no NEON-FMA extension"); |
| 270 | return false; |
| 271 | } |
| 272 | return true; |
| 273 | } |
| 274 | |
| 275 | bool CheckSSE41(benchmark::State& state) { |
| 276 | if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) { |
| 277 | state.SkipWithError("no SSE4.1 extension"); |
| 278 | return false; |
| 279 | } |
| 280 | return true; |
| 281 | } |
| 282 | |
| 283 | bool CheckAVX(benchmark::State& state) { |
| 284 | if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) { |
| 285 | state.SkipWithError("no AVX extension"); |
| 286 | return false; |
| 287 | } |
| 288 | return true; |
| 289 | } |
| 290 | |
| 291 | bool CheckFMA3(benchmark::State& state) { |
| 292 | if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) { |
| 293 | state.SkipWithError("no FMA3 extension"); |
| 294 | return false; |
| 295 | } |
| 296 | return true; |
| 297 | } |
| 298 | |
| 299 | bool CheckAVX2(benchmark::State& state) { |
| 300 | if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) { |
| 301 | state.SkipWithError("no AVX2 extension"); |
| 302 | return false; |
| 303 | } |
| 304 | return true; |
| 305 | } |
| 306 | |
| 307 | bool CheckAVX512F(benchmark::State& state) { |
| 308 | if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) { |
| 309 | state.SkipWithError("no AVX512F extension"); |
| 310 | return false; |
| 311 | } |
| 312 | return true; |
| 313 | } |
| 314 | |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 315 | } // namespace utils |
| 316 | } // namespace benchmark |