blob: 84424df09b3d39b20e4a6d22f9f0a0e4fbcb3f9a [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
XNNPACK Teamb455b122019-09-27 18:10:33 -07006#include <pthread.h>
7#include <sched.h>
Marat Dukhan452662b2019-10-03 00:14:39 -07008#ifdef __ANDROID__
9 #include <malloc.h>
10#endif
Marat Dukhan4a4a7fa2019-10-21 13:46:14 -070011#if defined(__SSE__) || defined(__x86_64__)
12 #include <xmmintrin.h>
13#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070014
15#include <cstdio>
16#include <cstdlib>
17#include <cstring>
18
19#include <cpuinfo.h>
20
21#include "bench/utils.h"
22
23
// Scratch buffer that WipeCache() reads through; allocated by InitWipeBuffer().
static void* wipe_buffer{nullptr};
// Size of wipe_buffer, in bytes.
static size_t wipe_buffer_size{0};

// Guard passed to pthread_once so that InitWipeBuffer() runs a single time.
static pthread_once_t wipe_buffer_guard = PTHREAD_ONCE_INIT;
XNNPACK Teamb455b122019-09-27 18:10:33 -070028
Marat Dukhan42323232019-10-23 02:09:02 -070029static void InitWipeBuffer() {
XNNPACK Teamb455b122019-09-27 18:10:33 -070030 // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
31 wipe_buffer_size = 128 * 1024 * 1024;
32 if (cpuinfo_initialize()) {
Marat Dukhand62f3cc2019-10-01 12:37:52 -070033 wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
XNNPACK Teamb455b122019-09-27 18:10:33 -070034 }
35#if defined(__ANDROID__)
36 // memalign is obsolete, but it is the only option on Android until API level 17.
37 wipe_buffer = memalign(128, wipe_buffer_size);
38#else
39 (void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size);
40#endif
41 if (wipe_buffer != nullptr) {
42 memset(wipe_buffer, 0xA5, wipe_buffer_size);
43 }
44}
45
46namespace benchmark {
47namespace utils {
48
Marat Dukhan42323232019-10-23 02:09:02 -070049uint32_t PrefetchToL1(const void* ptr, size_t size) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070050 uint32_t step = 16;
51 if (cpuinfo_initialize()) {
52 step = cpuinfo_get_l1d_cache(0)->line_size;
53 }
54 const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
55 // Compute and return sum of data to prevent compiler from removing data reads.
56 uint32_t sum = 0;
57 while (size >= step) {
58 sum += uint32_t(*u8_ptr);
59 u8_ptr += step;
60 size -= step;
61 }
62 return sum;
63}
64
Marat Dukhan42323232019-10-23 02:09:02 -070065uint32_t WipeCache() {
66 pthread_once(&wipe_buffer_guard, &InitWipeBuffer);
67 return PrefetchToL1(wipe_buffer, wipe_buffer_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -070068}
69
// Sets the flush-denormals control bits in the floating-point control register
// of the calling thread. Compiles to a no-op on architectures other than
// x86/x86-64 (SSE), 32-bit ARM with hardware FP, and AArch64.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  // MXCSR bit 15 is flush-to-zero, bit 6 is denormals-are-zero.
  const unsigned int kFlushToZero = 0x8000;
  const unsigned int kDenormalsAreZero = 0x0040;
  const unsigned int mxcsr = _mm_getcsr();
  _mm_setcsr(mxcsr | kFlushToZero | kDenormalsAreZero);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  // Read-modify-write of FPSCR: set bit 24 (0x1000000).
  uint32_t fpscr;
  __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORR %[fpscr], #0x1000000\n"
      "VMSR fpscr, %[fpscr]\n"
    : [fpscr] "=r" (fpscr));
#elif defined(__aarch64__)
  // Read-modify-write of FPCR: set bit 24 (0x1000000) and bit 19 (0x80000).
  uint64_t fpcr;
  __asm__ __volatile__(
      "MRS %[fpcr], fpcr\n"
      "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
      "ORR %w[fpcr], %w[fpcr], 0x80000\n"
      "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (fpcr));
#endif
}
90
// Returns the current clock rate, in Hz, of the CPU the calling thread is
// running on, or 0 when it cannot be determined.
//
// On Linux the value is read from the scaling_cur_freq cpufreq sysfs node of
// the CPU reported by sched_getcpu() and multiplied by 1000. On other
// platforms the function always returns 0.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu >= 0) {
    char cpuinfo_name[512];
    snprintf(cpuinfo_name, sizeof(cpuinfo_name),
      "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

    FILE* f = fopen(cpuinfo_name, "r");
    if (f != nullptr) {
      long long khz = 0;
      // fscanf returns EOF (non-zero) on an empty or unreadable file, so test
      // for exactly one converted field instead of plain truthiness.
      const bool parsed = fscanf(f, "%lld", &khz) == 1;
      fclose(f);
      // Reject non-positive values: casting them to uint64_t would yield a
      // meaningless frequency.
      if (parsed && khz > 0) {
        return uint64_t(khz) * 1000;
      }
    }
  }
#endif  // __linux__
  return 0;
}
111
Marat Dukhand62f3cc2019-10-01 12:37:52 -0700112size_t GetMaxCacheSize() {
113 if (!cpuinfo_initialize()) {
114 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
115 // DynamIQ max: 4 MB
116 return 4 * 1024 * 1024;
117 #else
118 // Intel eDRAM max: 128 MB
119 return 128 * 1024 * 1024;
120 #endif
121 }
122 const cpuinfo_processor* processor = cpuinfo_get_processor(0);
123 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
124 // There is no precise way to detect cache size on ARM/ARM64, and cache size reported by cpuinfo
125 // may underestimate the actual cache size. Thus, we use microarchitecture-specific maximum.
126 switch (processor->core->uarch) {
127 case cpuinfo_uarch_xscale:
128 case cpuinfo_uarch_arm11:
129 case cpuinfo_uarch_scorpion:
130 case cpuinfo_uarch_krait:
131 case cpuinfo_uarch_kryo:
Marat Dukhan1f5d9bc2020-01-02 09:11:16 -0800132 case cpuinfo_uarch_exynos_m1:
133 case cpuinfo_uarch_exynos_m2:
134 case cpuinfo_uarch_exynos_m3:
Marat Dukhand62f3cc2019-10-01 12:37:52 -0700135 // cpuinfo-detected cache size always correct.
136 break;
137 case cpuinfo_uarch_cortex_a5:
138 // Max observed (NXP Vybrid SoC)
139 return 512 * 1024;
140 case cpuinfo_uarch_cortex_a7:
141 // Cortex-A7 MPCore Technical Reference Manual:
142 // 7.1. About the L2 Memory system
143 // The L2 memory system consists of an:
144 // - Optional tightly-coupled L2 cache that includes:
145 // - Configurable L2 cache size of 128KB, 256KB, 512KB, and 1MB.
146 return 1024 * 1024;
147 case cpuinfo_uarch_cortex_a8:
148 // Cortex-A8 Technical Reference Manual:
149 // 8.1. About the L2 memory system
150 // The key features of the L2 memory system include:
151 // - configurable cache size of 0KB, 128KB, 256KB, 512KB, and 1MB
152 return 1024 * 1024;
153 case cpuinfo_uarch_cortex_a9:
154 // Max observed (e.g. Exynos 4212)
155 return 1024 * 1024;
156 case cpuinfo_uarch_cortex_a12:
157 case cpuinfo_uarch_cortex_a17:
158 // ARM Cortex-A17 MPCore Processor Technical Reference Manual:
159 // 7.1. About the L2 Memory system
160 // The key features of the L2 memory system include:
161 // - An integrated L2 cache:
162 // - The cache size is implemented as either 256KB, 512KB, 1MB, 2MB, 4MB or 8MB.
163 return 8 * 1024 * 1024;
164 case cpuinfo_uarch_cortex_a15:
165 // ARM Cortex-A15 MPCore Processor Technical Reference Manual:
166 // 7.1. About the L2 memory system
167 // The features of the L2 memory system include:
168 // - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
169 return 4 * 1024 * 1024;
170 case cpuinfo_uarch_cortex_a35:
171 // ARM Cortex‑A35 Processor Technical Reference Manual:
172 // 7.1 About the L2 memory system
173 // L2 cache
174 // - Further features of the L2 cache are:
175 // - Configurable size of 128KB, 256KB, 512KB, and 1MB.
176 return 1024 * 1024;
177 case cpuinfo_uarch_cortex_a53:
178 // ARM Cortex-A53 MPCore Processor Technical Reference Manual:
179 // 7.1. About the L2 memory system
180 // The L2 memory system consists of an:
181 // - Optional tightly-coupled L2 cache that includes:
182 // - Configurable L2 cache size of 128KB, 256KB, 512KB, 1MB and 2MB.
183 return 2 * 1024 * 1024;
184 case cpuinfo_uarch_cortex_a57:
185 // ARM Cortex-A57 MPCore Processor Technical Reference Manual:
186 // 7.1 About the L2 memory system
187 // The features of the L2 memory system include:
188 // - Configurable L2 cache size of 512KB, 1MB, and 2MB.
189 return 2 * 1024 * 1024;
190 case cpuinfo_uarch_cortex_a72:
191 // ARM Cortex-A72 MPCore Processor Technical Reference Manual:
192 // 7.1 About the L2 memory system
193 // The features of the L2 memory system include:
194 // - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
195 return 4 * 1024 * 1024;
196 case cpuinfo_uarch_cortex_a73:
197 // ARM Cortex‑A73 MPCore Processor Technical Reference Manual
198 // 7.1 About the L2 memory system
199 // The L2 memory system consists of:
200 // - A tightly-integrated L2 cache with:
201 // - A configurable size of 256KB, 512KB, 1MB, 2MB, 4MB, or 8MB.
202 return 8 * 1024 * 1024;
203 default:
204 // ARM DynamIQ Shared Unit Technical Reference Manual
205 // 1.3 Implementation options
206 // L3_CACHE_SIZE
207 // - 256KB
208 // - 512KB
209 // - 1024KB
210 // - 1536KB
211 // - 2048KB
212 // - 3072KB
213 // - 4096KB
214 return 4 * 1024 * 1024;
215 }
216 #endif
217 if (processor->cache.l4 != NULL) {
218 return processor->cache.l4->size;
219 } else if (processor->cache.l3 != NULL) {
220 return processor->cache.l3->size;
221 } else if (processor->cache.l2 != NULL) {
222 return processor->cache.l2->size;
223 } else if (processor->cache.l1d != NULL) {
224 return processor->cache.l1d->size;
225 } else {
226 return 0;
227 }
228}
229
Marat Dukhanbad48fe2019-11-04 10:35:22 -0800230void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
231 benchmark->ArgName("T");
232
233 // Disabled thread pool (execution on the caller thread only).
234 benchmark->Arg(1);
235
236 if (cpuinfo_initialize()) {
237 // All cores except the little ones.
238 uint32_t max_cores = cpuinfo_get_cores_count();
239 if (cpuinfo_get_clusters_count() > 1) {
240 max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
241 }
242 for (uint32_t t = 2; t <= max_cores; t++) {
243 benchmark->Arg(t);
244 }
245
246 // All cores (if more than one cluster).
247 if (cpuinfo_get_cores_count() > max_cores) {
248 benchmark->Arg(cpuinfo_get_cores_count());
249 }
250
251 // All cores + hyperthreads (only if hyperthreading supported).
252 if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) {
253 benchmark->Arg(cpuinfo_get_processors_count());
254 }
255 }
256}
257
Marat Dukhanc8466f52019-11-25 18:01:10 -0800258
259bool CheckNEON(benchmark::State& state) {
260 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
261 state.SkipWithError("no NEON extension");
262 return false;
263 }
264 return true;
265}
266
267bool CheckNEONFMA(benchmark::State& state) {
268 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) {
269 state.SkipWithError("no NEON-FMA extension");
270 return false;
271 }
272 return true;
273}
274
275bool CheckSSE41(benchmark::State& state) {
276 if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
277 state.SkipWithError("no SSE4.1 extension");
278 return false;
279 }
280 return true;
281}
282
283bool CheckAVX(benchmark::State& state) {
284 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) {
285 state.SkipWithError("no AVX extension");
286 return false;
287 }
288 return true;
289}
290
291bool CheckFMA3(benchmark::State& state) {
292 if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) {
293 state.SkipWithError("no FMA3 extension");
294 return false;
295 }
296 return true;
297}
298
299bool CheckAVX2(benchmark::State& state) {
300 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) {
301 state.SkipWithError("no AVX2 extension");
302 return false;
303 }
304 return true;
305}
306
307bool CheckAVX512F(benchmark::State& state) {
308 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) {
309 state.SkipWithError("no AVX512F extension");
310 return false;
311 }
312 return true;
313}
314
XNNPACK Teamb455b122019-09-27 18:10:33 -0700315} // namespace utils
316} // namespace benchmark