blob: b4d3dc791133790cd56ffd90233ea02a0f97da8b [file] [log] [blame]
Marat Dukhan346a9e52019-11-15 09:06:30 -08001// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cfloat>
8#include <cmath>
9#include <functional>
Marat Dukhanced880d2020-12-13 17:52:27 -080010#include <memory>
Marat Dukhan3a305212020-12-06 19:24:27 -080011#include <numeric>
Marat Dukhan346a9e52019-11-15 09:06:30 -080012#include <random>
13#include <vector>
14
Marat Dukhan3a305212020-12-06 19:24:27 -080015#include <cpuinfo.h>
16#include <pthreadpool.h>
17
Marat Dukhan346a9e52019-11-15 09:06:30 -080018#include <benchmark/benchmark.h>
19#include <fp16/fp16.h>
20
Marat Dukhan3a305212020-12-06 19:24:27 -080021#include "bench/utils.h"
Marat Dukhan346a9e52019-11-15 09:06:30 -080022#include <xnnpack/AlignedAllocator.h>
23#include <xnnpack/common.h>
24#include <xnnpack/math-stubs.h>
25
26
// Bundle of buffers handed to the ComputeError worker tasks.
struct ComputeErrorContext {
  // Values that were fed to the function under evaluation.
  const float* input;
  // Values the function under evaluation produced for those inputs.
  const float* output;
  // Destination for the per-element error that ComputeError calculates.
  float* error;
};
32
33static void ComputeError(
34 struct ComputeErrorContext* context,
35 size_t start,
36 size_t range)
37{
38 const float* input = context->input;
39 const float* output = context->output;
40 float* error = context->error;
41 for (size_t i = start; i < start + range; i++) {
42 const double input_val = input[i];
43 double output_ref = 0.0;
44 if (input_val < 0.0) {
45 const double exp_val = std::exp(input_val);
46 output_ref = exp_val / (1.0 + exp_val);
47 } else {
48 output_ref = 1.0 / (1.0 + std::exp(-input_val));
49 }
50 const double abs_error = std::abs(output_ref - double(output[i]));
51 const float output_abs = std::abs(output_ref);
52 const float output_ulp = fp32_from_bits(fp32_to_bits(output_abs) + 1) - output_abs;
53 error[i] = float(abs_error / output_ulp);
54 }
55}
56
Marat Dukhan346a9e52019-11-15 09:06:30 -080057static void SigmoidError(benchmark::State& state,
58 xnn_f32_unary_math_function sigmoid,
Marat Dukhan3a305212020-12-06 19:24:27 -080059 benchmark::utils::IsaCheckFunction isa_check = nullptr)
Marat Dukhan346a9e52019-11-15 09:06:30 -080060{
Marat Dukhan3a305212020-12-06 19:24:27 -080061 if (!cpuinfo_initialize()) {
62 state.SkipWithError("failed cpuinfo init");
63 return;
64 }
65 if (isa_check && !isa_check(state)) {
66 return;
67 }
68
Marat Dukhan346a9e52019-11-15 09:06:30 -080069 // The smallest x for which sigmoidf(x) is normalized (-0x1.5D589Ep+6f).
70 const uint32_t min_input = 0xC2AEAC4F;
71 // The largest x for which sigmoidf(x) is not 1.0f (0x1.154244p+4f).
72 const uint32_t max_input = 0x418AA122;
Marat Dukhan3a305212020-12-06 19:24:27 -080073 // Number of elements in one block of inputs/outputs.
74 // Combining multiple elements in a block reduce function call overhead.
75 const size_t block_size = 16384;
76 // Number of elements in one parallelization tile. Worker threads process this many elements in each task.
77 const size_t tile_size = 64;
Marat Dukhan346a9e52019-11-15 09:06:30 -080078
Marat Dukhan3a305212020-12-06 19:24:27 -080079 uint32_t num_threads = cpuinfo_get_cores_count();
80 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
81 // Use all cores except for the least performant cluster
82 if (cpuinfo_get_clusters_count() > 1) {
83 num_threads -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
84 }
85 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
86
87 std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool(
88 pthreadpool_create(num_threads), pthreadpool_destroy);
89
90 std::vector<float, AlignedAllocator<float, 64>> x(block_size);
91 std::vector<float, AlignedAllocator<float, 64>> y(block_size);
92 std::vector<float> ulp_error(block_size);
93 float max_ulp_error = 0.0f;
94
95 ComputeErrorContext context;
96 context.input = x.data();
97 context.output = y.data();
98 context.error = ulp_error.data();
Marat Dukhan346a9e52019-11-15 09:06:30 -080099 for (auto _ : state) {
Marat Dukhan3a305212020-12-06 19:24:27 -0800100 for (uint32_t n = min_input; int32_t(n) < 0; n -= block_size) {
101 for (uint32_t i = 0; i < block_size; i++) {
Marat Dukhan346a9e52019-11-15 09:06:30 -0800102 x[i] = fp32_from_bits(std::max<uint32_t>(n - i, 0x80000000));
103 }
104 std::fill(y.begin(), y.end(), std::nanf(""));
105
Marat Dukhan3a305212020-12-06 19:24:27 -0800106 sigmoid(block_size * sizeof(float), x.data(), y.data());
Marat Dukhan346a9e52019-11-15 09:06:30 -0800107
Marat Dukhan3a305212020-12-06 19:24:27 -0800108 pthreadpool_parallelize_1d_tile_1d(
109 threadpool.get(),
110 reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(ComputeError),
111 static_cast<void*>(&context),
112 block_size, tile_size, 0 /* flags */);
113
114 max_ulp_error = std::accumulate(ulp_error.cbegin(), ulp_error.cend(), max_ulp_error,
115 static_cast<const float& (*)(const float&, const float&)>(std::max<float>));
Marat Dukhan346a9e52019-11-15 09:06:30 -0800116 }
Marat Dukhan3a305212020-12-06 19:24:27 -0800117 for (uint32_t n = 0; n < max_input; n += block_size) {
118 for (uint32_t i = 0; i < block_size; i++) {
Marat Dukhan346a9e52019-11-15 09:06:30 -0800119 x[i] = fp32_from_bits(std::min<uint32_t>(n + i, max_input));
120 }
121 std::fill(y.begin(), y.end(), std::nanf(""));
122
Marat Dukhan3a305212020-12-06 19:24:27 -0800123 sigmoid(block_size * sizeof(float), x.data(), y.data());
Marat Dukhan346a9e52019-11-15 09:06:30 -0800124
Marat Dukhan3a305212020-12-06 19:24:27 -0800125 pthreadpool_parallelize_1d_tile_1d(
126 threadpool.get(),
127 reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(ComputeError),
128 static_cast<void*>(&context),
129 block_size, tile_size, 0 /* flags */);
130
131 max_ulp_error = std::accumulate(ulp_error.cbegin(), ulp_error.cend(), max_ulp_error,
132 static_cast<const float& (*)(const float&, const float&)>(std::max<float>));
Marat Dukhan346a9e52019-11-15 09:06:30 -0800133 }
134 }
135
136 state.counters["ULPERROR"] = benchmark::Counter(max_ulp_error);
137}
138
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Registers SigmoidError for xnn_math_f32_sigmoid__<variant> under the benchmark name
  // "SigmoidError/<variant>", guarded by isa_check, reporting time in milliseconds and
  // running exactly one iteration.
  #define SIGMOID_ERROR_CASE(variant, isa_check) \
    BENCHMARK_CAPTURE(SigmoidError, variant, xnn_math_f32_sigmoid__##variant, isa_check) \
      ->Unit(benchmark::kMillisecond) \
      ->Iterations(1)

  // Variants checked with CheckNEONFMA.
  SIGMOID_ERROR_CASE(neonfma_rr1_lut64_p2_nr2recps, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_lut64_p2_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_lut64_p2_nr2fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_lut64_p2_nr2recps, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_lut64_p2_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_lut64_p2_nr2fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_lut2048_p1_nr2recps, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_lut2048_p1_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_lut2048_p1_nr2fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_lut2048_p1_nr2recps, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_lut2048_p1_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_lut2048_p1_nr2fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_p5_nr2recps, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_p5_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr1_p5_nr2fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_p5_nr2recps, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_p5_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  SIGMOID_ERROR_CASE(neonfma_rr2_p5_nr2fma, benchmark::utils::CheckNEONFMA);

  // Variants checked with CheckNEON.
  SIGMOID_ERROR_CASE(neon_rr2_lut64_p2_nr2recps, benchmark::utils::CheckNEON);
  SIGMOID_ERROR_CASE(neon_rr2_lut2048_p1_nr2recps, benchmark::utils::CheckNEON);
  SIGMOID_ERROR_CASE(neon_rr2_p5_nr2recps, benchmark::utils::CheckNEON);

  #undef SIGMOID_ERROR_CASE
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
247
#if XNN_ARCH_ARM64
  // ARM64-only `_div` variants. They are registered without an ISA check, so SigmoidError
  // runs them with its default (null) isa_check. Each run is a single iteration reported
  // in milliseconds.
  BENCHMARK_CAPTURE(
      SigmoidError, neonfma_rr1_lut2048_p1_div, xnn_math_f32_sigmoid__neonfma_rr1_lut2048_p1_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
  BENCHMARK_CAPTURE(
      SigmoidError, neonfma_rr2_lut2048_p1_div, xnn_math_f32_sigmoid__neonfma_rr2_lut2048_p1_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
  BENCHMARK_CAPTURE(
      SigmoidError, neonfma_rr1_lut64_p2_div, xnn_math_f32_sigmoid__neonfma_rr1_lut64_p2_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
  BENCHMARK_CAPTURE(
      SigmoidError, neonfma_rr2_lut64_p2_div, xnn_math_f32_sigmoid__neonfma_rr2_lut64_p2_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
  BENCHMARK_CAPTURE(
      SigmoidError, neonfma_rr1_p5_div, xnn_math_f32_sigmoid__neonfma_rr1_p5_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
  BENCHMARK_CAPTURE(
      SigmoidError, neonfma_rr2_p5_div, xnn_math_f32_sigmoid__neonfma_rr2_p5_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
#endif  // XNN_ARCH_ARM64
274
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers SigmoidError for xnn_math_f32_sigmoid__<variant> under the benchmark name
  // "SigmoidError/<variant>", guarded by isa_check, reporting time in milliseconds and
  // running exactly one iteration.
  #define SIGMOID_ERROR_CASE(variant, isa_check) \
    BENCHMARK_CAPTURE(SigmoidError, variant, xnn_math_f32_sigmoid__##variant, isa_check) \
      ->Unit(benchmark::kMillisecond) \
      ->Iterations(1)
  // Same as SIGMOID_ERROR_CASE, but passes no ISA check, so SigmoidError uses its default.
  #define SIGMOID_ERROR_CASE_NO_ISA_CHECK(variant) \
    BENCHMARK_CAPTURE(SigmoidError, variant, xnn_math_f32_sigmoid__##variant) \
      ->Unit(benchmark::kMillisecond) \
      ->Iterations(1)

  // Variants checked with CheckAVX512F.
  SIGMOID_ERROR_CASE(avx512f_rr1_lut16_p3_perm_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut16_p3_perm_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut16_p3_perm_scalef_div, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut16_p3_perm_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut16_p3_perm_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut16_p3_perm_scalef_div, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut32_p2_perm2_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut32_p2_perm2_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut32_p2_perm2_scalef_div, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut32_p2_perm2_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut32_p2_perm2_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut32_p2_perm2_scalef_div, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut64_p2_gather_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut64_p2_gather_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_lut64_p2_gather_scalef_div, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut64_p2_gather_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut64_p2_gather_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_lut64_p2_gather_scalef_div, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_p5_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_p5_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr1_p5_scalef_div, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_p5_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_p5_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  SIGMOID_ERROR_CASE(avx512f_rr2_p5_scalef_div, benchmark::utils::CheckAVX512F);

  // Variants checked with CheckAVX2.
  SIGMOID_ERROR_CASE(avx2_rr1_lut64_p2_gather_nr1fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr1_lut64_p2_gather_nr2fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr1_lut64_p2_gather_nr2fma1adj, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr1_lut64_p2_gather_div, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr2_lut64_p2_gather_nr1fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr2_lut64_p2_gather_nr2fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr2_lut64_p2_gather_nr2fma1adj, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr2_lut64_p2_gather_div, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr1_p5_nr1fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr1_p5_nr2fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr1_p5_div, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr2_p5_nr1fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr2_p5_nr2fma, benchmark::utils::CheckAVX2);
  SIGMOID_ERROR_CASE(avx2_rr2_p5_div, benchmark::utils::CheckAVX2);

  // Variants checked with CheckAVX.
  SIGMOID_ERROR_CASE(avx_rr2_lut64_p2_div, benchmark::utils::CheckAVX);
  SIGMOID_ERROR_CASE(avx_rr2_p5_nr1, benchmark::utils::CheckAVX);
  SIGMOID_ERROR_CASE(avx_rr2_p5_nr2, benchmark::utils::CheckAVX);
  SIGMOID_ERROR_CASE(avx_rr2_p5_div, benchmark::utils::CheckAVX);

  // SSE2 variants, registered without an ISA check.
  SIGMOID_ERROR_CASE_NO_ISA_CHECK(sse2_rr2_lut64_p2_nr1);
  SIGMOID_ERROR_CASE_NO_ISA_CHECK(sse2_rr2_lut64_p2_nr2);
  SIGMOID_ERROR_CASE_NO_ISA_CHECK(sse2_rr2_lut64_p2_div);
  SIGMOID_ERROR_CASE_NO_ISA_CHECK(sse2_rr2_p5_nr1);
  SIGMOID_ERROR_CASE_NO_ISA_CHECK(sse2_rr2_p5_nr2);
  SIGMOID_ERROR_CASE_NO_ISA_CHECK(sse2_rr2_p5_div);

  #undef SIGMOID_ERROR_CASE_NO_ISA_CHECK
  #undef SIGMOID_ERROR_CASE
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
514
#if XNN_ARCH_WASMSIMD
  // WAsm SIMD variants, registered without an ISA check. Each run is a single iteration
  // reported in milliseconds.
  BENCHMARK_CAPTURE(
      SigmoidError, wasmsimd_rr2_lut64_p2_div, xnn_math_f32_sigmoid__wasmsimd_rr2_lut64_p2_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
  BENCHMARK_CAPTURE(
      SigmoidError, wasmsimd_rr2_p5_div, xnn_math_f32_sigmoid__wasmsimd_rr2_p5_div)
    ->Unit(benchmark::kMillisecond)->Iterations(1);
#endif  // XNN_ARCH_WASMSIMD
525
Marat Dukhan3a305212020-12-06 19:24:27 -0800526BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_lut64_p2_div,
527 xnn_math_f32_sigmoid__scalar_rr2_lut64_p2_div)
528 ->Unit(benchmark::kMillisecond)
529 ->Iterations(1);
530BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_lut2048_p1_div,
531 xnn_math_f32_sigmoid__scalar_rr2_lut2048_p1_div)
532 ->Unit(benchmark::kMillisecond)
533 ->Iterations(1);
534BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_p5_div,
535 xnn_math_f32_sigmoid__scalar_rr2_p5_div)
536 ->Unit(benchmark::kMillisecond)
537 ->Iterations(1);
Marat Dukhan5739f702019-12-22 19:45:09 -0800538
Marat Dukhan346a9e52019-11-15 09:06:30 -0800539#ifndef XNNPACK_BENCHMARK_NO_MAIN
540BENCHMARK_MAIN();
541#endif