blob: 19eab7292895195566fd6faf06dabc8ef626e400 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cmath>
8#include <functional>
9#include <random>
10#include <vector>
11
12#include <benchmark/benchmark.h>
13#include "bench/utils.h"
14#include <xnnpack/AlignedAllocator.h>
15#include <xnnpack/common.h>
16#include <xnnpack/params.h>
17#include <xnnpack/raddstoreexpminusmax.h>
18#include <xnnpack/rmax.h>
19
20
21static void f32_raddstoreexpminusmax(
22 benchmark::State& state,
23 xnn_f32_rmax_ukernel_function rmax,
24 xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
25 benchmark::utils::IsaCheckFunction isa_check = nullptr)
26{
27 if (isa_check && !isa_check(state)) {
28 return;
29 }
30
31 const size_t n = state.range(0);
32 const size_t cache_line_size_max = 128;
33 const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
34
35 std::random_device random_device;
36 auto rng = std::mt19937(random_device());
37 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng);
38
39 const size_t num_buffers = 1 +
40 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
41 std::vector<float, AlignedAllocator<float, 64>> x(n);
42 std::vector<float, AlignedAllocator<float, 64>> y(packed_n * num_buffers);
43
44 std::generate(x.begin(), x.end(), std::ref(f32rng));
45
46 benchmark::utils::DisableDenormals();
47
48 size_t buffer_index = 0;
49 for (auto _ : state) {
50 state.PauseTiming();
51 float x_max = nanf("");
52 rmax(n * sizeof(float), x.data(), &x_max);
53 if (++buffer_index == num_buffers) {
54 buffer_index = 0;
55 }
56 state.ResumeTiming();
57
58 float y_sum = nanf("");
59 raddstoreexpminusmax(n * sizeof(float), x.data(), y.data() + buffer_index * packed_n, &y_sum, x_max);
60 }
61
62 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
63 state.counters["elements"] =
64 benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
65 state.counters["bytes"] =
66 benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
67}
68
69static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
70 b->ArgName("N");
71 for (int32_t n = 10000; n <= 100000000; n *= 10) {
72 b->Arg(n);
73 }
74}
75
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Registers one ARM benchmark case named "f32_raddstoreexpminusmax/<ukernel_suffix>".
  // Every case pairs xnn_f32_rmax_ukernel__neon with
  // xnn_f32_raddstoreexpminusmax_ukernel__<ukernel_suffix> and is gated by
  // benchmark::utils::<isa_check_fn>.
  #define XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(ukernel_suffix, isa_check_fn) \
    BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, ukernel_suffix, \
                      xnn_f32_rmax_ukernel__neon, \
                      xnn_f32_raddstoreexpminusmax_ukernel__##ukernel_suffix, \
                      benchmark::utils::isa_check_fn) \
      ->Apply(CharacteristicArguments)->UseRealTime()

  // NEON, degree-5 polynomial variants.
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x4, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x8, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x8_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x12, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x12_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x12_acc3, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x16, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x16_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x16_acc4, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x20, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x20_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_p5_x20_acc5, CheckNEON);

  // NEON, 64-entry lookup table + degree-2 polynomial variants.
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x4, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x8, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x8_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x12, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x12_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x12_acc3, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x16, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x16_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x16_acc4, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x20, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x20_acc2, CheckNEON);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neon_lut64_p2_x20_acc5, CheckNEON);

  // NEON+FMA, degree-5 polynomial variants.
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x4, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x8, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x8_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x12, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x12_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x12_acc3, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x16, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x16_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x16_acc4, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x20, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x20_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_p5_x20_acc5, CheckNEONFMA);

  // NEON+FMA, 64-entry lookup table + degree-2 polynomial variants.
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x4, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x8, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x8_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x12, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x12_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x12_acc3, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x16, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x16_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x16_acc4, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x20, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x20_acc2, CheckNEONFMA);
  XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX(neonfma_lut64_p2_x20_acc5, CheckNEONFMA);

  #undef XNN_BENCHMARK_ARM_RADDSTOREEXPMINUSMAX
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
273
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers one benchmark case named "f32_raddstoreexpminusmax/<ukernel_suffix>" that pairs
  // xnn_f32_rmax_ukernel__avx with xnn_f32_raddstoreexpminusmax_ukernel__<ukernel_suffix>
  // and is gated by benchmark::utils::<isa_check_fn>.
  #define XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(ukernel_suffix, isa_check_fn) \
    BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, ukernel_suffix, \
                      xnn_f32_rmax_ukernel__avx, \
                      xnn_f32_raddstoreexpminusmax_ukernel__##ukernel_suffix, \
                      benchmark::utils::isa_check_fn) \
      ->Apply(CharacteristicArguments)->UseRealTime()

  // Registers one benchmark case named "f32_raddstoreexpminusmax/<ukernel_suffix>" that pairs
  // xnn_f32_rmax_ukernel__sse with xnn_f32_raddstoreexpminusmax_ukernel__<ukernel_suffix>;
  // no ISA check is passed for these cases.
  #define XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(ukernel_suffix) \
    BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, ukernel_suffix, \
                      xnn_f32_rmax_ukernel__sse, \
                      xnn_f32_raddstoreexpminusmax_ukernel__##ukernel_suffix) \
      ->Apply(CharacteristicArguments)->UseRealTime()

  // AVX512F variants.
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x128, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x128_acc2, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x128_acc4, CheckAVX512F);

  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x144, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x144_acc3, CheckAVX512F);

  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x160, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x160_acc2, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x160_acc5, CheckAVX512F);

  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x192, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x192_acc2, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x192_acc3, CheckAVX512F);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx512f_p5_scalef_x192_acc6, CheckAVX512F);

  // AVX2 variants.
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x64, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x64_acc2, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x64_acc4, CheckAVX2);

  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x72, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x72_acc3, CheckAVX2);

  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x80, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x80_acc2, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x80_acc5, CheckAVX2);

  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x96, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x96_acc2, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x96_acc3, CheckAVX2);
  XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX(avx2_p5_x96_acc6, CheckAVX2);

  // SSE2 variants.
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x4);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x8);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x8_acc2);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x12);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x12_acc2);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x12_acc3);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x16);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x16_acc2);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x16_acc4);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x20);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x20_acc2);
  XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX(sse2_p5_x20_acc5);

  #undef XNN_BENCHMARK_SSE_RADDSTOREEXPMINUSMAX
  #undef XNN_BENCHMARK_AVX_RADDSTOREEXPMINUSMAX
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
416
Marat Dukhanb39689d2020-01-24 13:32:20 -0800417#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
418 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x4,
419 xnn_f32_rmax_ukernel__psimd,
420 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4)->Apply(CharacteristicArguments)->UseRealTime();
421 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x8,
422 xnn_f32_rmax_ukernel__psimd,
423 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8)->Apply(CharacteristicArguments)->UseRealTime();
424 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x8_acc2,
425 xnn_f32_rmax_ukernel__psimd,
426 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2)->Apply(CharacteristicArguments)->UseRealTime();
427 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x12,
428 xnn_f32_rmax_ukernel__psimd,
429 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12)->Apply(CharacteristicArguments)->UseRealTime();
430 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x12_acc2,
431 xnn_f32_rmax_ukernel__psimd,
432 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2)->Apply(CharacteristicArguments)->UseRealTime();
433 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x12_acc3,
434 xnn_f32_rmax_ukernel__psimd,
435 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3)->Apply(CharacteristicArguments)->UseRealTime();
436 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x16,
437 xnn_f32_rmax_ukernel__psimd,
438 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16)->Apply(CharacteristicArguments)->UseRealTime();
439 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x16_acc2,
440 xnn_f32_rmax_ukernel__psimd,
441 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2)->Apply(CharacteristicArguments)->UseRealTime();
442 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x16_acc4,
443 xnn_f32_rmax_ukernel__psimd,
444 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4)->Apply(CharacteristicArguments)->UseRealTime();
445 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x20,
446 xnn_f32_rmax_ukernel__psimd,
447 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20)->Apply(CharacteristicArguments)->UseRealTime();
448 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x20_acc2,
449 xnn_f32_rmax_ukernel__psimd,
450 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2)->Apply(CharacteristicArguments)->UseRealTime();
451 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x20_acc5,
452 xnn_f32_rmax_ukernel__psimd,
453 xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5)->Apply(CharacteristicArguments)->UseRealTime();
454#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
455
Marat Dukhanf46f6752020-01-21 11:03:49 -0800456BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x1,
457 xnn_f32_rmax_ukernel__scalar,
458 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x1)->Apply(CharacteristicArguments)->UseRealTime();
459BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x2,
460 xnn_f32_rmax_ukernel__scalar,
461 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2)->Apply(CharacteristicArguments)->UseRealTime();
462BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x2_acc2,
463 xnn_f32_rmax_ukernel__scalar,
464 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2_acc2)->Apply(CharacteristicArguments)->UseRealTime();
465BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x4,
466 xnn_f32_rmax_ukernel__scalar,
467 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4)->Apply(CharacteristicArguments)->UseRealTime();
468BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x4_acc2,
469 xnn_f32_rmax_ukernel__scalar,
470 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc2)->Apply(CharacteristicArguments)->UseRealTime();
471BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x4_acc4,
472 xnn_f32_rmax_ukernel__scalar,
473 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc4)->Apply(CharacteristicArguments)->UseRealTime();
474
475BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x1,
476 xnn_f32_rmax_ukernel__scalar,
477 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x1)->Apply(CharacteristicArguments)->UseRealTime();
478BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x2,
479 xnn_f32_rmax_ukernel__scalar,
480 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2)->Apply(CharacteristicArguments)->UseRealTime();
481BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x2_acc2,
482 xnn_f32_rmax_ukernel__scalar,
483 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2_acc2)->Apply(CharacteristicArguments)->UseRealTime();
484BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x4,
485 xnn_f32_rmax_ukernel__scalar,
486 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4)->Apply(CharacteristicArguments)->UseRealTime();
487BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x4_acc2,
488 xnn_f32_rmax_ukernel__scalar,
489 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2)->Apply(CharacteristicArguments)->UseRealTime();
490BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x4_acc4,
491 xnn_f32_rmax_ukernel__scalar,
492 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc4)->Apply(CharacteristicArguments)->UseRealTime();
493
Marat Dukhan4c4eb002019-12-08 21:27:49 -0800494#ifndef XNNPACK_BENCHMARK_NO_MAIN
495BENCHMARK_MAIN();
496#endif