blob: c5fe76d846f67cd9b69410e841c590dde806912e [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cmath>
8#include <functional>
9#include <random>
10#include <vector>
11
12#include <benchmark/benchmark.h>
13#include "bench/utils.h"
14#include <xnnpack/AlignedAllocator.h>
15#include <xnnpack/common.h>
16#include <xnnpack/params.h>
17#include <xnnpack/raddstoreexpminusmax.h>
18#include <xnnpack/rmax.h>
19
20
21static void f32_raddstoreexpminusmax(
22 benchmark::State& state,
23 xnn_f32_rmax_ukernel_function rmax,
24 xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
25 benchmark::utils::IsaCheckFunction isa_check = nullptr)
26{
27 if (isa_check && !isa_check(state)) {
28 return;
29 }
30
Marat Dukhand713e8a2020-12-04 14:23:12 -080031 const size_t elements = state.range(0);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080032 const size_t cache_line_size_max = 128;
Marat Dukhand713e8a2020-12-04 14:23:12 -080033 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
Marat Dukhan4c4eb002019-12-08 21:27:49 -080034
35 std::random_device random_device;
36 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070037 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
Marat Dukhan4c4eb002019-12-08 21:27:49 -080038
39 const size_t num_buffers = 1 +
Marat Dukhand713e8a2020-12-04 14:23:12 -080040 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
41 std::vector<float, AlignedAllocator<float, 64>> x(elements);
42 std::vector<float, AlignedAllocator<float, 64>> y(packed_elements * num_buffers);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080043
44 std::generate(x.begin(), x.end(), std::ref(f32rng));
45
46 benchmark::utils::DisableDenormals();
47
48 size_t buffer_index = 0;
49 for (auto _ : state) {
50 state.PauseTiming();
51 float x_max = nanf("");
Marat Dukhand713e8a2020-12-04 14:23:12 -080052 rmax(elements * sizeof(float), x.data(), &x_max);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080053 if (++buffer_index == num_buffers) {
54 buffer_index = 0;
55 }
56 state.ResumeTiming();
57
58 float y_sum = nanf("");
Marat Dukhand713e8a2020-12-04 14:23:12 -080059 raddstoreexpminusmax(elements * sizeof(float), x.data(), y.data() + buffer_index * packed_elements, &y_sum, x_max);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080060 }
61
Marat Dukhand713e8a2020-12-04 14:23:12 -080062 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
63 if (cpu_frequency != 0) {
64 state.counters["cpufreq"] = cpu_frequency;
65 }
66
67 const size_t elements_per_iteration = elements;
Marat Dukhan4c4eb002019-12-08 21:27:49 -080068 state.counters["elements"] =
Marat Dukhand713e8a2020-12-04 14:23:12 -080069 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
70
71 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080072 state.counters["bytes"] =
Marat Dukhand713e8a2020-12-04 14:23:12 -080073 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080074}
75
76static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
77 b->ArgName("N");
78 for (int32_t n = 10000; n <= 100000000; n *= 10) {
79 b->Arg(n);
80 }
81}
82
// ARM / ARM64 registrations. Every entry uses xnn_f32_rmax_ukernel__neon as the
// rmax kernel and is gated by CheckNEON (neon_*) or CheckNEONFMA (neonfma_*).
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // neon_p5_*
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x4,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x8,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x8,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x8_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x8_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x12,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x12,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x12_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x12_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x12_acc3,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x12_acc3,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x16,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x16,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x16_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x16_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x16_acc4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x16_acc4,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x20,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x20,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x20_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x20_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_p5_x20_acc5,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_p5_x20_acc5,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // neon_lut64_p2_*
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x4,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x8,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x8,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x8_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x8_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x12,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x12,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x12_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x12_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x12_acc3,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x12_acc3,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x16,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x16,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x16_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x16_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x16_acc4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x16_acc4,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x20,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x20,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x20_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x20_acc2,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_lut64_p2_x20_acc5,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x20_acc5,
                    benchmark::utils::CheckNEON)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // neonfma_p5_*
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x4,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x8,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x8,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x8_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x8_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x12,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x12,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x12_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x12_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x12_acc3,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x12_acc3,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x16,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x16,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x16_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x16_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x16_acc4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x16_acc4,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x20,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x20,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x20_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x20_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_p5_x20_acc5,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x20_acc5,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // neonfma_lut64_p2_*
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x4,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x8,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x8,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x8_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x8_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x12,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x12,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x12_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x12_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x12_acc3,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x12_acc3,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x16,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x16_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x16_acc4,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16_acc4,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x20,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x20,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x20_acc2,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x20_acc2,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_lut64_p2_x20_acc5,
                    xnn_f32_rmax_ukernel__neon, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x20_acc5,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(CharacteristicArguments)->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
280
// x86 / x86-64 registrations. The avx512f_* and avx2_* entries use
// xnn_f32_rmax_ukernel__avx and are gated by CheckAVX512F / CheckAVX2; the
// sse2_* entries use xnn_f32_rmax_ukernel__sse and pass no ISA check.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // avx512f_p5_scalef_*
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x128,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x128,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x128_acc2,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x128_acc4,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();

  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x144,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x144,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x144_acc3,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x144_acc3,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();

  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x160,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x160,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x160_acc2,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x160_acc2,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x160_acc5,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x160_acc5,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();

  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x192,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x192_acc2,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc2,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x192_acc3,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc3,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_p5_scalef_x192_acc6,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc6,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // avx2_p5_*
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x64,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x64_acc2,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x64_acc4,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc4,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x72,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x72,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x72_acc3,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x72_acc3,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x80,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x80,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x80_acc2,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x80_acc2,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x80_acc5,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x80_acc5,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x96,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x96,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x96_acc2,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x96_acc2,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x96_acc3,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x96_acc3,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_p5_x96_acc6,
                    xnn_f32_rmax_ukernel__avx, xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x96_acc6,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // sse2_p5_* (no ISA check argument)
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x4,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x8,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x8_acc2,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x12,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x12_acc2,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x12_acc3,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc3)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x16,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x16_acc2,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x16_acc4,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc4)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x20,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x20_acc2,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_p5_x20_acc5,
                    xnn_f32_rmax_ukernel__sse, xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5)
    ->Apply(CharacteristicArguments)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
423
// WebAssembly SIMD registrations. Every entry uses
// xnn_f32_rmax_ukernel__wasmsimd_arm as the rmax kernel and passes no ISA check.
#if XNN_ARCH_WASMSIMD
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x4,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x4)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x8,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x8)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x8_acc2,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x8_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x12,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x12)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x12_acc2,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x12_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x12_acc3,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x12_acc3)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x16,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x16)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x16_acc2,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x16_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x16_acc4,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x16_acc4)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x20,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x20)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x20_acc2,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x20_acc2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x20_acc5,
                    xnn_f32_rmax_ukernel__wasmsimd_arm, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x20_acc5)
    ->Apply(CharacteristicArguments)->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD
462
Marat Dukhanf46f6752020-01-21 11:03:49 -0800463BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x1,
464 xnn_f32_rmax_ukernel__scalar,
465 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x1)->Apply(CharacteristicArguments)->UseRealTime();
466BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x2,
467 xnn_f32_rmax_ukernel__scalar,
468 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2)->Apply(CharacteristicArguments)->UseRealTime();
469BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x2_acc2,
470 xnn_f32_rmax_ukernel__scalar,
471 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2_acc2)->Apply(CharacteristicArguments)->UseRealTime();
472BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x4,
473 xnn_f32_rmax_ukernel__scalar,
474 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4)->Apply(CharacteristicArguments)->UseRealTime();
475BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x4_acc2,
476 xnn_f32_rmax_ukernel__scalar,
477 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc2)->Apply(CharacteristicArguments)->UseRealTime();
478BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_lut64_p2_x4_acc4,
479 xnn_f32_rmax_ukernel__scalar,
480 xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc4)->Apply(CharacteristicArguments)->UseRealTime();
481
482BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x1,
483 xnn_f32_rmax_ukernel__scalar,
484 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x1)->Apply(CharacteristicArguments)->UseRealTime();
485BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x2,
486 xnn_f32_rmax_ukernel__scalar,
487 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2)->Apply(CharacteristicArguments)->UseRealTime();
488BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x2_acc2,
489 xnn_f32_rmax_ukernel__scalar,
490 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2_acc2)->Apply(CharacteristicArguments)->UseRealTime();
491BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x4,
492 xnn_f32_rmax_ukernel__scalar,
493 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4)->Apply(CharacteristicArguments)->UseRealTime();
494BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x4_acc2,
495 xnn_f32_rmax_ukernel__scalar,
496 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2)->Apply(CharacteristicArguments)->UseRealTime();
497BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_p5_x4_acc4,
498 xnn_f32_rmax_ukernel__scalar,
499 xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc4)->Apply(CharacteristicArguments)->UseRealTime();
500
Marat Dukhan4c4eb002019-12-08 21:27:49 -0800501#ifndef XNNPACK_BENCHMARK_NO_MAIN
502BENCHMARK_MAIN();
503#endif