blob: d4093e5aefa419a85bf6e112cce3f53eb99f0184 [file] [log] [blame]
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cmath>
8#include <functional>
9#include <random>
10#include <vector>
11
12#include <benchmark/benchmark.h>
13#include "bench/utils.h"
14
15#include <xnnpack/AlignedAllocator.h>
16#include <xnnpack/common.h>
17#include <xnnpack/params.h>
18#include <xnnpack/params-init.h>
Marat Dukhan64287252021-09-07 16:20:03 -070019#include <xnnpack/vaddsub.h>
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070020
21
22static void qs8_vaddc(
23 benchmark::State& state,
Marat Dukhan64287252021-09-07 16:20:03 -070024 xnn_qs8_vaddsub_minmax_ukernel_function vaddc,
25 xnn_init_qs8_addsub_minmax_params_fn init_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070026 benchmark::utils::IsaCheckFunction isa_check = nullptr)
27{
28 if (isa_check && !isa_check(state)) {
29 return;
30 }
31
32 const size_t num_elements = state.range(0);
33
34 std::random_device random_device;
35 auto rng = std::mt19937(random_device());
Marat Dukhan1ef9de82021-07-29 17:15:33 -070036 auto i8rng = std::bind(
37 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
38 std::ref(rng));
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070039
40 std::vector<int8_t, AlignedAllocator<int8_t, 64>> a(num_elements);
41 std::vector<int8_t, AlignedAllocator<int8_t, 64>> sum(num_elements);
42 std::generate(a.begin(), a.end(), std::ref(i8rng));
43 const int8_t b = i8rng();
44
Marat Dukhan64287252021-09-07 16:20:03 -070045 union xnn_qs8_addsub_minmax_params params;
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070046 init_params(&params,
47 1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */,
48 0.5f /* a-output scale */, 0.75f /* b-output scale */,
49 std::numeric_limits<int8_t>::min() + 1, std::numeric_limits<int8_t>::max() - 1);
50 for (auto _ : state) {
51 vaddc(num_elements * sizeof(int8_t), a.data(), &b, sum.data(), &params);
52 }
53
54 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
55 if (cpu_frequency != 0) {
56 state.counters["cpufreq"] = cpu_frequency;
57 }
58
59 const size_t num_elements_per_iteration = num_elements;
60 state.counters["num_elements"] =
61 benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate);
62
63 const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t);
64 state.counters["bytes"] =
65 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
66}
67
68#if XNN_ARCH_ARM || XNN_ARCH_ARM64
// NEON ld64 kernels (x8, x16, x24, x32 variants). Every case uses the NEON
// params initializer and passes benchmark::utils::CheckNEON as the ISA check,
// so qs8_vaddc returns early when that check fails. Benchmark arguments come
// from benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>.
69 BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x8,
70 xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
71 xnn_init_qs8_add_minmax_neon_params,
72 benchmark::utils::CheckNEON)
73 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
74 ->UseRealTime();
75 BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x16,
76 xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
77 xnn_init_qs8_add_minmax_neon_params,
78 benchmark::utils::CheckNEON)
79 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
80 ->UseRealTime();
81 BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x24,
82 xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x24,
83 xnn_init_qs8_add_minmax_neon_params,
84 benchmark::utils::CheckNEON)
85 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
86 ->UseRealTime();
87 BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x32,
88 xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
89 xnn_init_qs8_add_minmax_neon_params,
90 benchmark::utils::CheckNEON)
91 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
92 ->UseRealTime();
Marat Dukhaneb3cff32021-07-30 11:35:27 -070093
// NEON ld128 kernels (x16 and x32 variants), registered with the same NEON
// params initializer and CheckNEON ISA check as the ld64 cases.
94 BENCHMARK_CAPTURE(qs8_vaddc, neon_ld128_x16,
95 xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16,
96 xnn_init_qs8_add_minmax_neon_params,
97 benchmark::utils::CheckNEON)
98 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
99 ->UseRealTime();
100 BENCHMARK_CAPTURE(qs8_vaddc, neon_ld128_x32,
101 xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32,
102 xnn_init_qs8_add_minmax_neon_params,
103 benchmark::utils::CheckNEON)
104 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
105 ->UseRealTime();
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700106#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
107
108#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// AVX512SKX mul32/ld128 kernels (x16 and x32 variants). Both cases use the
// avx512 params initializer and are gated by benchmark::utils::CheckAVX512SKX.
109 BENCHMARK_CAPTURE(qs8_vaddc, avx512skx_mul32_ld128_x16,
110 xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
111 xnn_init_qs8_add_minmax_avx512_params,
112 benchmark::utils::CheckAVX512SKX)
113 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
114 ->UseRealTime();
115 BENCHMARK_CAPTURE(qs8_vaddc, avx512skx_mul32_ld128_x32,
116 xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x32,
117 xnn_init_qs8_add_minmax_avx512_params,
118 benchmark::utils::CheckAVX512SKX)
119 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
120 ->UseRealTime();
121
// AVX2 mul32/ld64 kernels (x8, x16, x24, x32 variants). All cases use the avx2
// params initializer and are gated by benchmark::utils::CheckAVX2.
122 BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x8,
123 xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x8,
124 xnn_init_qs8_add_minmax_avx2_params,
125 benchmark::utils::CheckAVX2)
126 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
127 ->UseRealTime();
128 BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x16,
129 xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
130 xnn_init_qs8_add_minmax_avx2_params,
131 benchmark::utils::CheckAVX2)
132 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
133 ->UseRealTime();
134 BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x24,
135 xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x24,
136 xnn_init_qs8_add_minmax_avx2_params,
137 benchmark::utils::CheckAVX2)
138 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
139 ->UseRealTime();
140 BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x32,
141 xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x32,
142 xnn_init_qs8_add_minmax_avx2_params,
143 benchmark::utils::CheckAVX2)
144 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
145 ->UseRealTime();
146
// XOP mul32/ld32 kernels (x8, x16, x24, x32 variants). These cases are paired
// with the sse4_mul32 params initializer and gated by
// benchmark::utils::CheckXOP.
147 BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x8,
148 xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
149 xnn_init_qs8_add_minmax_sse4_mul32_params,
150 benchmark::utils::CheckXOP)
151 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
152 ->UseRealTime();
153 BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x16,
154 xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x16,
155 xnn_init_qs8_add_minmax_sse4_mul32_params,
156 benchmark::utils::CheckXOP)
157 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
158 ->UseRealTime();
159 BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x24,
160 xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x24,
161 xnn_init_qs8_add_minmax_sse4_mul32_params,
162 benchmark::utils::CheckXOP)
163 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
164 ->UseRealTime();
165 BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x32,
166 xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x32,
167 xnn_init_qs8_add_minmax_sse4_mul32_params,
168 benchmark::utils::CheckXOP)
169 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
170 ->UseRealTime();
171
// AVX mul16/ld64 kernels (x8, x16, x24, x32 variants). These cases are paired
// with the sse4_mul16 params initializer and gated by
// benchmark::utils::CheckAVX.
172 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x8,
173 xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x8,
174 xnn_init_qs8_add_minmax_sse4_mul16_params,
175 benchmark::utils::CheckAVX)
176 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
177 ->UseRealTime();
178 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x16,
179 xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x16,
180 xnn_init_qs8_add_minmax_sse4_mul16_params,
181 benchmark::utils::CheckAVX)
182 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
183 ->UseRealTime();
184 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x24,
185 xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x24,
186 xnn_init_qs8_add_minmax_sse4_mul16_params,
187 benchmark::utils::CheckAVX)
188 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
189 ->UseRealTime();
190 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x32,
191 xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x32,
192 xnn_init_qs8_add_minmax_sse4_mul16_params,
193 benchmark::utils::CheckAVX)
194 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
195 ->UseRealTime();
196
// AVX mul32/ld32 kernels (x8, x16, x24, x32 variants). These cases are paired
// with the sse4_mul32 params initializer and gated by
// benchmark::utils::CheckAVX.
197 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x8,
198 xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700199 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700200 benchmark::utils::CheckAVX)
201 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
202 ->UseRealTime();
203 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x16,
204 xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x16,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700205 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700206 benchmark::utils::CheckAVX)
207 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
208 ->UseRealTime();
209 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x24,
210 xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x24,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700211 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700212 benchmark::utils::CheckAVX)
213 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
214 ->UseRealTime();
215 BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x32,
216 xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x32,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700217 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700218 benchmark::utils::CheckAVX)
219 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
220 ->UseRealTime();
221
// SSE4.1 mul16/ld64 kernels (x8, x16, x24, x32 variants). All cases use the
// sse4_mul16 params initializer and are gated by benchmark::utils::CheckSSE41.
222 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x8,
223 xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
224 xnn_init_qs8_add_minmax_sse4_mul16_params,
225 benchmark::utils::CheckSSE41)
226 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
227 ->UseRealTime();
228 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x16,
229 xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16,
230 xnn_init_qs8_add_minmax_sse4_mul16_params,
231 benchmark::utils::CheckSSE41)
232 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
233 ->UseRealTime();
234 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x24,
235 xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24,
236 xnn_init_qs8_add_minmax_sse4_mul16_params,
237 benchmark::utils::CheckSSE41)
238 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
239 ->UseRealTime();
240 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x32,
241 xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32,
242 xnn_init_qs8_add_minmax_sse4_mul16_params,
243 benchmark::utils::CheckSSE41)
244 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
245 ->UseRealTime();
246
// SSE4.1 mul32/ld32 kernels (x8, x16, x24, x32 variants). All cases use the
// sse4_mul32 params initializer and are gated by benchmark::utils::CheckSSE41.
247 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x8,
248 xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x8,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700249 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700250 benchmark::utils::CheckSSE41)
251 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
252 ->UseRealTime();
253 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x16,
254 xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x16,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700255 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700256 benchmark::utils::CheckSSE41)
257 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
258 ->UseRealTime();
259 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x24,
260 xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x24,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700261 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700262 benchmark::utils::CheckSSE41)
263 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
264 ->UseRealTime();
265 BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x32,
266 xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x32,
Marat Dukhan1ef9de82021-07-29 17:15:33 -0700267 xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700268 benchmark::utils::CheckSSE41)
269 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
270 ->UseRealTime();
271
// SSE2 mul16/ld64 kernels (x8, x16, x24, x32 variants). No ISA check argument
// is passed, so qs8_vaddc's isa_check stays at its nullptr default and these
// cases are never skipped by it.
272 BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x8,
273 xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
274 xnn_init_qs8_add_minmax_sse2_params)
275 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
276 ->UseRealTime();
277 BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x16,
278 xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16,
279 xnn_init_qs8_add_minmax_sse2_params)
280 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
281 ->UseRealTime();
282 BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x24,
283 xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24,
284 xnn_init_qs8_add_minmax_sse2_params)
285 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
286 ->UseRealTime();
287 BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x32,
288 xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32,
289 xnn_init_qs8_add_minmax_sse2_params)
290 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
291 ->UseRealTime();
292#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
293
Marat Dukhan4c617792021-12-21 15:47:58 -0800294#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// WAsm SIMD kernels (x8, x16, x24, x32 variants), registered with the wasmsimd
// params initializer. No ISA check argument is passed; the surrounding #if
// selects these cases at compile time.
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700295 BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x8,
296 xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
297 xnn_init_qs8_add_minmax_wasmsimd_params)
298 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
299 ->UseRealTime();
300 BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x16,
301 xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16,
302 xnn_init_qs8_add_minmax_wasmsimd_params)
303 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
304 ->UseRealTime();
305 BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x24,
306 xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24,
307 xnn_init_qs8_add_minmax_wasmsimd_params)
308 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
309 ->UseRealTime();
310 BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x32,
311 xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
312 xnn_init_qs8_add_minmax_wasmsimd_params)
313 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
314 ->UseRealTime();
Marat Dukhan4c617792021-12-21 15:47:58 -0800315#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700316
// Scalar kernels (x1, x2, x4 variants). These registrations sit outside every
// architecture #if and pass no ISA check, so they are registered in all builds.
317BENCHMARK_CAPTURE(qs8_vaddc, scalar_x1,
318 xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
319 xnn_init_qs8_add_minmax_scalar_params)
320 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
321 ->UseRealTime();
322BENCHMARK_CAPTURE(qs8_vaddc, scalar_x2,
323 xnn_qs8_vaddc_minmax_ukernel__scalar_x2,
324 xnn_init_qs8_add_minmax_scalar_params)
325 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
326 ->UseRealTime();
327BENCHMARK_CAPTURE(qs8_vaddc, scalar_x4,
328 xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
329 xnn_init_qs8_add_minmax_scalar_params)
330 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
331 ->UseRealTime();
332
// Emit the Google Benchmark entry point unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN (presumably so several benchmark files can be
// linked into one binary that provides its own main -- confirm).
333#ifndef XNNPACK_BENCHMARK_NO_MAIN
334BENCHMARK_MAIN();
335#endif