// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

6#include <algorithm>
7#include <cmath>
8#include <functional>
9#include <random>
10#include <vector>
11
12#include <benchmark/benchmark.h>
13#include "bench/utils.h"
14
15#include <xnnpack/AlignedAllocator.h>
16#include <xnnpack/common.h>
17#include <xnnpack/params.h>
18#include <xnnpack/params-init.h>
Marat Dukhan64287252021-09-07 16:20:03 -070019#include <xnnpack/vaddsub.h>
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070020
21
22static void qs8_vadd(
23 benchmark::State& state,
Marat Dukhan64287252021-09-07 16:20:03 -070024 xnn_qs8_vaddsub_minmax_ukernel_function vadd,
25 xnn_init_qs8_addsub_minmax_params_fn init_params,
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070026 benchmark::utils::IsaCheckFunction isa_check = nullptr)
27{
28 if (isa_check && !isa_check(state)) {
29 return;
30 }
31
32 const size_t num_elements = state.range(0);
33
34 std::random_device random_device;
35 auto rng = std::mt19937(random_device());
Marat Dukhan1ef9de82021-07-29 17:15:33 -070036 auto i8rng = std::bind(
37 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
38 std::ref(rng));
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070039
40 std::vector<int8_t, AlignedAllocator<int8_t, 64>> a(num_elements);
41 std::vector<int8_t, AlignedAllocator<int8_t, 64>> b(num_elements);
42 std::vector<int8_t, AlignedAllocator<int8_t, 64>> sum(num_elements);
43 std::generate(a.begin(), a.end(), std::ref(i8rng));
44 std::generate(b.begin(), b.end(), std::ref(i8rng));
45
Marat Dukhan64287252021-09-07 16:20:03 -070046 union xnn_qs8_addsub_minmax_params params;
Marat Dukhan83a8d2f2021-07-29 16:41:19 -070047 init_params(&params,
48 1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */,
49 0.5f /* a-output scale */, 0.75f /* b-output scale */,
50 std::numeric_limits<int8_t>::min() + 1, std::numeric_limits<int8_t>::max() - 1);
51 for (auto _ : state) {
52 vadd(num_elements * sizeof(int8_t), a.data(), b.data(), sum.data(), &params);
53 }
54
55 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
56 if (cpu_frequency != 0) {
57 state.counters["cpufreq"] = cpu_frequency;
58 }
59
60 const size_t num_elements_per_iteration = num_elements;
61 state.counters["num_elements"] =
62 benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate);
63
64 const size_t bytes_per_iteration = 3 * num_elements * sizeof(int8_t);
65 state.counters["bytes"] =
66 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
67}
68
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON `ld64` kernels; each registration is gated by CheckNEON.
  BENCHMARK_CAPTURE(qs8_vadd, neon_ld64_x8,
      xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8,
      xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, neon_ld64_x16,
      xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
      xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, neon_ld64_x24,
      xnn_qs8_vadd_minmax_ukernel__neon_ld64_x24,
      xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, neon_ld64_x32,
      xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
      xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // NEON `ld128` kernels; each registration is gated by CheckNEON.
  BENCHMARK_CAPTURE(qs8_vadd, neon_ld128_x16,
      xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16,
      xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, neon_ld128_x32,
      xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32,
      xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
108
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX512SKX kernels; gated by CheckAVX512SKX.
  BENCHMARK_CAPTURE(qs8_vadd, avx512skx_mul32_ld128_x16,
      xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
      xnn_init_qs8_add_minmax_avx512_params, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx512skx_mul32_ld128_x32,
      xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x32,
      xnn_init_qs8_add_minmax_avx512_params, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // AVX2 kernels; gated by CheckAVX2.
  BENCHMARK_CAPTURE(qs8_vadd, avx2_mul32_ld64_x8,
      xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x8,
      xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx2_mul32_ld64_x16,
      xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
      xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx2_mul32_ld64_x24,
      xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x24,
      xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx2_mul32_ld64_x32,
      xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x32,
      xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // XOP kernels; gated by CheckXOP and initialized with the sse4_mul32 parameter routine.
  BENCHMARK_CAPTURE(qs8_vadd, xop_mul32_ld32_x8,
      xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, xop_mul32_ld32_x16,
      xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x16,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, xop_mul32_ld32_x24,
      xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x24,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, xop_mul32_ld32_x32,
      xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x32,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // AVX `mul16` kernels; gated by CheckAVX and initialized with the sse4_mul16 parameter routine.
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul16_ld64_x8,
      xnn_qs8_vadd_minmax_ukernel__avx_mul16_ld64_x8,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul16_ld64_x16,
      xnn_qs8_vadd_minmax_ukernel__avx_mul16_ld64_x16,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul16_ld64_x24,
      xnn_qs8_vadd_minmax_ukernel__avx_mul16_ld64_x24,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul16_ld64_x32,
      xnn_qs8_vadd_minmax_ukernel__avx_mul16_ld64_x32,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // AVX `mul32` kernels; gated by CheckAVX and initialized with the sse4_mul32 parameter routine.
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x8,
      xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x16,
      xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x16,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x24,
      xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x24,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x32,
      xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x32,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // SSE4.1 `mul16` kernels; gated by CheckSSE41.
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul16_ld64_x8,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul16_ld64_x16,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul16_ld64_x24,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul16_ld64_x32,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32,
      xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // SSE4.1 `mul32` kernels; gated by CheckSSE41.
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x8,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x16,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x16,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x24,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x24,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x32,
      xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x32,
      xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // SSE2 kernels; no ISA check argument is passed for these registrations.
  BENCHMARK_CAPTURE(qs8_vadd, sse2_mul16_ld64_x8,
      xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse2_mul16_ld64_x16,
      xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse2_mul16_ld64_x24,
      xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, sse2_mul16_ld64_x32,
      xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
294
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD kernels; no ISA check argument is passed for these registrations.
  BENCHMARK_CAPTURE(qs8_vadd, wasmsimd_x8,
      xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, wasmsimd_x16,
      xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, wasmsimd_x24,
      xnn_qs8_vadd_minmax_ukernel__wasmsimd_x24,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vadd, wasmsimd_x32,
      xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan83a8d2f2021-07-29 16:41:19 -0700317
318BENCHMARK_CAPTURE(qs8_vadd, scalar_x1,
319 xnn_qs8_vadd_minmax_ukernel__scalar_x1,
320 xnn_init_qs8_add_minmax_scalar_params)
321 ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
322 ->UseRealTime();
323BENCHMARK_CAPTURE(qs8_vadd, scalar_x2,
324 xnn_qs8_vadd_minmax_ukernel__scalar_x2,
325 xnn_init_qs8_add_minmax_scalar_params)
326 ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
327 ->UseRealTime();
328BENCHMARK_CAPTURE(qs8_vadd, scalar_x4,
329 xnn_qs8_vadd_minmax_ukernel__scalar_x4,
330 xnn_init_qs8_add_minmax_scalar_params)
331 ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
332 ->UseRealTime();
333
334#ifndef XNNPACK_BENCHMARK_NO_MAIN
335BENCHMARK_MAIN();
336#endif