// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/dwconv.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
24
25
26static void DWConvBenchmark(benchmark::State& state,
27 xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,
28 xnn_init_qs8_conv_minmax_params_fn init_params,
29 uint32_t channel_tile, uint32_t primary_tile,
30 benchmark::utils::IsaCheckFunction isa_check = nullptr)
31{
32 if (isa_check && !isa_check(state)) {
33 return;
34 }
35
36 const size_t input_height = state.range(0);
37 const size_t input_width = state.range(1);
38 const size_t kernel_height = state.range(2);
39 const size_t kernel_width = state.range(3);
40 const size_t padding_height = state.range(4);
41 const size_t padding_width = state.range(5);
42 const size_t subsampling = state.range(6);
43 const size_t dilation = state.range(7);
44 const size_t channels = state.range(8);
45
46 const size_t kernel_size = kernel_height * kernel_width;
47 if (kernel_size != primary_tile) {
48 state.SkipWithError("kernel size mismatch");
49 return;
50 }
51
52 std::random_device random_device;
53 auto rng = std::mt19937(random_device());
54 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
55 auto i8rng = std::bind(
56 std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
57
58 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
59 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
60 const size_t padding_left = padding_width / 2;
61 const size_t padding_top = padding_height / 2;
62 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
63 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
64 const size_t output_size = output_height * output_width;
65 const size_t step_width = dilation == 1 ? subsampling : kernel_width;
66 const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
67
68 const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
69
70 std::vector<int8_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(int8_t));
71 std::generate(a.begin(), a.end(), std::ref(i8rng));
72 std::vector<int8_t> k(channels * kernel_height * kernel_width);
73 std::generate(k.begin(), k.end(), std::ref(i8rng));
74 std::vector<int32_t> b(channels);
75 std::generate(b.begin(), b.end(), std::ref(i32rng));
76
77 std::vector<int8_t> z(channels + XNN_EXTRA_BYTES / sizeof(int8_t));
78
79 const size_t k_elements = kernel_size * c_stride;
80 const size_t b_elements = c_stride;
81 const size_t w_size = k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t);
82 const size_t i_elements = output_height * step_height;
83 const size_t c_elements = output_size * channels;
84 const size_t num_buffers = 1 +
85 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
86 (c_elements * sizeof(int8_t) + w_size) + sizeof(void*) * i_elements);
87
88 std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
89 std::fill(w.begin(), w.end(), 0.0f);
90 struct xnn_qs8_packing_params packing_params;
91 packing_params.input_zero_point = 0;
92 xnn_pack_qs8_dwconv_ghw_w(kernel_height, kernel_width, channels, channel_tile,
93 k.data(), b.data(), w.data(), 0 /* extra bytes */, &packing_params);
94 for (size_t n = 1; n < num_buffers; n++) {
95 std::copy(w.cbegin(), w.cbegin() + w_size, w.begin() + n * w_size);
96 }
97
98 std::vector<const int8_t*> i(i_elements * num_buffers);
99 xnn_operator convolution_op = { };
100 convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
101 convolution_op.input = a.data();
102 convolution_op.input_pixel_stride = channels;
103 convolution_op.zero_buffer = z.data();
104 convolution_op.input_height = input_height;
105 convolution_op.input_width = input_width;
106 convolution_op.output_height = output_height;
107 convolution_op.output_width = output_width;
108 convolution_op.kernel_height = kernel_height;
109 convolution_op.kernel_width = kernel_width;
110 convolution_op.stride_height = subsampling;
111 convolution_op.stride_width = subsampling;
112 convolution_op.dilation_height = dilation;
113 convolution_op.dilation_width = dilation;
114 convolution_op.padding_top = padding_top;
115 convolution_op.padding_left = padding_left;
116
117 xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, 0 /* log2(sizeof(int8_t)) */);
118 for (size_t n = 1; n < num_buffers; n++) {
119 std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
120 }
121
122 std::vector<int8_t> c(c_elements * num_buffers);
123 std::fill(c.begin(), c.end(), std::nanf(""));
124
125 xnn_qs8_conv_minmax_params params;
126 init_params(&params,
127 0.5f /* scale */, 0 /* output zero point */, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
128
129 size_t buffer_index = 0;
130 for (auto _ : state) {
131 state.PauseTiming();
132 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
133 buffer_index = (buffer_index + 1) % num_buffers;
134 state.ResumeTiming();
135
136 for (size_t y = 0; y < output_height; y++) {
137 dwconv(channels, output_width,
138 i.data() + buffer_index * i_elements + step_height * y,
139 w.data() + buffer_index * w_size,
140 c.data() + buffer_index * c_elements + y * output_width * channels,
141 kernel_height * step_width * sizeof(void*), 0,
142 0, z.data(), &params);
143 }
144 }
145
146 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
147 if (cpu_frequency != 0) {
148 state.counters["cpufreq"] = cpu_frequency;
149 }
150
151 state.counters["FLOPS"] = benchmark::Counter(
152 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
153 benchmark::Counter::kIsRate);
154
155 state.counters["bytes"] = benchmark::Counter(
156 uint64_t(state.iterations()) * channels * ((output_size + input_height * input_width + kernel_size) * sizeof(int8_t) + sizeof(int32_t)),
157 benchmark::Counter::kIsRate);
158}
159
160
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON microkernels. Every wrapper forwards one `rndnu` microkernel, the NEON parameter
  // initializer and the microkernel's tile sizes to DWConvBenchmark, gated on CheckNEON.

  // ---- 9-tap (primary tile 9) microkernels ----
  static void qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld128,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mla8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld128,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up8x9__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up24x9__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x9__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/24, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up32x9__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x9__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckNEON);
  }

  // ---- 25-tap (primary tile 25) microkernels ----
  static void qs8_dwconv_up8x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/8, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x25__neon_mul8_ld128(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld128,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up8x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/8, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x25__neon_mla8_ld128(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld128,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up8x25__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/8, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x25__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/16, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up24x25__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x25__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/24, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up32x25__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x25__neon_mul16,
                    xnn_init_qs8_conv_minmax_rndnu_neon_params,
                    /*channel_tile=*/32, /*primary_tile=*/25, benchmark::utils::CheckNEON);
  }

  // Registration, in the same order as the definitions above.
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld128);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mla8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld128);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up24x9__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld128);
  BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mla8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld128);
  BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up24x25__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up32x25__neon_mul16);
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
304
305
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 microkernels (fp32 requantization variants, per their names). Each wrapper forwards
  // one microkernel, the matching parameter initializer and the tile sizes to DWConvBenchmark.

  // ---- AVX512 SKX ----
  static void qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx512skx_mul32,
                    xnn_init_qs8_conv_minmax_fp32_avx512_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX512SKX);
  }
  static void qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32,
                    xnn_init_qs8_conv_minmax_fp32_avx512_params,
                    /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckAVX512SKX);
  }

  // ---- AVX2 ----
  static void qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpmovsx,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpmovsx,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpunpck,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpunpck,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_add16_vpunpck,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up8x9__avx2_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx2_mul32,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up16x9__avx2_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up32x9__avx2_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul32,
                    xnn_init_qs8_conv_minmax_fp32_avx2_params,
                    /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckAVX2);
  }

  // ---- XOP (uses the SSE4 parameter initializer) ----
  static void qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckXOP);
  }
  static void qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckXOP);
  }

  // ---- AVX (uses the SSE4 parameter initializer) ----
  static void qs8_dwconv_up8x9__avx_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up16x9__avx_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up8x9__avx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up16x9__avx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckAVX);
  }

  // ---- SSE4.1 ----
  static void qs8_dwconv_up8x9__sse41_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up16x9__sse41_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up8x9__sse41_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up16x9__sse41_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
                    xnn_init_qs8_conv_minmax_fp32_sse4_params,
                    /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckSSE41);
  }

  // ---- SSE2 (no ISA check is passed) ----
  static void qs8_dwconv_up8x9__sse2_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16,
                    xnn_init_qs8_conv_minmax_fp32_sse2_params,
                    /*channel_tile=*/8, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up16x9__sse2_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16,
                    xnn_init_qs8_conv_minmax_fp32_sse2_params,
                    /*channel_tile=*/16, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse2_params,
                    /*channel_tile=*/8, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16,
                    xnn_init_qs8_conv_minmax_fp32_sse2_params,
                    /*channel_tile=*/16, /*primary_tile=*/9);
  }

  // Registration, in the same order as the definitions above.
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx512skx_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx512skx_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpmovsx);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpmovsx);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpunpck);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpunpck);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx2_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__xop_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__xop_mul16_add16);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16_add16);
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
517
518
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD microkernels; no ISA check is passed to DWConvBenchmark.
  static void qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16,
                    xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
                    /*channel_tile=*/8, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16,
                    xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
                    /*channel_tile=*/16, /*primary_tile=*/9);
  }

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__wasmsimd_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__wasmsimd_mul16);
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan0744fa02021-07-26 22:56:27 -0700536
537
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm `fmagic` microkernels; they share the scalar fmagic parameter initializer and
  // no ISA check is passed to DWConvBenchmark.
  static void qs8_dwconv_up1x9__wasm_fmagic(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__wasm_fmagic,
                    xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
                    /*channel_tile=*/1, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up2x9__wasm_fmagic(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic,
                    xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
                    /*channel_tile=*/2, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up4x9__wasm_fmagic(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__wasm_fmagic,
                    xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
                    /*channel_tile=*/4, /*primary_tile=*/9);
  }

  BENCHMARK_DWCONV(qs8_dwconv_up1x9__wasm_fmagic);
  BENCHMARK_DWCONV(qs8_dwconv_up2x9__wasm_fmagic);
  BENCHMARK_DWCONV(qs8_dwconv_up4x9__wasm_fmagic);
#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
562
563
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800564static void qs8_dwconv_up1x9__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhan0744fa02021-07-26 22:56:27 -0700565 DWConvBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800566 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic,
567 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhan0744fa02021-07-26 22:56:27 -0700568 1 /* channel tile */, 9 /* primary tile */);
569}
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800570static void qs8_dwconv_up2x9__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhan0744fa02021-07-26 22:56:27 -0700571 DWConvBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800572 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_fmagic,
573 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhan0744fa02021-07-26 22:56:27 -0700574 2 /* channel tile */, 9 /* primary tile */);
575}
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800576static void qs8_dwconv_up4x9__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhan0744fa02021-07-26 22:56:27 -0700577 DWConvBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800578 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_fmagic,
579 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhan0744fa02021-07-26 22:56:27 -0700580 4 /* channel tile */, 9 /* primary tile */);
581}
582
Marat Dukhan440e8ed2022-01-04 15:30:57 -0800583static void qs8_dwconv_up1x9__scalar_imagic(benchmark::State& state, const char* net) {
584 DWConvBenchmark(state,
585 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_imagic,
586 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
587 1 /* channel tile */, 9 /* primary tile */);
588}
589static void qs8_dwconv_up2x9__scalar_imagic(benchmark::State& state, const char* net) {
590 DWConvBenchmark(state,
591 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic,
592 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
593 2 /* channel tile */, 9 /* primary tile */);
594}
595static void qs8_dwconv_up4x9__scalar_imagic(benchmark::State& state, const char* net) {
596 DWConvBenchmark(state,
597 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_imagic,
598 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
599 4 /* channel tile */, 9 /* primary tile */);
600}
601
602static void qs8_dwconv_up1x9__scalar_lrintf(benchmark::State& state, const char* net) {
603 DWConvBenchmark(state,
604 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrintf,
605 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
606 1 /* channel tile */, 9 /* primary tile */);
607}
608static void qs8_dwconv_up2x9__scalar_lrintf(benchmark::State& state, const char* net) {
609 DWConvBenchmark(state,
610 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf,
611 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
612 2 /* channel tile */, 9 /* primary tile */);
613}
614static void qs8_dwconv_up4x9__scalar_lrintf(benchmark::State& state, const char* net) {
615 DWConvBenchmark(state,
616 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrintf,
617 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
618 4 /* channel tile */, 9 /* primary tile */);
619}
Marat Dukhan0744fa02021-07-26 22:56:27 -0700620
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800621BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_fmagic);
622BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_fmagic);
623BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_fmagic);
Marat Dukhan0744fa02021-07-26 22:56:27 -0700624
Marat Dukhan440e8ed2022-01-04 15:30:57 -0800625BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_imagic);
626BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_imagic);
627BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_imagic);
628
629BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_lrintf);
630BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_lrintf);
631BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_lrintf);
632
Marat Dukhan0744fa02021-07-26 22:56:27 -0700633
634#ifndef XNNPACK_BENCHMARK_NO_MAIN
635BENCHMARK_MAIN();
636#endif