blob: e06554cf67b05a6efe34f1dd572a0225adb9622d [file] [log] [blame]
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cfloat>
8#include <cmath>
9#include <functional>
10#include <random>
11#include <vector>
12
13#include <benchmark/benchmark.h>
14#include "bench/dwconv.h"
15#include "bench/utils.h"
16#include <xnnpack/AlignedAllocator.h>
17#include <xnnpack/common.h>
18#include <xnnpack/dwconv.h>
19#include <xnnpack/indirection.h>
20#include <xnnpack/operator.h>
21#include <xnnpack/pack.h>
22#include <xnnpack/params-init.h>
23#include <xnnpack/params.h>
24
25
26static void DWConvBenchmark(benchmark::State& state,
27 xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,
28 xnn_init_qs8_conv_minmax_params_fn init_params,
29 uint32_t channel_tile, uint32_t primary_tile,
30 benchmark::utils::IsaCheckFunction isa_check = nullptr)
31{
32 if (isa_check && !isa_check(state)) {
33 return;
34 }
35
36 const size_t input_height = state.range(0);
37 const size_t input_width = state.range(1);
38 const size_t kernel_height = state.range(2);
39 const size_t kernel_width = state.range(3);
40 const size_t padding_height = state.range(4);
41 const size_t padding_width = state.range(5);
42 const size_t subsampling = state.range(6);
43 const size_t dilation = state.range(7);
44 const size_t channels = state.range(8);
45
46 const size_t kernel_size = kernel_height * kernel_width;
47 if (kernel_size != primary_tile) {
48 state.SkipWithError("kernel size mismatch");
49 return;
50 }
51
52 std::random_device random_device;
53 auto rng = std::mt19937(random_device());
54 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
55 auto i8rng = std::bind(
56 std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
57
58 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
59 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
60 const size_t padding_left = padding_width / 2;
61 const size_t padding_top = padding_height / 2;
62 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
63 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
64 const size_t output_size = output_height * output_width;
65 const size_t step_width = dilation == 1 ? subsampling : kernel_width;
66 const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
67
68 const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
69
70 std::vector<int8_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(int8_t));
71 std::generate(a.begin(), a.end(), std::ref(i8rng));
72 std::vector<int8_t> k(channels * kernel_height * kernel_width);
73 std::generate(k.begin(), k.end(), std::ref(i8rng));
74 std::vector<int32_t> b(channels);
75 std::generate(b.begin(), b.end(), std::ref(i32rng));
76
77 std::vector<int8_t> z(channels + XNN_EXTRA_BYTES / sizeof(int8_t));
78
79 const size_t k_elements = kernel_size * c_stride;
80 const size_t b_elements = c_stride;
81 const size_t w_size = k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t);
82 const size_t i_elements = output_height * step_height;
83 const size_t c_elements = output_size * channels;
84 const size_t num_buffers = 1 +
85 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
86 (c_elements * sizeof(int8_t) + w_size) + sizeof(void*) * i_elements);
87
88 std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
89 std::fill(w.begin(), w.end(), 0.0f);
90 struct xnn_qs8_packing_params packing_params;
91 packing_params.input_zero_point = 0;
92 xnn_pack_qs8_dwconv_ghw_w(kernel_height, kernel_width, channels, channel_tile,
93 k.data(), b.data(), w.data(), 0 /* extra bytes */, &packing_params);
94 for (size_t n = 1; n < num_buffers; n++) {
95 std::copy(w.cbegin(), w.cbegin() + w_size, w.begin() + n * w_size);
96 }
97
98 std::vector<const int8_t*> i(i_elements * num_buffers);
99 xnn_operator convolution_op = { };
100 convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
101 convolution_op.input = a.data();
102 convolution_op.input_pixel_stride = channels;
103 convolution_op.zero_buffer = z.data();
104 convolution_op.input_height = input_height;
105 convolution_op.input_width = input_width;
106 convolution_op.output_height = output_height;
107 convolution_op.output_width = output_width;
108 convolution_op.kernel_height = kernel_height;
109 convolution_op.kernel_width = kernel_width;
110 convolution_op.stride_height = subsampling;
111 convolution_op.stride_width = subsampling;
112 convolution_op.dilation_height = dilation;
113 convolution_op.dilation_width = dilation;
114 convolution_op.padding_top = padding_top;
115 convolution_op.padding_left = padding_left;
116
117 xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, 0 /* log2(sizeof(int8_t)) */);
118 for (size_t n = 1; n < num_buffers; n++) {
119 std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
120 }
121
122 std::vector<int8_t> c(c_elements * num_buffers);
123 std::fill(c.begin(), c.end(), std::nanf(""));
124
125 xnn_qs8_conv_minmax_params params;
126 init_params(&params,
127 0.5f /* scale */, 0 /* output zero point */, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
128
129 size_t buffer_index = 0;
130 for (auto _ : state) {
131 state.PauseTiming();
132 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
133 buffer_index = (buffer_index + 1) % num_buffers;
134 state.ResumeTiming();
135
136 for (size_t y = 0; y < output_height; y++) {
137 dwconv(channels, output_width,
138 i.data() + buffer_index * i_elements + step_height * y,
139 w.data() + buffer_index * w_size,
140 c.data() + buffer_index * c_elements + y * output_width * channels,
141 kernel_height * step_width * sizeof(void*), 0,
142 0, z.data(), &params);
143 }
144 }
145
146 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
147 if (cpu_frequency != 0) {
148 state.counters["cpufreq"] = cpu_frequency;
149 }
150
151 state.counters["FLOPS"] = benchmark::Counter(
152 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
153 benchmark::Counter::kIsRate);
154
155 state.counters["bytes"] = benchmark::Counter(
156 uint64_t(state.iterations()) * channels * ((output_size + input_height * input_width + kernel_size) * sizeof(int8_t) + sizeof(int32_t)),
157 benchmark::Counter::kIsRate);
158}
159
160
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON microkernels: each wrapper forwards one microkernel, the RNDNU NEON
  // params initializer, its tile sizes, and the NEON ISA check to DWConvBenchmark.

  // MUL8 variants.
  static void qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul8_ld64,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld64,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld128,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }

  // MLA8 variants.
  static void qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mla8_ld64,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld128,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }

  // MUL16 variants.
  static void qs8_dwconv_up8x9__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }
  static void qs8_dwconv_up16x9__neon_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul16,
      xnn_init_qs8_conv_minmax_rndnu_neon_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckNEON);
  }

  // Registration, in the same order as the definitions above.
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld128);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mla8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld64);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld128);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul16);
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
220
221
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 microkernels (FP32 requantization). Each wrapper forwards one
  // microkernel, its params initializer, its tile sizes, and (where one is
  // given) an ISA check to DWConvBenchmark.

  // AVX512SKX.
  static void qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx512skx_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx512_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx512_params,
      /*channel_tile=*/32, /*primary_tile=*/9,
      benchmark::utils::CheckAVX512SKX);
  }

  // AVX2.
  static void qs8_dwconv_up16x9__avx2_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16,
      xnn_init_qs8_conv_minmax_fp32_avx2_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up32x9__avx2_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16,
      xnn_init_qs8_conv_minmax_fp32_avx2_params,
      /*channel_tile=*/32, /*primary_tile=*/9,
      benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up8x9__avx2_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx2_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx2_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up16x9__avx2_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx2_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckAVX2);
  }
  static void qs8_dwconv_up32x9__avx2_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx2_params,
      /*channel_tile=*/32, /*primary_tile=*/9,
      benchmark::utils::CheckAVX2);
  }

  // XOP (uses the SSE4 params initializer).
  static void qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckXOP);
  }
  static void qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckXOP);
  }

  // AVX (uses the SSE4 params initializer).
  static void qs8_dwconv_up8x9__avx_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up16x9__avx_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up8x9__avx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckAVX);
  }
  static void qs8_dwconv_up16x9__avx_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckAVX);
  }

  // SSE4.1.
  static void qs8_dwconv_up8x9__sse41_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up16x9__sse41_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up8x9__sse41_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/8, /*primary_tile=*/9,
      benchmark::utils::CheckSSE41);
  }
  static void qs8_dwconv_up16x9__sse41_mul32(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
      xnn_init_qs8_conv_minmax_fp32_sse4_params,
      /*channel_tile=*/16, /*primary_tile=*/9,
      benchmark::utils::CheckSSE41);
  }

  // SSE2: no ISA check is passed, so DWConvBenchmark's default (nullptr) applies.
  static void qs8_dwconv_up8x9__sse2_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16,
      xnn_init_qs8_conv_minmax_fp32_sse2_params,
      /*channel_tile=*/8, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up16x9__sse2_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16,
      xnn_init_qs8_conv_minmax_fp32_sse2_params,
      /*channel_tile=*/16, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse2_params,
      /*channel_tile=*/8, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse2_params,
      /*channel_tile=*/16, /*primary_tile=*/9);
  }

  // Registration, grouped by instruction set in the same order as the definitions.
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx512skx_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx512skx_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx2_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__xop_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__xop_mul16_add16);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul32);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul32);

  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16_add16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16_add16);
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
405
406
#if XNN_ARCH_WASMSIMD
  // WebAssembly SIMD microkernels; no ISA check is passed, so DWConvBenchmark's
  // default (nullptr) applies.
  static void qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16,
      xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
      /*channel_tile=*/8, /*primary_tile=*/9);
  }
  static void qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
    DWConvBenchmark(
      state,
      xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16,
      xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
      /*channel_tile=*/16, /*primary_tile=*/9);
  }

  // Registration, in definition order.
  BENCHMARK_DWCONV(qs8_dwconv_up8x9__wasmsimd_mul16);
  BENCHMARK_DWCONV(qs8_dwconv_up16x9__wasmsimd_mul16);
#endif  // XNN_ARCH_WASMSIMD
424
425
426static void qs8_dwconv_up1x9__scalar_lrint(benchmark::State& state, const char* net) {
427 DWConvBenchmark(state,
428 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrint,
429 xnn_init_qs8_conv_minmax_fp32_scalar_lrint_params,
430 1 /* channel tile */, 9 /* primary tile */);
431}
432static void qs8_dwconv_up2x9__scalar_lrint(benchmark::State& state, const char* net) {
433 DWConvBenchmark(state,
434 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrint,
435 xnn_init_qs8_conv_minmax_fp32_scalar_lrint_params,
436 2 /* channel tile */, 9 /* primary tile */);
437}
438static void qs8_dwconv_up4x9__scalar_lrint(benchmark::State& state, const char* net) {
439 DWConvBenchmark(state,
440 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrint,
441 xnn_init_qs8_conv_minmax_fp32_scalar_lrint_params,
442 4 /* channel tile */, 9 /* primary tile */);
443}
444static void qs8_dwconv_up1x9__scalar_magic(benchmark::State& state, const char* net) {
445 DWConvBenchmark(state,
446 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_magic,
447 xnn_init_qs8_conv_minmax_fp32_scalar_magic_params,
448 1 /* channel tile */, 9 /* primary tile */);
449}
450static void qs8_dwconv_up2x9__scalar_magic(benchmark::State& state, const char* net) {
451 DWConvBenchmark(state,
452 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_magic,
453 xnn_init_qs8_conv_minmax_fp32_scalar_magic_params,
454 2 /* channel tile */, 9 /* primary tile */);
455}
456static void qs8_dwconv_up4x9__scalar_magic(benchmark::State& state, const char* net) {
457 DWConvBenchmark(state,
458 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_magic,
459 xnn_init_qs8_conv_minmax_fp32_scalar_magic_params,
460 4 /* channel tile */, 9 /* primary tile */);
461}
462
463BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_lrint);
464BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_lrint);
465BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_lrint);
466
467BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_magic);
468BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_magic);
469BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_magic);
470
471
472#ifndef XNNPACK_BENCHMARK_NO_MAIN
473BENCHMARK_MAIN();
474#endif