// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/conv.h"
#include "bench/utils.h"
#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/igemm.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


27static void IGEMMBenchmark(benchmark::State& state,
Marat Dukhande06f492020-04-09 00:19:31 -070028 xnn_f32_igemm_minmax_ukernel_function f32_igemm,
Marat Dukhanc8466f52019-11-25 18:01:10 -080029 uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
Marat Dukhan104ae5e2021-05-24 13:41:57 -070030 xnn_init_f32_minmax_params_fn init_params,
Marat Dukhanc8466f52019-11-25 18:01:10 -080031 benchmark::utils::IsaCheckFunction isa_check = nullptr)
XNNPACK Teamb455b122019-09-27 18:10:33 -070032{
Marat Dukhanc8466f52019-11-25 18:01:10 -080033 if (isa_check && !isa_check(state)) {
34 return;
35 }
XNNPACK Teamb455b122019-09-27 18:10:33 -070036
37 const size_t input_height = state.range(0);
38 const size_t input_width = state.range(1);
39 const size_t kernel_height = state.range(2);
40 const size_t kernel_width = state.range(3);
41 const size_t kernel_size = kernel_height * kernel_width;
42 const size_t padding_height = state.range(4);
43 const size_t padding_width = state.range(5);
44 const size_t subsampling = state.range(6);
45 const size_t dilation = state.range(7);
46 const size_t group_input_channels = state.range(8);
47 const size_t group_output_channels = state.range(9);
48
49 std::random_device random_device;
50 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070051 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -070052
53 const size_t output_pixel_stride = group_output_channels;
54 const size_t input_pixel_stride = group_input_channels;
55 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
56 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
57 const size_t padding_left = padding_width / 2;
58 const size_t padding_top = padding_height / 2;
59 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
60 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
61 const size_t output_size = output_height * output_width;
62
Marat Dukhan42323232019-10-23 02:09:02 -070063 const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
64 const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
Marat Dukhanfbd67a72022-01-31 18:03:50 -080065 const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
XNNPACK Teamb455b122019-09-27 18:10:33 -070066
67 std::vector<float> a(input_height * input_width * input_pixel_stride);
68 std::generate(a.begin(), a.end(), std::ref(f32rng));
69 std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
70 std::generate(k.begin(), k.end(), std::ref(f32rng));
71 std::vector<float> b(group_output_channels);
72 std::generate(b.begin(), b.end(), std::ref(f32rng));
73
74 std::vector<float> z(group_input_channels);
75
Marat Dukhanfbd67a72022-01-31 18:03:50 -080076 const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
XNNPACK Teamb455b122019-09-27 18:10:33 -070077 const size_t i_elements = mc_stride * kernel_size;
78 const size_t c_elements = output_height * output_width * output_pixel_stride;
79 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -070080 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -070081 sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);
82
Marat Dukhane13e6392021-07-26 22:22:35 -070083 std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
XNNPACK Teamb455b122019-09-27 18:10:33 -070084 std::fill(w.begin(), w.end(), 0.0f);
85 xnn_pack_f32_conv_goki_w(
86 1 /* groups */, group_output_channels, kernel_size, group_input_channels,
Marat Dukhane06c8132021-06-03 08:59:11 -070087 nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
XNNPACK Teamb455b122019-09-27 18:10:33 -070088 for (size_t n = 1; n < num_buffers; n++) {
89 std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
90 }
91
92 std::vector<const float*> i(i_elements * num_buffers);
93 xnn_operator convolution_op = { };
94 convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
95 convolution_op.input = a.data();
96 convolution_op.input_pixel_stride = input_pixel_stride;
97 convolution_op.zero_buffer = z.data();
98 convolution_op.groups = 1;
99 convolution_op.group_input_channels = group_input_channels;
100 convolution_op.batch_size = 1;
101 convolution_op.input_height = input_height;
102 convolution_op.input_width = input_width;
103 convolution_op.output_height = output_height;
104 convolution_op.output_width = output_width;
105 convolution_op.kernel_height = kernel_height;
106 convolution_op.kernel_width = kernel_width;
107 convolution_op.stride_height = subsampling;
108 convolution_op.stride_width = subsampling;
109 convolution_op.dilation_height = dilation;
110 convolution_op.dilation_width = dilation;
111 convolution_op.padding_top = padding_top;
112 convolution_op.padding_left = padding_left;
113 xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
114 for (size_t n = 1; n < num_buffers; n++) {
115 std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
116 }
117
118 std::vector<float> c(c_elements * num_buffers);
119 std::fill(c.begin(), c.end(), std::nanf(""));
120
Marat Dukhanf56f4c42021-05-17 01:47:20 -0700121 xnn_f32_minmax_params params;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700122 init_params(&params,
123 -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Teamb455b122019-09-27 18:10:33 -0700124
125 size_t buffer_index = 0;
126 for (auto _ : state) {
127 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700128 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700129 buffer_index = (buffer_index + 1) % num_buffers;
130 state.ResumeTiming();
131
132 for (uint32_t m = 0; m < output_size; m += mr) {
133 const uint32_t mb = min(output_size - m, mr);
Zhi An Ng21888332022-02-03 13:35:48 -0800134 f32_igemm(
135 mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
136 i.data() + buffer_index * i_elements + m,
137 w.data() + buffer_index * w_elements,
138 c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
139 0, z.data(), &params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700140 }
141 }
142
Marat Dukhand713e8a2020-12-04 14:23:12 -0800143 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
144 if (cpu_frequency != 0) {
145 state.counters["cpufreq"] = cpu_frequency;
146 }
147
XNNPACK Teamb455b122019-09-27 18:10:33 -0700148 state.counters["FLOPS"] = benchmark::Counter(
149 uint64_t(state.iterations()) * 2 *
150 output_height * output_width *
151 group_input_channels * group_output_channels *
152 kernel_height * kernel_width,
153 benchmark::Counter::kIsRate);
154}
155
#if XNN_PLATFORM_JIT
// Benchmarks a JIT-generated F32 IGEMM (indirect GEMM) minmax micro-kernel on a
// single convolution problem.
//
// The problem is read from the benchmark arguments:
//   state.range(0), state.range(1)  input height, input width
//   state.range(2), state.range(3)  kernel height, kernel width
//   state.range(4), state.range(5)  total padding along height, along width
//   state.range(6)                  subsampling (used as stride in both dimensions)
//   state.range(7)                  dilation (used in both dimensions)
//   state.range(8), state.range(9)  group input channels, group output channels
//
// Parameters:
//   generator    code generator invoked once, before the timed loop, to emit
//                the micro-kernel into a freshly allocated code buffer.
//   mr           number of output pixels handed to the micro-kernel per call;
//                the indirection buffer is rounded up to a multiple of it.
//   nr, kr, sr   packing parameters forwarded to xnn_pack_f32_conv_goki_w; the
//                packed weights are sized with output channels rounded up to
//                nr and input channels rounded up to kr * sr.
//   init_params  fills the minmax params; called with (-inf, +inf) bounds.
//   isa_check    optional predicate; when it returns false the function returns
//                without running the benchmark loop.
//
// If code memory cannot be allocated or code generation fails, the benchmark is
// skipped with an error instead of calling through an invalid code pointer.
//
// Reports the "cpufreq" counter (when a non-zero frequency is available) and a
// "FLOPS" rate counter.
static void IGEMMBenchmark(benchmark::State& state,
  xnn_jit_igemm_code_generator_function generator,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // Only the top/left halves of the total padding are passed to the indirection setup.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  // Random input (a), kernel (k) and bias (b).
  std::vector<float> a(input_height * input_width * input_pixel_stride);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  // Zero-initialized buffer handed to the indirection setup and to the micro-kernel.
  std::vector<float> z(group_input_channels);

  // Per-copy element counts of the packed weights (+ bias), indirection buffer and output.
  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  // One copy more than the number of copies needed to cover the maximum cache
  // size; presumably so that rotating through the copies defeats caching of
  // weights/indirection/output between iterations.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  // Pack the weights once into the first copy, then replicate.
  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  // Build the indirection buffer once through a stack-allocated operator
  // descriptor, then replicate it.
  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer   = reinterpret_cast<const void**>(i.data());
  convolution_op.input                = a.data();
  convolution_op.input_pixel_stride   = input_pixel_stride;
  convolution_op.zero_buffer          = z.data();
  convolution_op.groups               = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size           = 1;
  convolution_op.input_height         = input_height;
  convolution_op.input_width          = input_width;
  convolution_op.output_height        = output_height;
  convolution_op.output_width         = output_width;
  convolution_op.kernel_height        = kernel_height;
  convolution_op.kernel_width         = kernel_width;
  convolution_op.stride_height        = subsampling;
  convolution_op.stride_width         = subsampling;
  convolution_op.dilation_height      = dilation;
  convolution_op.dilation_width       = dilation;
  convolution_op.padding_top          = padding_top;
  convolution_op.padding_left         = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  // Output copies start out filled with NaN.
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Infinite bounds, i.e. no effective output clamping.
  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  // The generator receives the same infinite bounds.
  jit_gemm_params jit_params = {
    .f32_minmax = {
      .min = -std::numeric_limits<float>::infinity(),
      .max = +std::numeric_limits<float>::infinity()
    }
  };

  // Emit the micro-kernel; bail out rather than jump into an unusable buffer.
  xnn_code_buffer code_buffer;
  if (xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE) != xnn_status_success) {
    state.SkipWithError("failed to allocate JIT code memory");
    return;
  }
  if (generator(&code_buffer, group_output_channels, group_input_channels * sizeof(float),
                kernel_size * mr * sizeof(void *), &jit_params) != xnn_status_success) {
    xnn_release_code_memory(&code_buffer);
    state.SkipWithError("failed to generate JIT micro-kernel");
    return;
  }
  auto f32_igemm = reinterpret_cast<xnn_f32_igemm_minmax_ukernel_function>(code_buffer.code);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Untimed: prefetch the input and advance to the next buffer copy.
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // One micro-kernel call per tile of up to mr output pixels.
    // NOTE(review): the indirection pointer advances by m entries per tile;
    // confirm this matches the layout written by xnn_indirection_init_conv2d.
    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      f32_igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }
  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Two operations per output pixel, input channel, output channel and kernel
  // element, per iteration.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
#endif  // XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
  // JIT-generated AArch64 micro-kernels. Each wrapper forwards its code
  // generator and tile parameters (mr, nr, kr, sr) to IGEMMBenchmark; the
  // `net` argument is not used by the wrappers.
  static void jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
  // JIT-generated AArch32 micro-kernels. Each wrapper forwards its code
  // generator and tile parameters (mr, nr, kr, sr) to IGEMMBenchmark; the
  // `net` argument is not used by the wrappers.
  static void jit_f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_ld64)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  // AArch32 assembly micro-kernels. Each wrapper forwards its micro-kernel and
  // tile parameters (mr, nr, kr, sr) to IGEMMBenchmark; the `net` argument is
  // not used by the wrappers.
  static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  // Micro-kernels benchmarked when AArch64 assembly is enabled. Each wrapper
  // forwards its micro-kernel and tile parameters (mr, nr, kr, sr) to
  // IGEMMBenchmark; the `net` argument is not used by the wrappers.
  static void f32_igemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x4__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__neonfma_lane_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(f32_igemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY


Frank Barchard4c3e5a92021-08-16 19:17:39 -0700534#if XNN_ARCH_ARM || XNN_ARCH_ARM64
535 static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
536 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
537 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
538 }
539 static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
540 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1,
541 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
542 }
543 static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
544 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1,
545 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
546 }
547 static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
548 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
549 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
550 }
551 static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
552 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
553 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
554 }
555 static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
556 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
557 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
558 }
559 static void f32_igemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
560 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
561 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
562 }
563 static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
564 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1,
565 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
566 }
567 static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
568 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1,
569 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
570 }
571 static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
572 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1,
573 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
574 }
575 static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
576 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1,
577 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
578 }
579 static void f32_igemm_6x8__neon_dup_ld128(benchmark::State& state, const char* net) {
580 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128, 6, 8, 1, 1,
581 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
582 }
583 static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
584 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
585 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
586 }
587 static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
588 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
589 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
590 }
591 static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
592 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
593 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
594 }
595 static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
596 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
597 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
598 }
599 static void f32_igemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
600 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
601 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
602 }
603 static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
604 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
605 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
606 }
607 static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
608 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
609 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
610 }
611 static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
612 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
613 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
614 }
615 static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
616 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
617 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
618 }
619 static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
620 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
621 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
622 }
623 static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
624 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
625 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
626 }
627 static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
628 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
629 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
630 }
631 static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
632 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
633 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
634 }
635
// Register the NEON / NEON-FMA intrinsics F32 IGEMM benchmark wrappers defined in this ARM / ARM64 section.
// NOTE(review): BENCHMARK_CONV presumably comes from bench/conv.h and instantiates each wrapper per network — confirm.
636 BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
637 BENCHMARK_CONV(f32_igemm_4x2__neon_lane_ld64)
638 BENCHMARK_CONV(f32_igemm_4x4__neon_lane_ld64)
639 BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld128)
640 BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld64)
641 BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld64)
642 BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld128)
643 BENCHMARK_CONV(f32_igemm_1x8__neon_dup_ld64)
644 BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld128)
645 BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld64)
646 BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld64)
647 BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld128)
648 BENCHMARK_CONV(f32_igemm_1x8__neonfma_dup_ld64)
649 BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld128)
650 BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld64)
651 BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld64)
652 BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld128)
653 BENCHMARK_CONV(f32_igemm_1x8s4__neon)
654 BENCHMARK_CONV(f32_igemm_4x8s4__neon)
655 BENCHMARK_CONV(f32_igemm_6x8s4__neon)
656 BENCHMARK_CONV(f32_igemm_8x8s4__neon)
657 BENCHMARK_CONV(f32_igemm_1x8s4__neonfma)
658 BENCHMARK_CONV(f32_igemm_4x8s4__neonfma)
659 BENCHMARK_CONV(f32_igemm_6x8s4__neonfma)
660 BENCHMARK_CONV(f32_igemm_8x8s4__neonfma)
661#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
662
663
Marat Dukhan1dadbf72019-10-01 10:46:20 -0700664#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700665 static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700666 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
667 xnn_init_f32_minmax_sse_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700668 }
Marat Dukhan802fcae2020-12-11 14:37:25 -0800669 static void f32_igemm_3x8__sse_load1(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700670 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
671 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800672 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700673 static void f32_igemm_4x8__sse_load1(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700674 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
675 xnn_init_f32_minmax_sse_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700676 }
Marat Dukhan802fcae2020-12-11 14:37:25 -0800677 static void f32_igemm_5x8__sse_load1(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700678 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
679 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800680 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700681
682 static void f32_igemm_1x8__sse_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700683 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
684 xnn_init_f32_minmax_sse_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700685 }
Marat Dukhan802fcae2020-12-11 14:37:25 -0800686 static void f32_igemm_3x8__sse_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700687 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
688 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800689 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700690 static void f32_igemm_4x8__sse_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700691 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
692 xnn_init_f32_minmax_sse_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700693 }
Marat Dukhan802fcae2020-12-11 14:37:25 -0800694 static void f32_igemm_5x8__sse_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700695 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
696 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800697 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700698
699 static void f32_igemm_1x8s4__sse(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700700 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
701 xnn_init_f32_minmax_sse_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700702 }
Marat Dukhan802fcae2020-12-11 14:37:25 -0800703 static void f32_igemm_3x8s4__sse(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700704 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
705 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800706 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700707 static void f32_igemm_4x8s4__sse(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700708 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
709 xnn_init_f32_minmax_sse_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700710 }
Marat Dukhan802fcae2020-12-11 14:37:25 -0800711 static void f32_igemm_5x8s4__sse(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700712 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
713 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800714 }
715
716 static void f32_igemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700717 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
718 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800719 }
720 static void f32_igemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700721 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
722 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800723 }
724 static void f32_igemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700725 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
726 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800727 }
728 static void f32_igemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700729 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
730 xnn_init_f32_minmax_sse_params);
Marat Dukhan802fcae2020-12-11 14:37:25 -0800731 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700732
Marat Dukhanfda12b82019-11-21 12:27:59 -0800733 static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700734 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
735 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800736 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800737 static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700738 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
739 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800740 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800741 static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700742 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
743 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800744 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800745 static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700746 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
747 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800748 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800749 static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700750 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
751 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800752 }
753
754 static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700755 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
756 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800757 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800758 static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700759 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
760 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800761 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800762 static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700763 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
764 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800765 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800766 static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700767 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
768 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800769 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800770 static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700771 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
772 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800773 }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800774 static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700775 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
776 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800777 }
778
Marat Dukhan0f349c42019-11-27 11:58:54 -0800779 static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700780 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
781 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
Marat Dukhan0f349c42019-11-27 11:58:54 -0800782 }
783 static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700784 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
785 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
Marat Dukhan0f349c42019-11-27 11:58:54 -0800786 }
787 static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700788 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
789 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
Marat Dukhan0f349c42019-11-27 11:58:54 -0800790 }
791 static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700792 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
793 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
Marat Dukhan0f349c42019-11-27 11:58:54 -0800794 }
795 static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700796 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
797 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
Marat Dukhan0f349c42019-11-27 11:58:54 -0800798 }
799 static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700800 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
801 xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
Marat Dukhan0f349c42019-11-27 11:58:54 -0800802 }
803
// Register the x86 / x86-64 F32 IGEMM benchmark wrappers, grouped by ISA: SSE (load1, dup, s4), SSE2, AVX, FMA3, AVX512F.
// NOTE(review): BENCHMARK_CONV presumably comes from bench/conv.h and instantiates each wrapper per network — confirm.
XNNPACK Teamb455b122019-09-27 18:10:33 -0700804 BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
Marat Dukhan802fcae2020-12-11 14:37:25 -0800805 BENCHMARK_CONV(f32_igemm_3x8__sse_load1)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700806 BENCHMARK_CONV(f32_igemm_4x8__sse_load1)
Marat Dukhan802fcae2020-12-11 14:37:25 -0800807 BENCHMARK_CONV(f32_igemm_5x8__sse_load1)
Marat Dukhan0f349c42019-11-27 11:58:54 -0800808
XNNPACK Teamb455b122019-09-27 18:10:33 -0700809 BENCHMARK_CONV(f32_igemm_1x8__sse_dup)
Marat Dukhan802fcae2020-12-11 14:37:25 -0800810 BENCHMARK_CONV(f32_igemm_3x8__sse_dup)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700811 BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
Marat Dukhan802fcae2020-12-11 14:37:25 -0800812 BENCHMARK_CONV(f32_igemm_5x8__sse_dup)
Marat Dukhan0f349c42019-11-27 11:58:54 -0800813
XNNPACK Teamb455b122019-09-27 18:10:33 -0700814 BENCHMARK_CONV(f32_igemm_1x8s4__sse)
Marat Dukhan802fcae2020-12-11 14:37:25 -0800815 BENCHMARK_CONV(f32_igemm_3x8s4__sse)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700816 BENCHMARK_CONV(f32_igemm_4x8s4__sse)
Marat Dukhan802fcae2020-12-11 14:37:25 -0800817 BENCHMARK_CONV(f32_igemm_5x8s4__sse)
818
819 BENCHMARK_CONV(f32_igemm_1x8__sse2_dup)
820 BENCHMARK_CONV(f32_igemm_3x8__sse2_dup)
821 BENCHMARK_CONV(f32_igemm_4x8__sse2_dup)
822 BENCHMARK_CONV(f32_igemm_5x8__sse2_dup)
Marat Dukhan0f349c42019-11-27 11:58:54 -0800823
Marat Dukhanfda12b82019-11-21 12:27:59 -0800824 BENCHMARK_CONV(f32_igemm_1x8__avx_broadcast)
825 BENCHMARK_CONV(f32_igemm_4x8__avx_broadcast)
826 BENCHMARK_CONV(f32_igemm_5x8__avx_broadcast)
827 BENCHMARK_CONV(f32_igemm_6x8__avx_broadcast)
828 BENCHMARK_CONV(f32_igemm_7x8__avx_broadcast)
Marat Dukhan0f349c42019-11-27 11:58:54 -0800829
Marat Dukhanfda12b82019-11-21 12:27:59 -0800830 BENCHMARK_CONV(f32_igemm_1x8__fma3_broadcast)
831 BENCHMARK_CONV(f32_igemm_4x8__fma3_broadcast)
832 BENCHMARK_CONV(f32_igemm_5x8__fma3_broadcast)
833 BENCHMARK_CONV(f32_igemm_6x8__fma3_broadcast)
834 BENCHMARK_CONV(f32_igemm_7x8__fma3_broadcast)
835 BENCHMARK_CONV(f32_igemm_8x8__fma3_broadcast)
Marat Dukhan0f349c42019-11-27 11:58:54 -0800836
837 BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast)
838 BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast)
839 BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast)
840 BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast)
841 BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast)
842 BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast)
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700843#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
844
XNNPACK Teamb455b122019-09-27 18:10:33 -0700845
Marat Dukhan4c617792021-12-21 15:47:58 -0800846#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Frank Barchard0725b8d2020-12-07 11:07:35 -0800847 static void f32_igemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700848 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
849 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700850 }
851
Frank Barchard0725b8d2020-12-07 11:07:35 -0800852 static void f32_igemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700853 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
854 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700855 }
856
Frank Barchard0725b8d2020-12-07 11:07:35 -0800857 static void f32_igemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700858 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
859 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700860 }
861
Frank Barchard0725b8d2020-12-07 11:07:35 -0800862 static void f32_igemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700863 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
864 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700865 }
866
Frank Barchard0725b8d2020-12-07 11:07:35 -0800867 static void f32_igemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700868 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
869 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700870 }
871
Frank Barchard0725b8d2020-12-07 11:07:35 -0800872 static void f32_igemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700873 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
874 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700875 }
876
Frank Barchard0725b8d2020-12-07 11:07:35 -0800877 static void f32_igemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700878 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
879 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700880 }
881
Frank Barchard0725b8d2020-12-07 11:07:35 -0800882 static void f32_igemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700883 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
884 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700885 }
886
Frank Barchard0725b8d2020-12-07 11:07:35 -0800887 static void f32_igemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700888 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
889 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700890 }
891
Frank Barchard0725b8d2020-12-07 11:07:35 -0800892 static void f32_igemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700893 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
894 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700895 }
896
Frank Barchard0725b8d2020-12-07 11:07:35 -0800897 static void f32_igemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700898 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
899 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700900 }
901
Frank Barchard0725b8d2020-12-07 11:07:35 -0800902 static void f32_igemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700903 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
904 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700905 }
906
Frank Barchard0725b8d2020-12-07 11:07:35 -0800907 static void f32_igemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700908 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
909 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700910 }
911
Frank Barchard0725b8d2020-12-07 11:07:35 -0800912 static void f32_igemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700913 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
914 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700915 }
916
Frank Barchard0725b8d2020-12-07 11:07:35 -0800917 static void f32_igemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700918 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
919 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700920 }
921
Frank Barchard0725b8d2020-12-07 11:07:35 -0800922 static void f32_igemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700923 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
924 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700925 }
926
927 static void f32_igemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700928 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
929 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700930 }
931
932 static void f32_igemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700933 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
934 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700935 }
936
937 static void f32_igemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700938 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
939 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700940 }
941
942 static void f32_igemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700943 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
944 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700945 }
946
947 static void f32_igemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700948 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
949 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700950 }
951
952 static void f32_igemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700953 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
954 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700955 }
956
957 static void f32_igemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700958 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
959 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700960 }
961
962 static void f32_igemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700963 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
964 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700965 }
966
Frank Barchard0725b8d2020-12-07 11:07:35 -0800967 BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_loadsplat)
968 BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_loadsplat)
969 BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_loadsplat)
970 BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_loadsplat)
971 BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_loadsplat)
972 BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_loadsplat)
973 BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_loadsplat)
974 BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_loadsplat)
975 BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_splat)
976 BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_splat)
977 BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_splat)
978 BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_splat)
979 BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_splat)
980 BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_splat)
981 BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_splat)
982 BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_splat)
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700983 BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_arm)
984 BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_arm)
985 BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_arm)
986 BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_arm)
987 BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_x86)
988 BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_x86)
989 BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_x86)
990 BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_x86)
Marat Dukhan4c617792021-12-21 15:47:58 -0800991#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan1bbf96b2020-06-15 23:01:20 -0700992
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700993
XNNPACK Teamb455b122019-09-27 18:10:33 -0700994static void f32_igemm_1x4__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700995 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
996 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700997}
998
999static void f32_igemm_2x4__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001000 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
1001 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001002}
1003
1004static void f32_igemm_4x4__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001005 IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
1006 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001007}
1008
1009BENCHMARK_CONV(f32_igemm_1x4__scalar)
1010BENCHMARK_CONV(f32_igemm_2x4__scalar)
1011BENCHMARK_CONV(f32_igemm_4x4__scalar)
1012
XNNPACK Teamb455b122019-09-27 18:10:33 -07001013#ifndef XNNPACK_BENCHMARK_NO_MAIN
1014BENCHMARK_MAIN();
1015#endif