// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>


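// Benchmarks a single f32 GEMM microkernel on the problem size given by the
// benchmark arguments: M = state.range(0), N = state.range(1), K = state.range(2).
// mr and nr describe the microkernel's output tile; kr and sr control the
// weight-packing layout and must match the microkernel being benchmarked.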
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_minmax_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

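  // Packed weights: for each nr-wide column tile, nr bias values followed by
  // an nr x kc_stride weight panel, i.e. nc_stride * kc_stride weights plus
  // nc_stride biases in total.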
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params =
    xnn_init_f32_minmax_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

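  // Report the sustained CPU frequency and achieved FLOP rate; each of the
  // mc * nc * kc multiply-accumulates counts as two FLOPs (a multiply and an add).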
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

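// Benchmarks an f32 PPMM (pre-packed matrix multiplication) microkernel in
// single-pass mode: each mr-row panel of A is packed by the packx microkernel
// right before the matching ppmm call, so the packed panel is still hot in
// cache when it is consumed.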
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

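  // Scratch buffer for a single packed panel of A (mr rows x kc columns).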
  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params =
    xnn_init_f32_minmax_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

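// Benchmarks an f32 PPMM microkernel in two-pass mode: all panels of A are
// packed up front, then the ppmm calls run over the pre-packed buffer. Unlike
// the single-pass variant, packed panels may have left the cache by the time
// they are consumed.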
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  // Scratch buffer for all packed panels of A (mc_stride rows x kc columns).
  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params =
    xnn_init_f32_minmax_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // First pass: pack all panels of A.
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    // Second pass: multiply the pre-packed panels.
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_RUY
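// Benchmarks Ruy, the GEMM library used by TensorFlow Lite, on the same
// problem sizes as a baseline for the XNNPACK microkernels.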
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());

  ruy::MulParams<float, float> mul_params;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif // BENCHMARK_RUY


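// The benchmark wrappers below follow the microkernel naming convention
// f32_gemm_MRxNR__<arch>_<variant>: MR x NR is the output tile computed per
// microkernel call, and must match the mr/nr tile arguments passed to GEMMBenchmark.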
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64, 1, 8, 1, 1);
  }
  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57, 5, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }
  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1);
  }
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

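// AArch32 assembly microkernels; all check for NEON at runtime except the VFP variant.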
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, 4, 4, 1, 1, benchmark::utils::CheckVFP);
  }
  static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(f32_gemm_4x4__aarch32_vfp_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_pld_cortex_a75)
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

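// NEON microkernels written with intrinsics; these build on both AArch32 and
// AArch64, and gate on runtime ISA checks (CheckNEON / CheckNEONFMA) rather
// than on build-time assembly support.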
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

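// x86 microkernels: the SSE variants run unconditionally (SSE is part of the
// x86-64 baseline), while the AVX, FMA3, and AVX512F variants gate on the
// corresponding runtime CPU feature checks.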
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }
  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }
  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 3, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 4, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 5, 16, 1, 1, benchmark::utils::CheckAVX);
  }

  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 3, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1, benchmark::utils::CheckFMA3);
  }

  static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 3, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4, benchmark::utils::CheckFMA3);
  }

  static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)

  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)

  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)

  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64

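// WAsm SIMD microkernels come in _arm and _x86 flavors benchmarked on identical
// problem sizes; the suffix names the target the min/max clamping sequence is
// tuned for (presumably because wasm f32x4.min/max lower poorly on x86), not a
// different instruction set.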
#if XNN_ARCH_WASMSIMD
  static void f32_gemm_3x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_loadsplat_arm, 3, 8, 1, 1);
  }

  static void f32_gemm_4x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_loadsplat_arm, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_loadsplat_arm, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_loadsplat_arm, 6, 8, 1, 1);
  }

  static void f32_gemm_3x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_loadsplat_x86, 3, 8, 1, 1);
  }

  static void f32_gemm_4x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_loadsplat_x86, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_loadsplat_x86, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_loadsplat_x86, 6, 8, 1, 1);
  }

  static void f32_gemm_3x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_splat_arm, 3, 8, 1, 1);
  }

  static void f32_gemm_4x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_splat_arm, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_splat_arm, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_splat_arm, 6, 8, 1, 1);
  }

  static void f32_gemm_3x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_splat_x86, 3, 8, 1, 1);
  }

  static void f32_gemm_4x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_splat_x86, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_splat_x86, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_splat_x86, 6, 8, 1, 1);
  }

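  // Shuffled (SR=4) variants: four A elements are kept in a vector that is
  // rotated between multiply-accumulate steps instead of splatting each lane.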
  static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4);
  }

  static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4);
  }

  static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4);
  }

  static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4);
  }

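  // PPMM benchmarks measure GEMM on pre-packed A: an x32 packx micro-kernel
  // packs A into a dense panel that the PPMM micro-kernel then consumes.
  // Roughly, the unipass variant interleaves packing with the GEMM, while
  // the twopass variant packs all of A up front and only then computes.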
  static void f32_ppmm_4x8_unipass__wasmsimd_splat_arm(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_splat_arm, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8);
  }
  static void f32_ppmm_4x8_unipass__wasmsimd_splat_x86(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_splat_x86, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__wasmsimd_splat_arm(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_splat_arm, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8);
  }
  static void f32_ppmm_4x8_twopass__wasmsimd_splat_x86(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_splat_x86, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8);
  }

  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_loadsplat_arm)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_loadsplat_arm)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_loadsplat_arm)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_loadsplat_arm)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_loadsplat_x86)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_loadsplat_x86)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_loadsplat_x86)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_loadsplat_x86)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_splat_arm)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_splat_arm)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_splat_arm)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_splat_arm)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_splat_x86)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_splat_x86)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_splat_x86)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_splat_x86)
  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_splat_arm)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_splat_x86)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_splat_arm)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_splat_x86)
#endif  // XNN_ARCH_WASMSIMD

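// Portable scalar micro-kernels: the SIMD-free baseline that runs on every
// architecture.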
static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)

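// Single-threaded ruy GEMM as an external reference point, built only when
// BENCHMARK_RUY is defined.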
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

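// Defining XNNPACK_BENCHMARK_NO_MAIN lets another translation unit supply
// main(), e.g. when linking several benchmark files into a single binary.
// A typical standalone invocation (the binary name depends on the build
// system) uses Google Benchmark's filter flag:
//   ./f32-gemm --benchmark_filter='f32_gemm_.*__scalar'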
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif