blob: f236572329e3302cd6f7151187ccd778c59d9b41 [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070033
XNNPACK Teamb455b122019-09-27 18:10:33 -070034
35static void GEMMBenchmark(benchmark::State& state,
Marat Dukhande06f492020-04-09 00:19:31 -070036 xnn_f32_gemm_minmax_ukernel_function gemm,
Marat Dukhanc8466f52019-11-25 18:01:10 -080037 size_t mr, size_t nr, size_t kr, size_t sr,
Marat Dukhan104ae5e2021-05-24 13:41:57 -070038 xnn_init_f32_minmax_params_fn init_params,
Marat Dukhanc8466f52019-11-25 18:01:10 -080039 benchmark::utils::IsaCheckFunction isa_check = nullptr)
XNNPACK Teamb455b122019-09-27 18:10:33 -070040{
Marat Dukhanc8466f52019-11-25 18:01:10 -080041 if (isa_check && !isa_check(state)) {
42 return;
43 }
XNNPACK Teamb455b122019-09-27 18:10:33 -070044
45 const size_t mc = state.range(0);
46 const size_t nc = state.range(1);
47 const size_t kc = state.range(2);
48
Marat Dukhan42323232019-10-23 02:09:02 -070049 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
Marat Dukhanfbd67a72022-01-31 18:03:50 -080050 const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
XNNPACK Teamb455b122019-09-27 18:10:33 -070051
52 std::random_device random_device;
53 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070054 auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -070055
56 std::vector<float> a(mc * kc);
57 std::generate(a.begin(), a.end(), std::ref(f32rng));
58 std::vector<float> k(nc * kc);
59 std::generate(k.begin(), k.end(), std::ref(f32rng));
60 std::vector<float> b(nc);
61 std::generate(b.begin(), b.end(), std::ref(f32rng));
62
63 const size_t w_elements = nc_stride * kc_stride + nc_stride;
64 const size_t c_elements = mc * nc;
65 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -070066 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -070067 sizeof(float) * (w_elements + c_elements));
68
Marat Dukhane13e6392021-07-26 22:22:35 -070069 std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
XNNPACK Teamb455b122019-09-27 18:10:33 -070070 std::fill(w.begin(), w.end(), 0.0f);
Marat Dukhan0b043742021-06-02 18:29:11 -070071 xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
XNNPACK Teamb455b122019-09-27 18:10:33 -070072 std::vector<float> c(c_elements * num_buffers);
73 std::fill(c.begin(), c.end(), std::nanf(""));
74
Marat Dukhanf56f4c42021-05-17 01:47:20 -070075 xnn_f32_minmax_params params;
Marat Dukhan104ae5e2021-05-24 13:41:57 -070076 init_params(&params,
77 -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Teamb455b122019-09-27 18:10:33 -070078
79 size_t buffer_index = 0;
80 for (auto _ : state) {
81 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
82 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
83 // - W is not in cache (for any cache level)
84 // - C is not in cache (for any cache level)
85 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -070086 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -070087 buffer_index = (buffer_index + 1) % num_buffers;
88 state.ResumeTiming();
89
90 for (uint32_t m = 0; m < mc; m += mr) {
91 const uint32_t mb = min(mc - m, mr);
92 gemm(
93 mb, nc, kc * sizeof(float),
94 a.data() + m * kc, kc * sizeof(float),
95 w.data() + buffer_index * nc_stride * (kc_stride + 1),
96 c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
Frank Barcharde70dbeb2020-05-01 15:46:41 -070097 &params);
XNNPACK Teamb455b122019-09-27 18:10:33 -070098 }
99 }
100
Marat Dukhand713e8a2020-12-04 14:23:12 -0800101 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
102 if (cpu_frequency != 0) {
103 state.counters["cpufreq"] = cpu_frequency;
104 }
105
XNNPACK Teamb455b122019-09-27 18:10:33 -0700106 state.counters["FLOPS"] = benchmark::Counter(
107 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
108}
109
110static void PPMM1PBenchmark(benchmark::State& state,
Marat Dukhande06f492020-04-09 00:19:31 -0700111 xnn_f32_ppmm_minmax_ukernel_function ppmm,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700112 xnn_x32_packx_ukernel_function packx,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800113 size_t mr, size_t nr,
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700114 xnn_init_f32_minmax_params_fn init_params,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800115 benchmark::utils::IsaCheckFunction isa_check = nullptr)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700116{
Marat Dukhanc8466f52019-11-25 18:01:10 -0800117 if (isa_check && !isa_check(state)) {
118 return;
119 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700120
121 const size_t mc = state.range(0);
122 const size_t nc = state.range(1);
123 const size_t kc = state.range(2);
124
Marat Dukhan42323232019-10-23 02:09:02 -0700125 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700126
127 std::random_device random_device;
128 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -0700129 auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700130
131 std::vector<float> a(mc * kc);
132 std::generate(a.begin(), a.end(), std::ref(f32rng));
133 std::vector<float> k(nc * kc);
134 std::generate(k.begin(), k.end(), std::ref(f32rng));
135 std::vector<float> b(nc);
136 std::generate(b.begin(), b.end(), std::ref(f32rng));
137
Marat Dukhane13e6392021-07-26 22:22:35 -0700138 std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mr * kc);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700139
140 const size_t w_elements = nc_stride * kc + nc_stride;
141 const size_t c_elements = mc * nc;
142 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -0700143 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700144 sizeof(float) * (w_elements + c_elements));
145
Marat Dukhane13e6392021-07-26 22:22:35 -0700146 std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700147 std::fill(w.begin(), w.end(), 0.0f);
Marat Dukhan0b043742021-06-02 18:29:11 -0700148 xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700149 std::vector<float> c(c_elements * num_buffers);
150 std::fill(c.begin(), c.end(), std::nanf(""));
151
Marat Dukhanf56f4c42021-05-17 01:47:20 -0700152 xnn_f32_minmax_params params;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700153 init_params(&params,
154 -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Teamb455b122019-09-27 18:10:33 -0700155
156 size_t buffer_index = 0;
157 for (auto _ : state) {
158 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
159 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
160 // - W is not in cache (for any cache level)
161 // - C is not in cache (for any cache level)
162 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700163 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700164 buffer_index = (buffer_index + 1) % num_buffers;
165 state.ResumeTiming();
166
167 for (uint32_t m = 0; m < mc; m += mr) {
168 const uint32_t mb = min(mc - m, mr);
169 packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
170 ppmm(
171 mb, nc, kc * sizeof(float),
172 reinterpret_cast<const float*>(t.data()),
173 w.data() + nc_stride * buffer_index * (kc + 1),
174 c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
Frank Barcharde70dbeb2020-05-01 15:46:41 -0700175 &params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700176 }
177 }
178
Marat Dukhand713e8a2020-12-04 14:23:12 -0800179 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
180 if (cpu_frequency != 0) {
181 state.counters["cpufreq"] = cpu_frequency;
182 }
183
XNNPACK Teamb455b122019-09-27 18:10:33 -0700184 state.counters["FLOPS"] = benchmark::Counter(
185 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
186}
187
188static void PPMM2PBenchmark(benchmark::State& state,
Marat Dukhande06f492020-04-09 00:19:31 -0700189 xnn_f32_ppmm_minmax_ukernel_function ppmm,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700190 xnn_x32_packx_ukernel_function packx,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800191 size_t mr, size_t nr,
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700192 xnn_init_f32_minmax_params_fn init_params,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800193 benchmark::utils::IsaCheckFunction isa_check = nullptr)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700194{
Marat Dukhanc8466f52019-11-25 18:01:10 -0800195 if (isa_check && !isa_check(state)) {
196 return;
197 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700198
199 const size_t mc = state.range(0);
200 const size_t nc = state.range(1);
201 const size_t kc = state.range(2);
202
Marat Dukhan42323232019-10-23 02:09:02 -0700203 const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
204 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700205
206 std::random_device random_device;
207 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -0700208 auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700209
210 std::vector<float> a(mc * kc);
211 std::generate(a.begin(), a.end(), std::ref(f32rng));
212 std::vector<float> k(nc * kc);
213 std::generate(k.begin(), k.end(), std::ref(f32rng));
214 std::vector<float> b(nc);
215 std::generate(b.begin(), b.end(), std::ref(f32rng));
216
Marat Dukhane13e6392021-07-26 22:22:35 -0700217 std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mc_stride * kc);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700218
219 const size_t w_elements = nc_stride * kc + nc_stride;
220 const size_t c_elements = mc * nc;
221 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -0700222 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700223 sizeof(float) * (w_elements + c_elements));
224
Marat Dukhane13e6392021-07-26 22:22:35 -0700225 std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700226 std::fill(w.begin(), w.end(), 0.0f);
Marat Dukhan0b043742021-06-02 18:29:11 -0700227 xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700228 std::vector<float> c(c_elements * num_buffers);
229 std::fill(c.begin(), c.end(), std::nanf(""));
230
Marat Dukhanf56f4c42021-05-17 01:47:20 -0700231 xnn_f32_minmax_params params;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700232 init_params(&params,
233 -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Teamb455b122019-09-27 18:10:33 -0700234
235 size_t buffer_index = 0;
236 for (auto _ : state) {
237 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
238 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
239 // - W is not in cache (for any cache level)
240 // - C is not in cache (for any cache level)
241 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700242 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700243 buffer_index = (buffer_index + 1) % num_buffers;
244 state.ResumeTiming();
245
246 for (uint32_t m = 0; m < mc; m += mr) {
247 const uint32_t mb = min(mc - m, mr);
248 packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
249 }
250 for (uint32_t m = 0; m < mc; m += mr) {
251 const uint32_t mb = min(mc - m, mr);
252 ppmm(
253 mb, nc, kc * sizeof(float),
254 reinterpret_cast<const float*>(t.data() + m * kc),
255 w.data() + nc_stride * buffer_index * (kc + 1),
256 c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
Frank Barcharde70dbeb2020-05-01 15:46:41 -0700257 &params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700258 }
259 }
260
Marat Dukhand713e8a2020-12-04 14:23:12 -0800261 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
262 if (cpu_frequency != 0) {
263 state.counters["cpufreq"] = cpu_frequency;
264 }
265
XNNPACK Teamb455b122019-09-27 18:10:33 -0700266 state.counters["FLOPS"] = benchmark::Counter(
267 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
268}
269
Marat Dukhan33f0c7a2019-10-01 13:33:08 -0700270#ifdef BENCHMARK_RUY
// Benchmarks ruy's float GEMM on the same MxNxK problem (state ranges
// 0 = M, 1 = N, 2 = K) for comparison against the XNNPACK microkernels.
// `threads` is the maximum number of ruy worker threads.
// Note the operand mapping: ruy_a holds the weights K (N x K, row-major),
// ruy_b holds the input A (K x M, col-major), ruy_c is the output (N x M).
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  // Enough K/B/C buffer copies to exceed the last-level cache, so each
  // iteration sees cold weights, bias, and output.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());

  ruy::MulParams<float, float> mul_params;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2*M*N*K FLOPs (multiply + add) per iteration.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
351
// Single-threaded ruy baseline (the `net` parameter is required by the
// BENCHMARK_GEMM registration signature but unused here).
static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
Marat Dukhan33f0c7a2019-10-01 13:33:08 -0700356#endif // BENCHMARK_RUY
XNNPACK Teamb455b122019-09-27 18:10:33 -0700357
Zhi An Ngd2360742022-01-27 13:32:03 -0800358#if XNN_PLATFORM_JIT
Zhi An Ng25764d82022-01-07 11:27:36 -0800359static void GEMMBenchmark(benchmark::State& state,
Zhi An Ng83844ae2022-01-14 09:52:25 -0800360 xnn_jit_gemm_code_generator_function generator,
Zhi An Ng25764d82022-01-07 11:27:36 -0800361 size_t mr, size_t nr, size_t kr, size_t sr,
362 xnn_init_f32_minmax_params_fn init_params,
363 benchmark::utils::IsaCheckFunction isa_check = nullptr)
364{
Zhi An Ng70ea0a22022-01-20 10:55:00 -0800365 if (isa_check && !isa_check(state)) {
366 return;
367 }
368
369 const size_t mc = state.range(0);
Zhi An Ng83844ae2022-01-14 09:52:25 -0800370 const size_t nc = state.range(1);
371 const size_t kc = state.range(2);
Zhi An Ng70ea0a22022-01-20 10:55:00 -0800372
373 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
374 const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);
375
376 std::random_device random_device;
377 auto rng = std::mt19937(random_device());
378 auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
379
380 std::vector<float> a(mc * kc);
381 std::generate(a.begin(), a.end(), std::ref(f32rng));
382 std::vector<float> k(nc * kc);
383 std::generate(k.begin(), k.end(), std::ref(f32rng));
384 std::vector<float> b(nc);
385 std::generate(b.begin(), b.end(), std::ref(f32rng));
386
387 const size_t w_elements = nc_stride * kc_stride + nc_stride;
388 const size_t c_elements = mc * nc;
389 const size_t num_buffers = 1 +
390 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
391 sizeof(float) * (w_elements + c_elements));
392
393 std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
394 std::fill(w.begin(), w.end(), 0.0f);
395 xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
396 std::vector<float> c(c_elements * num_buffers);
397 std::fill(c.begin(), c.end(), std::nanf(""));
398
399 xnn_f32_minmax_params params;
400 init_params(&params,
401 -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
402
403 xnn_code_buffer code_buffer;
404 xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
405 jit_gemm_params jit_params = {
Zhi An Ngf9fc9ec2022-02-01 13:19:31 -0800406 .f32_minmax = {
407 .min = -std::numeric_limits<float>::infinity(),
408 .max = +std::numeric_limits<float>::infinity()
409 }
Zhi An Ng70ea0a22022-01-20 10:55:00 -0800410 };
Zhi An Ng9fd2f3e2022-02-02 14:15:34 -0800411 generator(&code_buffer, nc, kc * sizeof(float), &jit_params);
Zhi An Ng70ea0a22022-01-20 10:55:00 -0800412 xnn_f32_gemm_minmax_ukernel_function gemm = reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.code);
413
414 size_t buffer_index = 0;
415 for (auto _ : state) {
416 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
417 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
418 // - W is not in cache (for any cache level)
419 // - C is not in cache (for any cache level)
420 state.PauseTiming();
421 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
422 buffer_index = (buffer_index + 1) % num_buffers;
423 state.ResumeTiming();
424
425 for (uint32_t m = 0; m < mc; m += mr) {
426 const uint32_t mb = min(mc - m, mr);
427 gemm(
428 mb, nc, kc * sizeof(float),
429 a.data() + m * kc, kc * sizeof(float),
430 w.data() + buffer_index * nc_stride * (kc_stride + 1),
431 c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
432 &params);
433 }
434 }
435
Zhi An Ng25764d82022-01-07 11:27:36 -0800436 xnn_release_code_memory(&code_buffer);
Zhi An Ng70ea0a22022-01-20 10:55:00 -0800437
438 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
439 if (cpu_frequency != 0) {
440 state.counters["cpufreq"] = cpu_frequency;
441 }
442
443 state.counters["FLOPS"] = benchmark::Counter(
444 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
Zhi An Ng25764d82022-01-07 11:27:36 -0800445}
Zhi An Ngd2360742022-01-27 13:32:03 -0800446#endif // XNN_PLATFORM_JIT
XNNPACK Teamb455b122019-09-27 18:10:33 -0700447
Frank Barcharddbafc582019-10-09 16:30:48 -0700448#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  // 1x8 AArch64 NEONFMA GEMM, LD64 loads (MR=1, NR=8, KR=1, SR=1).
  static void f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 1x12 AArch64 NEONFMA GEMM tuned for Cortex-A53 (MR=1, NR=12).
  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 1x8 AArch64 NEONFMA GEMM tuned for Cortex-A53 (MR=1, NR=8).
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 1x8 AArch64 NEONFMA GEMM tuned for Cortex-A75 (MR=1, NR=8).
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 1x8 AArch64 NEONFMA GEMM, Cortex-A75 tuning with software prefetch (PRFM).
  static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x12 AArch64 NEONFMA GEMM tuned for Cortex-A53 (MR=4, NR=12).
  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x8 AArch64 NEONFMA GEMM tuned for Cortex-A53 (MR=4, NR=8).
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x8 AArch64 NEONFMA GEMM tuned for Cortex-A55 (MR=4, NR=8).
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x8 AArch64 NEONFMA GEMM tuned for Cortex-A75 (MR=4, NR=8).
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x8 AArch64 NEONFMA GEMM, Cortex-A75 tuning with software prefetch (PRFM).
  static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x8 AArch64 NEONFMA GEMM, LD64 loads (MR=4, NR=8).
  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x8 AArch64 NEONFMA GEMM, LD128 loads (MR=4, NR=8).
  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 5x8 AArch64 NEONFMA GEMM tuned for Cortex-A75 (MR=5, NR=8).
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 5x8 AArch64 NEONFMA GEMM, Cortex-A75 tuning with software prefetch (PRFM).
  static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 6x8 AArch64 NEONFMA GEMM, LD64 loads (MR=6, NR=8).
  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 6x8 AArch64 NEONFMA GEMM, LD128 loads (MR=6, NR=8).
  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 6x8 AArch64 NEONFMA GEMM tuned for Cortex-A53 (MR=6, NR=8).
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 6x8 AArch64 NEONFMA GEMM tuned for Cortex-A55 (MR=6, NR=8).
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 6x8 AArch64 NEONFMA GEMM tuned for Cortex-A73 (MR=6, NR=8).
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 6x8 AArch64 NEONFMA GEMM tuned for Cortex-A75 (MR=6, NR=8).
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 6x8 AArch64 NEONFMA GEMM, Cortex-A75 tuning with software prefetch (PRFM).
  static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 1x8 NEONFMA GEMM intrinsics kernel, lane loads via LD64 (MR=1, NR=8).
  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  // 4x8 NEONFMA GEMM intrinsics kernel, lane loads via LD64 (MR=4, NR=8).
  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
Frank Barchard91317c52019-11-22 10:54:35 -0800541 static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700542 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
543 xnn_init_f32_minmax_scalar_params);
Frank Barchard91317c52019-11-22 10:54:35 -0800544 }
Frank Barchard91317c52019-11-22 10:54:35 -0800545 static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700546 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1,
547 xnn_init_f32_minmax_scalar_params);
Frank Barchard91317c52019-11-22 10:54:35 -0800548 }
Frank Barchard91317c52019-11-22 10:54:35 -0800549 static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700550 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
551 xnn_init_f32_minmax_scalar_params);
Frank Barchard91317c52019-11-22 10:54:35 -0800552 }
Frank Barchard69172d92019-11-26 16:22:39 -0800553 static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700554 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
555 xnn_init_f32_minmax_scalar_params);
Frank Barchard5243bb02019-11-22 16:37:50 -0800556 }
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700557
  // Register all AArch64 wrappers in this #if block with the Google Benchmark
  // runner.  BENCHMARK_GEMM (bench/gemm.h) expands each wrapper into one
  // benchmark per GEMM problem-size configuration.
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700586
// AArch32 assembly f32 GEMM microkernel benchmarks.
// The trailing benchmark::utils::Check* callback is a CPU-feature check
// (VFP / NEON) consulted by the harness so the benchmark can bail out on
// hardware that lacks the instruction set.
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckVFP);
  }
  static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(f32_gemm_4x4__aarch32_vfp_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
625
// JIT-generated AArch32 f32 GEMM benchmarks.
// These wrappers pass an xnn_generate_* code-generator entry point (rather
// than a precompiled microkernel) to the GEMMBenchmark overload that emits
// the kernel at runtime; tile parameters (mr, nr, kr, sr) mirror the
// corresponding assembly kernels above.
#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
  static void jit_f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT
665
// JIT-generated AArch64 f32 GEMM benchmarks (code-generator entry points,
// emitted at runtime by the GEMMBenchmark overload taking xnn_generate_*).
#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
  static void jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700692
// NEON / NEONFMA intrinsics f32 GEMM and PPMM benchmarks, shared by 32-bit
// and 64-bit ARM builds.  GEMM wrappers pass (mr, nr, kr, sr) tile
// parameters; the s4 variants use sr = 4 (shuffle-by-4 B layout).  PPMM
// wrappers additionally pass an x32 packing microkernel: PPMM1PBenchmark
// packs and computes in a single pass, PPMM2PBenchmark in two passes.
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)

  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)

  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)

  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700805
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700806
// x86/x86-64 f32 GEMM benchmarks.  AVX512F broadcast microkernels: 16-column
// tiles, row counts 1..8; CheckAVX512F gates on CPU support.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800832
  // FMA3 broadcast microkernels.  Note: these use the AVX variant of the
  // min/max params initializer (xnn_init_f32_minmax_avx_params), unlike the
  // scalar initializer used elsewhere in this file.
  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
861 static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700862 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 4, 16, 1, 1,
863 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhan27121322019-12-09 14:57:40 -0800864 }
  // 4- and 5-row FMA3 broadcast microkernels with 16-column tiles.
  static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
873
  // FMA3 s4 variant: sr = 4 selects the shuffle-by-4 packed-B layout.
  static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
878 static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700879 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 4, 16, 1, 4,
880 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
Marat Dukhan27121322019-12-09 14:57:40 -0800881 }
  // 4- and 5-row FMA3 s4 microkernels (sr = 4 shuffled-B layout).
  static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
Marat Dukhanfda12b82019-11-21 12:27:59 -0800890
  // AVX broadcast microkernels: 8-column tiles with 1..7 rows, plus the
  // 16-column single-row variant; CheckAVX gates on CPU support.
  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
915 static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
916 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 4, 16, 1, 1,
917 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
918 }
  // 4- and 5-row AVX broadcast microkernels with 16-column tiles.
  static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
927
  // SSE2 dup microkernels.  SSE2 is part of the x86-64 baseline, so no
  // CPU-feature check callback is passed; these use the SSE params
  // initializer (xnn_init_f32_minmax_sse_params).
  static void f32_gemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
Marat Dukhan0f349c42019-11-27 11:58:54 -0800944
Marat Dukhan81025932021-05-26 09:01:05 -0700945 static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
946 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
947 xnn_init_f32_minmax_sse_params);
948 }
949 static void f32_gemm_3x8__sse_load1(benchmark::State& state, const char* net) {
950 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
951 xnn_init_f32_minmax_sse_params);
952 }
953 static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
954 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
955 xnn_init_f32_minmax_sse_params);
956 }
957 static void f32_gemm_5x8__sse_load1(benchmark::State& state, const char* net) {
958 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
959 xnn_init_f32_minmax_sse_params);
960 }
Marat Dukhan0f349c42019-11-27 11:58:54 -0800961
Marat Dukhan81025932021-05-26 09:01:05 -0700962 static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
963 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
964 xnn_init_f32_minmax_sse_params);
965 }
966 static void f32_gemm_3x8__sse_dup(benchmark::State& state, const char* net) {
967 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
968 xnn_init_f32_minmax_sse_params);
969 }
970 static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
971 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
972 xnn_init_f32_minmax_sse_params);
973 }
974 static void f32_gemm_5x8__sse_dup(benchmark::State& state, const char* net) {
975 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
976 xnn_init_f32_minmax_sse_params);
977 }
Marat Dukhan802fcae2020-12-11 14:37:25 -0800978
Marat Dukhan81025932021-05-26 09:01:05 -0700979 static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
980 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
981 xnn_init_f32_minmax_sse_params);
982 }
983 static void f32_gemm_3x8s4__sse(benchmark::State& state, const char* net) {
984 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
985 xnn_init_f32_minmax_sse_params);
986 }
987 static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
988 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
989 xnn_init_f32_minmax_sse_params);
990 }
991 static void f32_gemm_5x8s4__sse(benchmark::State& state, const char* net) {
992 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
993 xnn_init_f32_minmax_sse_params);
994 }
Marat Dukhan0f349c42019-11-27 11:58:54 -0800995
Marat Dukhan81025932021-05-26 09:01:05 -0700996 static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
997 PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
998 xnn_init_f32_minmax_sse_params);
999 }
1000 static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
1001 PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
1002 xnn_init_f32_minmax_sse_params);
1003 }
Marat Dukhan0f349c42019-11-27 11:58:54 -08001004
  // Register the x86/x86-64 GEMM and PPMM benchmarks, grouped by ISA and
  // microkernel variant. Registration order determines benchmark output order.

  // AVX512F broadcast microkernels.
  BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)

  // FMA3 broadcast microkernels (8- and 16-wide tiles).
  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

  // FMA3 s4 variants.
  BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

  // AVX broadcast microkernels.
  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

  // SSE2 dup microkernels.
  BENCHMARK_GEMM(f32_gemm_1x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_3x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_5x8__sse2_dup)

  // SSE load1 microkernels.
  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_3x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_5x8__sse_load1)

  // SSE dup microkernels.
  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_3x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_5x8__sse_dup)

  // SSE s4 microkernels.
  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_3x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_5x8s4__sse)

  // SSE PPMM (pre-packed GEMM) benchmarks.
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001060#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -07001061
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001062
Marat Dukhan4c617792021-12-21 15:47:58 -08001063#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Frank Barchard0725b8d2020-12-07 11:07:35 -08001064 static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001065 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
1066 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001067 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001068 static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001069 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
1070 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001071 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001072 static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001073 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
1074 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001075 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001076 static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001077 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
1078 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001079 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001080 static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001081 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
1082 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001083 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001084 static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001085 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
1086 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001087 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001088 static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001089 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
1090 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001091 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001092 static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001093 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
1094 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001095 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001096 static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001097 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
1098 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001099 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001100 static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001101 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
1102 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001103 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001104 static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001105 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
1106 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001107 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001108 static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001109 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
1110 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001111 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001112 static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001113 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
1114 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001115 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001116 static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001117 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
1118 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001119 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001120 static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001121 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
1122 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001123 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001124 static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001125 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
1126 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001127 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001128 static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001129 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
1130 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001131 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001132 static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001133 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
1134 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001135 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001136 static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001137 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
1138 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001139 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001140 static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001141 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
1142 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001143 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001144 static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001145 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
1146 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001147 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001148 static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001149 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
1150 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001151 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001152 static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001153 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
1154 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001155 }
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001156 static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001157 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
1158 xnn_init_f32_minmax_scalar_params);
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001159 }
1160
Frank Barchard0725b8d2020-12-07 11:07:35 -08001161 static void f32_ppmm_4x8_unipass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001162 PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1163 xnn_init_f32_minmax_scalar_params);
Marat Dukhan0d0d8822020-07-23 23:37:56 -07001164 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001165 static void f32_ppmm_4x8_unipass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001166 PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1167 xnn_init_f32_minmax_scalar_params);
Marat Dukhan0d0d8822020-07-23 23:37:56 -07001168 }
1169
Frank Barchard0725b8d2020-12-07 11:07:35 -08001170 static void f32_ppmm_4x8_twopass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001171 PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1172 xnn_init_f32_minmax_scalar_params);
Marat Dukhan0d0d8822020-07-23 23:37:56 -07001173 }
Frank Barchard0725b8d2020-12-07 11:07:35 -08001174 static void f32_ppmm_4x8_twopass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001175 PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1176 xnn_init_f32_minmax_scalar_params);
Marat Dukhan0d0d8822020-07-23 23:37:56 -07001177 }
1178
  // Register the WAsm SIMD GEMM and PPMM benchmarks, grouped by variant.
  // Registration order determines benchmark output order.
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_x86)

  // PPMM (pre-packed GEMM) benchmarks.
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_x86_splat)

  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_x86_splat)
Marat Dukhan4c617792021-12-21 15:47:58 -08001209#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07001210
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001211
Frank Barchard95bebc92019-11-15 18:18:28 -08001212static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001213 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
1214 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001215}
Frank Barchard95bebc92019-11-15 18:18:28 -08001216static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001217 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
1218 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001219}
Frank Barchard95bebc92019-11-15 18:18:28 -08001220static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001221 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
1222 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001223}
1224
Frank Barchard95bebc92019-11-15 18:18:28 -08001225static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001226 PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
1227 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001228}
Frank Barchard95bebc92019-11-15 18:18:28 -08001229static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001230 PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
1231 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001232}
Frank Barchard95bebc92019-11-15 18:18:28 -08001233static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001234 PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
1235 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001236}
Frank Barchard95bebc92019-11-15 18:18:28 -08001237static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001238 PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
1239 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001240}
1241
Frank Barchard95bebc92019-11-15 18:18:28 -08001242static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001243 PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
1244 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001245}
Frank Barchard95bebc92019-11-15 18:18:28 -08001246static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001247 PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
1248 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001249}
Frank Barchard95bebc92019-11-15 18:18:28 -08001250static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001251 PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
1252 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001253}
Frank Barchard95bebc92019-11-15 18:18:28 -08001254static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001255 PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
1256 xnn_init_f32_minmax_scalar_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001257}
1258
// Register the portable scalar GEMM and PPMM benchmarks.
BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)
XNNPACK Teamb455b122019-09-27 18:10:33 -07001272
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001273
// When built with ruy support, also register the single-threaded ruy GEMM
// as an external performance baseline.
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001277
// Provide main() via Google Benchmark unless the embedding build supplies
// its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif