// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "tensorflow/lite/experimental/ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>

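// Benchmarks a single f32 GEMM microkernel on an mc x nc x kc problem. mr x nr is the
// microkernel's output tile; kr and sr describe the packed-weight layout and must match the
// values the microkernel was built for (the s4 variants below use sr = 4). The optional
// isa_check callback skips the benchmark on CPUs that lack the required instruction set.
// FLOPS counts 2*M*N*K floating-point operations (one multiply and one add per accumulation).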
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

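// PPMM benchmarks: unlike the GEMM microkernels above, PPMM (pre-packed matrix-matrix
// multiplication) microkernels expect the panels of A pre-packed into a SIMD-friendly layout
// by an x32 packx microkernel. This unipass ("1P") variant packs each mr x kc panel of A into
// the temporary buffer t right before the matching ppmm call, so t only ever holds one panel.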
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

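// The two-pass ("2P") variant packs all of A in a first loop and only then runs the
// multiplication loop. This needs a larger temporary (mc rounded up to mr, times kc), but
// separates packing from multiplication, exposing the cost difference between the schedules.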
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

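// Optional baseline: the same problem sizes measured with Google's ruy library (the GEMM
// backend of TensorFlow Lite). Note the operand mapping below: the weight matrix k is bound
// as ruy's row-major LHS and the activations a as the column-major RHS, so ruy computes the
// same mc x nc GEMM with the result stored column-major.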
#ifdef BENCHMARK_RUY
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY

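// Each wrapper below binds one microkernel and its tile parameters to the harness above;
// BENCHMARK_GEMM (defined in bench/gemm.h) then registers the wrapper over a predefined set
// of mc/nc/kc problem shapes.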
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57, 5, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }
  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1);
  }
  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

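// The benchmark::utils::Check* helpers passed as isa_check below (CheckNEON, CheckNEONFMA,
// CheckAVX, CheckFMA3, CheckAVX512F) report the benchmark as skipped when the host CPU lacks
// the corresponding feature, so one binary can carry kernels for several ISA levels.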
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_pld_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }
  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }
  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16__avx_broadcast, 1, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_3x16__avx_broadcast, 3, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16__avx_broadcast, 4, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16__avx_broadcast, 5, 16, 1, 1, benchmark::utils::CheckAVX);
  }

  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_3x16__fma3_broadcast, 3, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1, benchmark::utils::CheckFMA3);
  }

  static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast, 3, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4, benchmark::utils::CheckFMA3);
  }

  static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)

  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)

  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)

  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

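// PSIMD microkernels are built on portable SIMD intrinsics (the psimd header), so the same
// source is expected to compile for any SIMD-capable target; only the WAsm builds are
// excluded here.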
#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
  static void f32_gemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(f32_gemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_4x8s4__psimd)
  BENCHMARK_GEMM(f32_gemm_6x8s4__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__psimd)
#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS

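// Scalar microkernels use no SIMD at all; they run on every architecture and serve as a
// portable baseline for the vectorized kernels above.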
static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)

#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

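// Typical invocation, assuming the standard Google Benchmark command-line flags (the binary
// name depends on the build setup and is only illustrative):
//
//   ./f32-gemm-bench --benchmark_filter='f32_gemm_.*__scalar'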
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif