// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "tensorflow/lite/experimental/ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>


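// Benchmarks a single f32 GEMM microkernel over the problem sizes supplied by
// the benchmark arguments (mc/nc/kc). The microkernel produces an mr x nr
// output tile per call; kr and sr describe the weight-packing layout it expects.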
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

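  // Packed weights: nc_stride x kc_stride kernel elements plus nc_stride bias
  // elements. W and C are replicated num_buffers times (enough to exceed the
  // largest cache) so that each iteration sees cold weights and output.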
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

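  // Clamping bounds of [-inf, +inf] make the output parameters a no-op.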
  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

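  // Each of the mc * nc * kc multiply-accumulates counts as 2 FLOPs.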
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

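// Benchmarks a unipass PPMM ("pre-packed matrix multiplication"): inside the
// timed region, each mr-row panel of A is packed with the packx microkernel
// and immediately multiplied by the ppmm microkernel.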
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

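  // t holds a single packed panel of A: mr rows of kc elements each.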
  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

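// Benchmarks a twopass PPMM: all panels of A are packed in a first pass, then
// multiplied by the ppmm microkernel in a second pass, both inside the timed
// region.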
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

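  // t holds all of A packed: mc_stride (mc rounded up to mr) rows of kc elements.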
  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

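// Reference benchmark: TensorFlow Lite's ruy library on the same problem sizes.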
#ifdef BENCHMARK_RUY
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

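  // Ruy's LHS is the weight matrix K (nc x kc, row-major) and its RHS is the
  // input A (kc x mc, column-major); the product C is nc x mc, column-major.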
  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY


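// Each wrapper below binds one microkernel and its tile parameters to the
// harness above; the BENCHMARK_GEMM macro (from bench/gemm.h) registers the
// wrapper with the benchmark framework.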
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57, 5, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }
  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1);
  }
  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_pld_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }
  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }
  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16__avx_broadcast, 1, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_3x16__avx_broadcast, 3, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16__avx_broadcast, 4, 16, 1, 1, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16__avx_broadcast, 5, 16, 1, 1, benchmark::utils::CheckAVX);
  }

  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_3x16__fma3_broadcast, 3, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1, benchmark::utils::CheckFMA3);
  }

  static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast, 3, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4, benchmark::utils::CheckFMA3);
  }

  static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1, benchmark::utils::CheckAVX512F);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)

  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)

  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)

  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
  static void f32_gemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(f32_gemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_4x8s4__psimd)
  BENCHMARK_GEMM(f32_gemm_6x8s4__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__psimd)
#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS

static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)

#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif