// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "tensorflow/lite/experimental/ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>

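// This file benchmarks XNNPACK F32 GEMM and PPMM (pre-packed GEMM) micro-kernels, with an optional Ruy baseline.
// GEMMBenchmark times a single xnn_f32_gemm micro-kernel on an mc x kc activation matrix against packed
// nc x kc weights. The mr and nr arguments are the micro-kernel's output tile height and width; kr and sr
// describe how the reduction dimension is packed (sr presumably matching the "s4" shuffle variants, an
// interpretation of the kernel names rather than anything stated here) and must agree with the kernel passed in.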
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

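  // Packed weights per copy: nc_stride * kc_stride weight elements (N and K rounded up to nr and kr) plus
  // nc_stride bias elements, matching the layout produced by xnn_pack_f32_gemm_goi_w below.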
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
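  // Allocate one more copy of the packed weights and the output than fits in the largest cache, so that
  // rotating through the copies always hands the timed kernel cold W and C buffers.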
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

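  // Clamping bounds are set to -/+ infinity, so the micro-kernel's output min/max checks never clip results.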
  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

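    // Process M in tiles of up to mr rows. The per-copy weight offset nc_stride * (kc_stride + 1) equals
    // w_elements, so each iteration reads a distinct packed-weight copy.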
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

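  // Each of the mc * nc outputs takes kc multiply-adds, i.e. 2 * mc * nc * kc FLOPs per GEMM.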
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

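// PPMM (pre-packed matrix multiplication) benchmarks: an x32 packx micro-kernel first re-packs the activation
// matrix A into an interleaved tile layout, which the PPMM micro-kernel then consumes. The "1P" (unipass)
// variant packs each mr-row tile right before multiplying it, so the pack buffer only holds one tile
// (mr * kc elements).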
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

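// The "2P" (two-pass) variant packs the entire A matrix (mc_stride * kc elements) in a first pass over all
// tiles, then runs the PPMM micro-kernel over the pre-packed tiles in a second pass; both passes are timed.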
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_RUY
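// Ruy baseline. The product is formed with the weight matrix as the left-hand side: ruy_a is the nc x kc
// weight matrix (row-major), ruy_b the kc x mc activation matrix (column-major), and ruy_c the nc x mc output
// (column-major), so the measured work matches the 2 * mc * nc * kc FLOPs counted by the XNNPACK benchmarks.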
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens
  // during the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to
  // stabilize. Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording
  // performance, and keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY

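// Benchmark registrations. Each wrapper is named after the micro-kernel it measures, following the
// f32_gemm_<MR>x<NR>[sSR]__<architecture>_<variant> convention, and the mr/nr/kr/sr values passed to
// GEMMBenchmark mirror the numbers encoded in the kernel name.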
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }
  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1);
  }

  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
  }

  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)

#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
  }

  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
  }

  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
  }

  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
  }

  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
  }

  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1);
  }

  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1);
  }

  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1);
  }

  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1);
  }

  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
  static void f32_gemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(f32_gemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_4x8s4__psimd)
  BENCHMARK_GEMM(f32_gemm_6x8s4__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__psimd)
#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS

static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)

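// The Ruy baseline is only registered when the build defines BENCHMARK_RUY. BENCHMARK_GEMM is assumed to be
// the registration macro declared in bench/gemm.h that hooks each wrapper up to the benchmark harness over a
// shared set of GEMM shapes.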
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif