// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "tensorflow/lite/experimental/ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>


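// Benchmarks one f32 GEMM micro-kernel. Each micro-kernel call computes an mr x nr tile of the output;
// kr and sr are the reduction and shuffle packing factors that the weight-packing routine must match.
// The (M, N, K) problem sizes come from state.range(0..2), supplied by the BENCHMARK_GEMM registrations.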
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

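  // Packed weights hold nc_stride * kc_stride filter elements plus nc_stride bias values. Enough copies
  // of the W and C buffers are allocated to exceed the largest data cache, so rotating through them keeps
  // W and C cold on every timed iteration.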
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

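  // Clamping bounds of -inf/+inf make the micro-kernel's output min/max a no-op, so only the raw GEMM is measured.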
  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

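  // Each multiply-accumulate counts as two floating-point operations, hence 2 * M * N * K FLOPs per GEMM.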
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

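// Benchmarks a PPMM (pre-packed GEMM) micro-kernel in unipass mode: inside the timed loop, each panel of
// mr rows of A is packed with the packx micro-kernel and immediately consumed by the ppmm micro-kernel,
// so the packed panel can stay cache-resident.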
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

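// Benchmarks a PPMM micro-kernel in two-pass mode: all of A is packed into the temporary buffer first,
// and only then are the ppmm micro-kernel tiles run, modeling a separate packing pass over the full matrix.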
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_RUY
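// Reference baseline: the same GEMM shapes measured with TensorFlow Lite's ruy library.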
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

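  // Map the problem onto ruy so that results land in the same memory layout as the XNNPACK kernels:
  // the filter K (nc x kc, row-major) is the LHS, A is viewed as a column-major kc x mc RHS, and the
  // destination is an nc x mc column-major matrix (i.e. C[m * nc + n]) with per-channel bias.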
  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY


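// Each wrapper below binds one micro-kernel and its tile parameters to the generic benchmarks above;
// BENCHMARK_GEMM then registers it over the GEMM shapes defined in bench/gemm.h. The wrappers are
// grouped by the architectures and ISA extensions they require.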
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }
  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
  }
  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1);
  }
  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1);
  }
  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1);
  }
  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)

#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
  }

  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
  }

  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
  }

  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
  }

  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
  }

  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }

  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1);
  }

  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1);
  }

  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1);
  }

  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1);
  }

  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1);
  }

  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1);
  }

  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1);
  }

  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
  static void f32_gemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void f32_gemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void f32_gemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void f32_gemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void f32_ppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void f32_ppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(f32_gemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__psimd_splat)
  BENCHMARK_GEMM(f32_gemm_4x8s4__psimd)
  BENCHMARK_GEMM(f32_gemm_6x8s4__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__psimd)
#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS

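// Scalar micro-kernels are the portable fallback and are benchmarked on every architecture.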
static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)


#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

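// Defining XNNPACK_BENCHMARK_NO_MAIN suppresses the benchmark's own main(), presumably so that several
// benchmark translation units can be linked into one binary that supplies its own entry point.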
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif