// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "tensorflow/lite/experimental/ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>
#include <xnnpack/requantization.h>


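// Benchmarks a single f32 GEMM micro-kernel: the weights are packed once with
// xnn_pack_f32_gemm_goi_w, then the micro-kernel is called on successive MR-high
// row panels of the MxNxK problem given by the benchmark arguments.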
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::roundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

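  // Allocate enough copies of the packed weights W and the output C that the combined
  // working set exceeds the largest cache; rotating through them in the timed loop
  // keeps W and C out of cache between iterations.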
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

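  // Clamp bounds of -inf/+inf effectively disable output clamping in the micro-kernel.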
  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

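  // Report the average CPU frequency and the achieved FLOP/s rate: each MxNxK GEMM
  // performs 2*M*N*K floating-point operations (one multiply and one add per MAC).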
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

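// Benchmarks an f32 PPMM (pre-packed GEMM) micro-kernel in single-pass mode: each
// MR-high panel of A is packed into the temporary buffer t with the x32 packx
// micro-kernel immediately before the PPMM micro-kernel consumes it.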
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

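    // Pack the current MR-high panel of A into t, then run the PPMM micro-kernel on it.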
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

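// Benchmarks an f32 PPMM micro-kernel in two-pass mode: the whole A matrix is packed
// first, then the PPMM micro-kernel is called on every pre-packed panel.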
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::roundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

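    // First pass: pack all MR-high panels of A into t. Second pass: run the PPMM
    // micro-kernel on each packed panel.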
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_RUY
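// Benchmarks TFLite's ruy library on the same problem sizes, as a baseline for the
// XNNPACK micro-kernels above.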
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY


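// The wrappers below bind each micro-kernel to its MR/NR/KR/SR tile parameters and
// register it via the BENCHMARK_GEMM macro; names follow the pattern
// sgemm_<MR>x<NR>[s<SR>]__<architecture>_<variant> (sppmm_* for the pre-packed PPMM kernels).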
#if XNN_ARCH_ARM64
  static void sgemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void sgemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void sgemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }

  BENCHMARK_GEMM(sgemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld128)
#endif  // XNN_ARCH_ARM64

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void sgemm_4x12__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neon_ld64, 4, 12, 1, 1);
  }

  static void sgemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);
  }

  static void sgemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
  }

  static void sgemm_4x12__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neonfma_ld64, 4, 12, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
  }

  static void sppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  static void sppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x12__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x12__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neonfma_ld64)

  BENCHMARK_GEMM(sppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(sppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void sgemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void sgemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void sgemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }

  static void sgemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void sppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_1x8__sse_load1)
  BENCHMARK_GEMM(sgemm_4x8__sse_load1)
  BENCHMARK_GEMM(sgemm_1x8__sse_dup)
  BENCHMARK_GEMM(sgemm_4x8__sse_dup)
  BENCHMARK_GEMM(sgemm_1x8s4__sse)
  BENCHMARK_GEMM(sgemm_4x8s4__sse)
  BENCHMARK_GEMM(sppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(sppmm_4x8_twopass__sse)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
  static void sgemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void sgemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void sgemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void sgemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void sppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_4x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_4x8s4__psimd)
  BENCHMARK_GEMM(sgemm_6x8s4__psimd)
  BENCHMARK_GEMM(sppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(sppmm_4x8_twopass__psimd)
#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS

static void sgemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void sgemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void sgemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void sppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void sppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(sgemm_1x4__scalar)
BENCHMARK_GEMM(sgemm_2x4__scalar)
BENCHMARK_GEMM(sgemm_4x4__scalar)

BENCHMARK_GEMM(sppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(sppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_3x3_twopass__scalar)


#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif