// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include "tensorflow/lite/experimental/ruy/ruy.h"
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>
#include <xnnpack/requantization.h>


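// Benchmarks a single f32 GEMM micro-kernel on the MxNxK problem given by state.range(0..2).
// The mr/nr/kr/sr arguments describe the micro-kernel's output tile and packing layout and must
// match the micro-kernel passed in. Weights are packed once before the timed loop; the loop then
// cycles through enough packed-weight/output buffer copies to exceed the last-level cache, so W
// and C are always accessed cold while A is prefetched into L1.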
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::roundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

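// Benchmarks a unipass packed-panel matrix multiplication (PPMM): within the timed loop, the
// packx micro-kernel packs up to mr rows of A into a small temporary panel, and the ppmm
// micro-kernel immediately consumes that panel, so the measured time covers both packing and
// multiplication for each row block.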
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

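// Benchmarks a two-pass PPMM: a first loop packs the entire A matrix into a temporary buffer,
// and a second loop runs the ppmm micro-kernel over the pre-packed panels. Compared to the
// unipass variant this needs an mc_stride * kc temporary, but keeps the multiplication loop free
// of packing work.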
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::roundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

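// Baseline benchmark using the ruy matrix multiplication library from TensorFlow Lite. The weight
// matrix is passed as ruy's row-major LHS and the activations as a column-major RHS, so ruy
// effectively computes the transposed product of the same mc x nc x kc GEMM. Circular buffers keep
// the weights, bias, and output out of cache between iterations, matching the micro-kernel
// benchmarks above.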
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}


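// Each wrapper below binds one micro-kernel to the matching benchmark harness; the name encodes
// the output tile (MRxNR) and the instruction-set or micro-architecture variant it targets.
// BENCHMARK_GEMM (from bench/gemm.h) registers each wrapper as a Google Benchmark over a set of
// GEMM problem sizes.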
#if XNN_ARCH_ARM64
  static void sgemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void sgemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void sgemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }

  BENCHMARK_GEMM(sgemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld128)
#endif // XNN_ARCH_ARM64

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void sgemm_4x12__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neon_ld64, 4, 12, 1, 1);
  }

  static void sgemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);
  }

  static void sgemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
  }

  static void sgemm_4x12__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neonfma_ld64, 4, 12, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
  }

  static void sppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  static void sppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x12__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x12__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neonfma_ld64)

  BENCHMARK_GEMM(sppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(sppmm_4x8_twopass__neonfma)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void sgemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void sgemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void sgemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }

  static void sgemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void sppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_1x8__sse_load1)
  BENCHMARK_GEMM(sgemm_4x8__sse_load1)
  BENCHMARK_GEMM(sgemm_1x8__sse_dup)
  BENCHMARK_GEMM(sgemm_4x8__sse_dup)
  BENCHMARK_GEMM(sgemm_1x8s4__sse)
  BENCHMARK_GEMM(sgemm_4x8s4__sse)
  BENCHMARK_GEMM(sppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(sppmm_4x8_twopass__sse)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
  static void sgemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void sgemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void sgemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void sgemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void sppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_4x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_4x8s4__psimd)
  BENCHMARK_GEMM(sgemm_6x8s4__psimd)
  BENCHMARK_GEMM(sppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(sppmm_4x8_twopass__psimd)
#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS

static void sgemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void sgemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void sgemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void sppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void sppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(sgemm_1x4__scalar)
BENCHMARK_GEMM(sgemm_2x4__scalar)
BENCHMARK_GEMM(sgemm_4x4__scalar)

BENCHMARK_GEMM(sppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(sppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_3x3_twopass__scalar)


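// Single-threaded ruy baseline, registered over the same GEMM shapes for comparison with the
// XNNPACK micro-kernels above.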
BENCHMARK_GEMM(ruy_st)

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif