// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include "tensorflow/lite/experimental/ruy/ruy.h"
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>
#include <xnnpack/requantization.h>

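// Benchmarks a single-threaded f32 GEMM microkernel that computes the output in
// mr x nr tiles. kr and sr describe the layout of the packed weights along the
// reduction (K) dimension and must match the microkernel under test.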
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::roundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

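  // Size the packed-weight (W) and output (C) buffers so that rotating through
  // num_buffers copies exceeds the largest data cache; each timed iteration then
  // reads cold W and writes cold C. The extra nc_stride elements hold the bias.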
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

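  // Report the measured CPU frequency and the effective throughput; a GEMM
  // performs 2 * mc * nc * kc floating-point operations (one multiply and one
  // add per element of the reduction).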
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

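// Benchmarks an f32 PPMM (pre-packed matrix-matrix multiply) microkernel in
// unipass mode: each mr-row slice of A is packed with the packx microkernel into
// the temporary buffer t immediately before the matching ppmm call, so the
// packing cost is included in the measured time.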
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

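// Benchmarks an f32 PPMM microkernel in two-pass mode: all of A is packed into
// the temporary buffer t first, and only then is the ppmm microkernel run over
// the packed data, separating packing traffic from the multiply loop.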
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::roundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

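// Reference benchmark against the Ruy library (bundled with TensorFlow Lite) on
// the same problem sizes. The weight matrix K is passed as Ruy's left-hand
// operand and A as the right-hand operand, so Ruy computes the nc x mc product
// K * A plus bias.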
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits, otherwise L2, L3, etc)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}

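// Per-architecture microkernel registrations. Each wrapper's name encodes the
// tile size (e.g. 6x8), the instruction set (neon, neonfma, sse, psimd, scalar)
// and, where applicable, the core or load width the variant was tuned for.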
#if CPUINFO_ARCH_ARM64
  static void sgemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void sgemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void sgemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }

  BENCHMARK_GEMM(sgemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld128)
#endif // CPUINFO_ARCH_ARM64

#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
  static void sgemm_4x12__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neon_ld64, 4, 12, 1, 1);
  }

  static void sgemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);
  }

  static void sgemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
  }

  static void sgemm_4x12__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neonfma_ld64, 4, 12, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
  }

  static void sppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  static void sppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x12__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x12__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neonfma_ld64)

  BENCHMARK_GEMM(sppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(sppmm_4x8_twopass__neonfma)
#endif // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64

#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
  static void sgemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void sgemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void sgemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }

  static void sgemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void sppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_1x8__sse_load1)
  BENCHMARK_GEMM(sgemm_4x8__sse_load1)
  BENCHMARK_GEMM(sgemm_1x8__sse_dup)
  BENCHMARK_GEMM(sgemm_4x8__sse_dup)
  BENCHMARK_GEMM(sgemm_1x8s4__sse)
  BENCHMARK_GEMM(sgemm_4x8s4__sse)
  BENCHMARK_GEMM(sppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(sppmm_4x8_twopass__sse)
#endif // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64

#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
  static void sgemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void sgemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void sgemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void sgemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void sppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_4x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_4x8s4__psimd)
  BENCHMARK_GEMM(sgemm_6x8s4__psimd)
  BENCHMARK_GEMM(sppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(sppmm_4x8_twopass__psimd)
#endif // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS

static void sgemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void sgemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void sgemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void sppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void sppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(sgemm_1x4__scalar)
BENCHMARK_GEMM(sgemm_2x4__scalar)
BENCHMARK_GEMM(sgemm_4x4__scalar)

BENCHMARK_GEMM(sppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(sppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_3x3_twopass__scalar)


BENCHMARK_GEMM(ruy_st)

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif