// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include "tensorflow/lite/experimental/ruy/ruy.h"
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>
#include <xnnpack/requantization.h>

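// Benchmarks a single f32 GEMM micro-kernel over the M/N/K problem sizes
// supplied via the benchmark arguments (state.range(0..2)). mr and nr are the
// rows and columns of the micro-kernel's output tile; kr and sr select the
// packing variant the kernel expects along the K dimension (e.g. the s4
// kernels below use sr = 4).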
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::roundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

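  // The packed weights hold an nc_stride x kc_stride block of kernel elements
  // plus nc_stride bias elements. Enough copies are allocated that cycling
  // through them touches more memory than the largest cache, so every timed
  // iteration sees cold weights and a cold output buffer.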
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

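    // Each packed-weights copy occupies w_elements = nc_stride * (kc_stride + 1)
    // floats, hence the buffer_index stride below.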
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

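  // A dense MxNxK matrix multiplication performs 2*M*N*K floating-point
  // operations: one multiply and one add per multiply-accumulate.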
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

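// Benchmarks a PPMM micro-kernel (GEMM with a pre-packed left-hand side) in
// single-pass (unipass) mode: each mr x kc panel of A is packed by the packx
// micro-kernel right before the matching ppmm call, so the temporary panel t
// is consumed while still hot in cache.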
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

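// Benchmarks the same PPMM micro-kernels in two-pass (twopass) mode: a first
// pass packs all of A into the temporary buffer t, then a second pass sweeps
// the ppmm micro-kernel over the pre-packed panels.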
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::roundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::roundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 32>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data());
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_output_params output_params =
    xnn_compute_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &output_params);
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

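// Benchmarks ruy, the GEMM library used by TensorFlow Lite, on the same
// problem sizes as a baseline for the XNNPACK micro-kernels.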
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.max_num_threads = threads;

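  // ruy is given the problem in transposed form: the weights k (nc x kc,
  // row-major) act as the LHS and the activations a (kc x mc, column-major)
  // as the RHS, producing an nc x mc column-major output. This is the same
  // M x N x K problem the micro-kernel benchmarks above measure.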
  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
  ruy_b.data = a.data();
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);

  ruy::BasicSpec<float, float> spec;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens
  // during the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance,
  // and keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.data = k.data();
      ruy_c.data = c.data();
      spec.bias = b.data();

      ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.data = k.data() + buffer_index * nc * kc;
    ruy_c.data = c.data() + buffer_index * mc * nc;
    spec.bias = b.data() + buffer_index * nc;

    ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}

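// BENCHMARK_GEMM (from bench/gemm.h) registers each function below over a set
// of GEMM shapes; the otherwise-unused `net` parameter presumably names the
// network a given shape comes from.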
#if XNN_ARCH_ARM64
  static void sgemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
  }
  static void sgemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
  }
  static void sgemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
  }
  static void sgemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
  }
  static void sgemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
  }
  static void sgemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
  }

  BENCHMARK_GEMM(sgemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a57)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld128)
#endif  // XNN_ARCH_ARM64

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void sgemm_4x12__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neon_ld64, 4, 12, 1, 1);
  }

  static void sgemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);
  }

  static void sgemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
  }

  static void sgemm_4x12__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neonfma_ld64, 4, 12, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld64, 4, 8, 1, 1);
  }

  static void sgemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld128, 4, 8, 1, 1);
  }

  static void sgemm_5x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_ld64, 5, 8, 1, 1);
  }

  static void sgemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
  }

  static void sppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  static void sppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x12__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x12__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_1x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld128)
  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_5x8__neonfma_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neon_ld64)
  BENCHMARK_GEMM(sgemm_6x8__neonfma_ld64)

  BENCHMARK_GEMM(sppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(sppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void sgemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
  }

  static void sgemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
  }

  static void sgemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
  }

  static void sgemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
  }

  static void sgemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  static void sppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_1x8__sse_load1)
  BENCHMARK_GEMM(sgemm_4x8__sse_load1)
  BENCHMARK_GEMM(sgemm_1x8__sse_dup)
  BENCHMARK_GEMM(sgemm_4x8__sse_dup)
  BENCHMARK_GEMM(sgemm_1x8s4__sse)
  BENCHMARK_GEMM(sgemm_4x8s4__sse)
  BENCHMARK_GEMM(sppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(sppmm_4x8_twopass__sse)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
  static void sgemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
  }

  static void sgemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
  }

  static void sgemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
  }

  static void sgemm_4x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
  }

  static void sgemm_6x8s4__psimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
  }

  static void sppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  static void sppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
  }

  BENCHMARK_GEMM(sgemm_4x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_loadsplat)
  BENCHMARK_GEMM(sgemm_4x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_6x8__psimd_splat)
  BENCHMARK_GEMM(sgemm_4x8s4__psimd)
  BENCHMARK_GEMM(sgemm_6x8s4__psimd)
  BENCHMARK_GEMM(sppmm_4x8_unipass__psimd)
  BENCHMARK_GEMM(sppmm_4x8_twopass__psimd)
#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS

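// Scalar micro-kernels have no architecture requirements and are benchmarked
// unconditionally as a portable baseline.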
static void sgemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
}

static void sgemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
}

static void sgemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
}

static void sppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

static void sppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
}

static void sppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
}

static void sppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
}

static void sppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
}

BENCHMARK_GEMM(sgemm_1x4__scalar)
BENCHMARK_GEMM(sgemm_2x4__scalar)
BENCHMARK_GEMM(sgemm_4x4__scalar)

BENCHMARK_GEMM(sppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(sppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(sppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(sppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(sppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(sppmm_3x3_twopass__scalar)


BENCHMARK_GEMM(ruy_st)

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif