Blame - bench/f32-spmm.cc - platform/external/XNNPACK

blob: 2249ca0695ec8eeadb2847d58755e31a200f5dd6 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	1	// Copyright 2019 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/spmm.h>
				23
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	24
				25	static void SpMMBenchmark(benchmark::State& state,
				26	xnn_f32_spmm_ukernel_function spmm, uint32_t mr, uint32_t nr, float sparsity)
				27	{
				28	if (!cpuinfo_initialize()) {
				29	state.SkipWithError("cpuinfo initialization failed");
				30	return;
				31	}
				32
				33	const size_t mc = state.range(0);
				34	const size_t nc = state.range(1);
				35	const size_t kc = state.range(2);
				36
				37	std::random_device random_device;
				38	auto rng = std::mt19937(random_device());
				39	auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);
				40
				41	// if using blocks, generate the reduced matrix first and then extrude along
				42	// the block dimension (n), to get the full matrix
Marat Dukhan	b8ab4cb	2019-10-03 15:08:04 -0700	[diff] [blame]	43	size_t ncols = nc / nr + nc % nr;
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	44	std::vector<float> b(ncols * kc);
				45	std::vector<float> bias(nc);
				46	std::vector<float> w;
				47	std::vector<uint32_t> nmap;
				48	std::vector<int32_t> dmap;
				49	const size_t sparse_end = std::min(size_t(float(b.size()) * sparsity), b.size());
				50	const size_t num_nonzeroes = nr * (b.size() - sparse_end);
				51
				52	const size_t w_elements = num_nonzeroes + nc;
				53	const size_t c_elements = mc * nc;
				54	const size_t dmap_elements = num_nonzeroes / nr;
				55	const size_t nmap_elements = nc;
				56	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	57	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	58	sizeof(float) * (w_elements + c_elements) + sizeof(uint32_t) * (dmap_elements + nmap_elements));
				59
				60	// Micro-kernel can access one element beyond w and dmap for software pipelining.
				61	w.reserve(num_buffers * w_elements + 1);
				62	dmap.reserve(num_buffers * dmap_elements + 1);
				63	nmap.resize(num_buffers * nmap_elements);
				64
				65	std::vector<size_t> a_offsets(num_buffers);
				66
				67	for (size_t buffer_index = 0; buffer_index < num_buffers; buffer_index++) {
				68	// Re-generate weights. Note: each re-generation produces the number of non-zeroes.
				69	std::fill(b.begin(), b.begin() + sparse_end, 0.0f);
				70	std::generate(b.begin() + sparse_end, b.end(), std::ref(f32rng));
				71	std::shuffle(b.begin(), b.end(), rng);
				72	std::generate(bias.begin(), bias.end(), std::ref(f32rng));
				73
				74	uint32_t first_j = 0, last_j = 0;
				75	bool is_first_nonzero = true;
				76	for (uint32_t i = 0; i < nc / nr; i++) {
				77	for (uint32_t n = 0; n < nr; n++)
				78	w.push_back(bias[nr * i + n]);
				79	for (uint32_t j = 0; j < kc; j++) {
				80	if (b[i * kc + j] != 0.0f) {
				81	for (size_t l = 0; l < nr; l++)
				82	w.push_back(b[i * kc + j] + static_cast<float>(i));
				83	if (is_first_nonzero) {
				84	first_j = j;
				85	} else {
				86	const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(float));
				87	dmap.push_back(increment);
				88	}
				89	last_j = j;
				90	is_first_nonzero = false;
				91	nmap[buffer_index * nmap_elements + i] += 1;
				92	}
				93	}
				94	}
				95	for (uint32_t i = nc / nr; i < ncols; i++) {
				96	w.push_back(bias[i]);
				97	for (uint32_t j = 0; j < kc; j++) {
				98	if (b[i * kc + j] != 0.0f) {
				99	w.push_back(b[i * kc + j]);
				100	if (is_first_nonzero) {
				101	first_j = j;
				102	} else {
				103	const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(float));
				104	dmap.push_back(increment);
				105	}
				106	last_j = j;
				107	is_first_nonzero = false;
				108	nmap[buffer_index * nmap_elements + i] += 1;
				109	}
				110	}
				111	}
				112	{
				113	const ptrdiff_t increment = int32_t(first_j - last_j) * int32_t(mc) * int32_t(sizeof(float));
				114	dmap.push_back(increment);
				115	}
				116
				117	a_offsets[buffer_index] = first_j * mc;
				118	}
				119
				120	// Micro-kernel can access one element beyond w and dmap for software pipelining.
				121	w.resize(w.size() + 1);
				122	dmap.resize(dmap.size() + 1);
				123
				124	std::vector<float, AlignedAllocator<float, 64>> a(kc * mc);
				125	std::vector<float, AlignedAllocator<float, 64>> c(num_buffers * c_elements);
				126
				127	std::generate(a.begin(), a.end(), std::ref(f32rng));
				128	std::fill(c.begin(), c.end(), nanf(""));
				129
				130	xnn_f32_output_params output_params =
Marat Dukhan	eeaa7bd	2019-10-25 17:31:25 -0700	[diff] [blame]	131	xnn_init_f32_output_params(-std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	132
				133	size_t buffer_index = 0;
				134	for (auto _ : state) {
				135	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				136	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				137	// - W, Kmap, and Nmap is not in cache (for any cache level)
				138	// - C is not in cache (for any cache level)
				139	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	140	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	141	buffer_index = (buffer_index + 1) % num_buffers;
				142	state.ResumeTiming();
				143
				144	spmm(mc, nc,
				145	a.data() + a_offsets[buffer_index],
				146	w.data() + buffer_index * w_elements,
				147	dmap.data() + buffer_index * dmap_elements,
				148	nmap.data() + buffer_index * nmap_elements,
				149	c.data() + buffer_index * c_elements,
				150	&output_params);
				151	}
				152
				153	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
				154	state.counters["FLOPS"] = benchmark::Counter(
				155	uint64_t(state.iterations()) * 2 * mc * num_nonzeroes, benchmark::Counter::kIsRate);
				156
				157	state.counters["EffFLOPS"] = benchmark::Counter(
				158	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				159	}
				160
				161
Marat Dukhan	1dadbf7	2019-10-01 10:46:20 -0700	[diff] [blame]	162	#if XNN_ARCH_ARM64
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	163	static void spmm80_4x1__neonfma(benchmark::State& state, const char* net) {
				164	SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__neonfma, 4, 1, 0.8f);
				165	}
				166	static void spmm80_4x2__neonfma(benchmark::State& state, const char* net) {
				167	SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x2__neonfma, 4, 2, 0.8f);
				168	}
				169	static void spmm80_4x4__neonfma(benchmark::State& state, const char* net) {
				170	SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x4__neonfma, 4, 4, 0.8f);
				171	}
				172
				173	static void spmm80_8x1__neonfma(benchmark::State& state, const char* net) {
				174	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x1__neonfma, 8, 1, 0.8f);
				175	}
				176
				177	static void spmm80_8x2__neonfma(benchmark::State& state, const char* net) {
				178	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x2__neonfma, 8, 2, 0.8f);
				179	}
				180
				181	static void spmm80_8x4__neonfma(benchmark::State& state, const char* net) {
				182	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x4__neonfma, 8, 4, 0.8f);
				183	}
				184
				185	static void spmm80_12x1__neonfma(benchmark::State& state, const char* net) {
				186	SpMMBenchmark(state, xnn_f32_spmm_ukernel_12x1__neonfma, 12, 1, 0.8f);
				187	}
				188
				189	static void spmm80_12x2__neonfma(benchmark::State& state, const char* net) {
				190	SpMMBenchmark(state, xnn_f32_spmm_ukernel_12x2__neonfma, 12, 2, 0.8f);
				191	}
				192
				193	static void spmm80_12x4__neonfma(benchmark::State& state, const char* net) {
				194	SpMMBenchmark(state, xnn_f32_spmm_ukernel_12x4__neonfma, 12, 4, 0.8f);
				195	}
				196
				197	static void spmm80_16x1__neonfma(benchmark::State& state, const char* net) {
				198	SpMMBenchmark(state, xnn_f32_spmm_ukernel_16x1__neonfma, 16, 1, 0.8f);
				199	}
				200
				201	static void spmm80_16x2__neonfma(benchmark::State& state, const char* net) {
				202	SpMMBenchmark(state, xnn_f32_spmm_ukernel_16x2__neonfma, 16, 2, 0.8f);
				203	}
				204
				205	static void spmm80_16x4__neonfma(benchmark::State& state, const char* net) {
				206	SpMMBenchmark(state, xnn_f32_spmm_ukernel_16x4__neonfma, 16, 4, 0.8f);
				207	}
				208
				209	static void spmm80_4x1__neonfma_unroll2(benchmark::State& state, const char* net) {
				210	SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__neonfma_unroll2, 4, 1, 0.8f);
				211	}
				212
				213	static void spmm80_8x1__neonfma_unroll2(benchmark::State& state, const char* net) {
				214	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x1__neonfma_unroll2, 8, 1, 0.8f);
				215	}
				216
				217	static void spmm80_16x1__neonfma_unroll2(benchmark::State& state, const char* net) {
				218	SpMMBenchmark(state, xnn_f32_spmm_ukernel_16x1__neonfma_unroll2, 16, 1, 0.8f);
				219	}
				220
				221	static void spmm80_4x1__neonfma_pipelined(benchmark::State& state, const char* net) {
				222	SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__neonfma_pipelined, 4, 1, 0.8f);
				223	}
				224
				225	static void spmm80_8x1__neonfma_pipelined(benchmark::State& state, const char* net) {
				226	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x1__neonfma_pipelined, 8, 1, 0.8f);
				227	}
				228
				229	static void spmm80_16x1__neonfma_pipelined(benchmark::State& state, const char* net) {
				230	SpMMBenchmark(state, xnn_f32_spmm_ukernel_16x1__neonfma_pipelined, 16, 1, 0.8f);
				231	}
				232
				233	BENCHMARK_GEMM(spmm80_4x1__neonfma)
				234	BENCHMARK_GEMM(spmm80_4x2__neonfma)
				235	BENCHMARK_GEMM(spmm80_4x4__neonfma)
				236	BENCHMARK_GEMM(spmm80_8x1__neonfma)
				237	BENCHMARK_GEMM(spmm80_8x2__neonfma)
				238	BENCHMARK_GEMM(spmm80_8x4__neonfma)
				239	BENCHMARK_GEMM(spmm80_12x1__neonfma)
				240	BENCHMARK_GEMM(spmm80_12x2__neonfma)
				241	BENCHMARK_GEMM(spmm80_12x4__neonfma)
				242	BENCHMARK_GEMM(spmm80_16x1__neonfma)
				243	BENCHMARK_GEMM(spmm80_16x2__neonfma)
				244	BENCHMARK_GEMM(spmm80_16x4__neonfma)
				245	BENCHMARK_GEMM(spmm80_4x1__neonfma_unroll2)
				246	BENCHMARK_GEMM(spmm80_8x1__neonfma_unroll2)
				247	BENCHMARK_GEMM(spmm80_16x1__neonfma_unroll2)
				248	BENCHMARK_GEMM(spmm80_4x1__neonfma_pipelined)
				249	BENCHMARK_GEMM(spmm80_8x1__neonfma_pipelined)
				250	BENCHMARK_GEMM(spmm80_16x1__neonfma_pipelined)
Marat Dukhan	1dadbf7	2019-10-01 10:46:20 -0700	[diff] [blame]	251	#endif // XNN_ARCH_ARM64
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	252
// x86 / x86-64 section. The condition uses a real logical OR; the escaped
// form `\|\|` is not a valid preprocessor expression.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Runs SpMMBenchmark on xnn_f32_spmm_ukernel_4x1__sse with mr=4, nr=1 and 80%
// of the generated weights zeroed. `net` is not read by this wrapper.
static void spmm80_4x1__sse(benchmark::State& state, const char* net) {
  SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__sse, 4, 1, 0.8f);
}

// Runs SpMMBenchmark on xnn_f32_spmm_ukernel_8x1__sse with mr=8, nr=1 and 80%
// of the generated weights zeroed. `net` is not read by this wrapper.
static void spmm80_8x1__sse(benchmark::State& state, const char* net) {
  SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x1__sse, 8, 1, 0.8f);
}

BENCHMARK_GEMM(spmm80_4x1__sse)
BENCHMARK_GEMM(spmm80_8x1__sse)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	265
				266	static void spmm80_1x1__scalar(benchmark::State& state, const char* net) {
				267	SpMMBenchmark(state, xnn_f32_spmm_ukernel_1x1__scalar, 1, 1, 0.8f);
				268	}
				269
				270	static void spmm80_2x1__scalar(benchmark::State& state, const char* net) {
				271	SpMMBenchmark(state, xnn_f32_spmm_ukernel_2x1__scalar, 2, 1, 0.8f);
				272	}
				273
				274	static void spmm80_4x1__scalar(benchmark::State& state, const char* net) {
				275	SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__scalar, 4, 1, 0.8f);
				276	}
				277
				278	static void spmm80_8x1__scalar(benchmark::State& state, const char* net) {
				279	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x1__scalar, 8, 1, 0.8f);
				280	}
				281
Erich Elsen	c6afd9b	2019-10-24 16:10:53 -0700	[diff] [blame]	282	static void spmm80_8x2__scalar(benchmark::State& state, const char* net) {
				283	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x2__scalar, 8, 2, 0.8f);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	284	}
				285
Erich Elsen	c6afd9b	2019-10-24 16:10:53 -0700	[diff] [blame]	286	static void spmm80_8x4__scalar(benchmark::State& state, const char* net) {
				287	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x4__scalar, 8, 4, 0.8f);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	288	}
				289
				290	static void spmm80_1x1__scalar_pipelined(benchmark::State& state, const char* net) {
				291	SpMMBenchmark(state, xnn_f32_spmm_ukernel_1x1__scalar_pipelined, 1, 1, 0.8f);
				292	}
				293
				294	static void spmm80_2x1__scalar_pipelined(benchmark::State& state, const char* net) {
				295	SpMMBenchmark(state, xnn_f32_spmm_ukernel_2x1__scalar_pipelined, 2, 1, 0.8f);
				296	}
				297
				298	static void spmm80_4x1__scalar_pipelined(benchmark::State& state, const char* net) {
				299	SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__scalar_pipelined, 4, 1, 0.8f);
				300	}
				301
				302	static void spmm80_8x1__scalar_pipelined(benchmark::State& state, const char* net) {
				303	SpMMBenchmark(state, xnn_f32_spmm_ukernel_8x1__scalar_pipelined, 8, 1, 0.8f);
				304	}
				305
				306	BENCHMARK_GEMM(spmm80_1x1__scalar)
				307	BENCHMARK_GEMM(spmm80_2x1__scalar)
				308	BENCHMARK_GEMM(spmm80_4x1__scalar)
				309	BENCHMARK_GEMM(spmm80_8x1__scalar)
Erich Elsen	c6afd9b	2019-10-24 16:10:53 -0700	[diff] [blame]	310	BENCHMARK_GEMM(spmm80_8x2__scalar)
				311	BENCHMARK_GEMM(spmm80_8x4__scalar)
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	312	BENCHMARK_GEMM(spmm80_1x1__scalar_pipelined)
				313	BENCHMARK_GEMM(spmm80_2x1__scalar_pipelined)
				314	BENCHMARK_GEMM(spmm80_4x1__scalar_pipelined)
				315	BENCHMARK_GEMM(spmm80_8x1__scalar_pipelined)
				316
				317	#ifndef XNNPACK_BENCHMARK_NO_MAIN
				318	BENCHMARK_MAIN();
				319	#endif