Blame - bench/q8-gemm.cc - platform/external/XNNPACK

blob: 7056ae31583db2f77493bd0b12513fb21e238780 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#include <algorithm>
				10	#include <cfloat>
				11	#include <chrono>
				12	#include <cmath>
				13	#include <functional>
				14	#include <mutex>
				15	#include <random>
				16	#include <vector>
				17
				18	#include <cpuinfo.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	19
				20	#include <benchmark/benchmark.h>
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	21	#ifdef BENCHMARK_GEMMLOWP
				22	#include "gemmlowp/public/gemmlowp.h"
				23	#endif // BENCHMARK_GEMMLOWP
				24	#ifdef BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	25	#include "tensorflow/lite/experimental/ruy/ruy.h"
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	26	#endif // BENCHMARK_RUY
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	27	#include "bench/gemm.h"
				28	#include "bench/utils.h"
				29	#include <xnnpack/AlignedAllocator.h>
Marat Dukhan	1dadbf7	2019-10-01 10:46:20 -0700	[diff] [blame]	30	#include <xnnpack/common.h>
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	31	#include <xnnpack/gemm.h>
				32	#include <xnnpack/pack.h>
Marat Dukhan	eeaa7bd	2019-10-25 17:31:25 -0700	[diff] [blame]	33	#include <xnnpack/params-init.h>
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	34	#include <xnnpack/params.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	35
				36
				37	static void GEMMBenchmark(benchmark::State& state,
				38	xnn_q8_gemm_ukernel_function q8gemm,
				39	size_t mr, size_t nr, size_t kr)
				40	{
				41	if (!cpuinfo_initialize()) {
				42	state.SkipWithError("cpuinfo initialization failed");
				43	return;
				44	}
				45
				46	const size_t mc = state.range(0);
				47	const size_t nc = state.range(1);
				48	const size_t kc = state.range(2);
				49
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	50	const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
				51	const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	52
				53	std::random_device random_device;
				54	auto rng = std::mt19937(random_device());
				55	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				56	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				57
				58	std::vector<uint8_t> a(mc * kc);
				59	std::generate(a.begin(), a.end(), std::ref(u8rng));
				60	std::vector<uint8_t> k(nc * kc);
				61	std::generate(k.begin(), k.end(), std::ref(u8rng));
				62	std::vector<int32_t> b(nc);
				63	std::generate(b.begin(), b.end(), std::ref(s32rng));
				64
				65	const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
				66	const size_t c_elements = mc * nc;
				67	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	68	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	69	sizeof(uint8_t) * (w_elements + c_elements));
				70
				71	std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> w(w_elements * num_buffers);
				72	std::fill(w.begin(), w.end(), 0);
				73	xnn_pack_q8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, 127, 127, k.data(), b.data(), w.data());
				74	std::vector<uint8_t> c(c_elements * num_buffers);
				75	std::fill(c.begin(), c.end(), 0xA5);
				76
				77	union xnn_q8_gemm_params quantizationParams =
Marat Dukhan	eeaa7bd	2019-10-25 17:31:25 -0700	[diff] [blame]	78	xnn_init_q8_gemm_params(127, 127, 0.75f, 127, 1, 254);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	79
				80	size_t buffer_index = 0;
				81	for (auto _ : state) {
				82	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				83	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				84	// - W is not in cache (for any cache level)
				85	// - C is not in cache (for any cache level)
				86	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	87	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	88	buffer_index = (buffer_index + 1) % num_buffers;
				89	state.ResumeTiming();
				90
				91	for (uint32_t m = 0; m < mc; m += mr) {
				92	const uint32_t mb = min(mc - m, mr);
				93	for (uint32_t n = 0; n < nc; n += nr) {
				94	const uint32_t nb = min(nc - n, nr);
				95	q8gemm(
Marat Dukhan	b186463	2019-11-25 16:34:17 -0800	[diff] [blame]	96	mb, nb, kc * sizeof(uint8_t),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	97	a.data() + m * kc, kc * sizeof(uint8_t),
				98	w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
				99	c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
				100	&quantizationParams);
				101	}
				102	}
				103	}
				104
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	105	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	106	state.counters["OPS"] = benchmark::Counter(
				107	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				108	}
				109
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	110	#ifdef BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	111	struct GemmlowpOutputPipeline {
				112	typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
				113	typedef std::tuple<
				114	gemmlowp::OutputStageBiasAddition<ColVectorMap>,
				115	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
				116	gemmlowp::OutputStageClamp,
				117	gemmlowp::OutputStageSaturatingCastToUint8>
				118	Pipeline;
				119
				120	static Pipeline Make(
				121	const int32_t* bias_data,
				122	int output_rows,
				123	int32_t output_offset,
				124	int32_t output_multiplier,
				125	int output_shift,
				126	int32_t output_activation_min,
				127	int32_t output_activation_max)
				128	{
				129	ColVectorMap bias_vector(bias_data, output_rows);
				130	gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
				131	bias_addition_stage.bias_vector = bias_vector;
				132	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
				133	quantize_down_stage.result_offset_after_shift = output_offset;
				134	quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
				135	quantize_down_stage.result_shift = output_shift;
				136	gemmlowp::OutputStageClamp clamp_stage;
				137	clamp_stage.min = output_activation_min;
				138	clamp_stage.max = output_activation_max;
				139	gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
				140	return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
				141	}
				142	};
				143
				144	static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
				145	{
				146	if (!cpuinfo_initialize()) {
				147	state.SkipWithError("cpuinfo initialization failed");
				148	return;
				149	}
				150
				151	const size_t mc = state.range(0);
				152	const size_t nc = state.range(1);
				153	const size_t kc = state.range(2);
				154
				155	std::random_device random_device;
				156	auto rng = std::mt19937(random_device());
				157	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				158	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				159
				160	std::vector<uint8_t> a(mc * kc);
				161	std::generate(a.begin(), a.end(), std::ref(u8rng));
				162
				163	const size_t kElements = nc * kc;
				164	const size_t bElements = nc;
				165	const size_t c_elements = mc * nc;
				166	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	167	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	168	kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
				169
				170	std::vector<uint8_t> k(kElements * num_buffers);
				171	std::generate(k.begin(), k.end(), std::ref(u8rng));
				172	std::vector<int32_t> b(bElements * num_buffers);
				173	std::generate(b.begin(), b.end(), std::ref(s32rng));
				174	std::vector<uint8_t> c(c_elements * num_buffers);
				175	std::fill(c.begin(), c.end(), 0xA5);
				176
				177	gemmlowp::MultiThreadGemmContext threadingContext;
				178	threadingContext.set_max_num_threads(threads);
				179
				180	size_t buffer_index = 0;
				181	for (auto _ : state) {
				182	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	183	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	184	buffer_index = (buffer_index + 1) % num_buffers;
				185	state.ResumeTiming();
				186
				187	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
				188	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
				189	gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
				190	const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
				191	gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
				192	&threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
				193	}
				194
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	195	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	196	state.counters["OPS"] = benchmark::Counter(
				197	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				198	}
				199
				200	static void gemmlowp_st(benchmark::State& state, const char* net)
				201	{
				202	GemmlowpBenchmark(state, 1);
				203	}
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	204	#endif // BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	205
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	206
				207	#ifdef BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	208	static void RuyBenchmark(benchmark::State& state, size_t threads)
				209	{
				210	const size_t mc = state.range(0);
				211	const size_t nc = state.range(1);
				212	const size_t kc = state.range(2);
				213
				214	std::random_device random_device;
				215	auto rng = std::mt19937(random_device());
				216	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				217	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				218
				219	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	220	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	221	nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
				222
				223	std::vector<uint8_t> a(mc * kc);
				224	std::generate(a.begin(), a.end(), std::ref(u8rng));
				225	std::vector<uint8_t> k(num_buffers * nc * kc);
				226	std::generate(k.begin(), k.end(), std::ref(u8rng));
				227	std::vector<int32_t> b(num_buffers * nc);
				228	std::generate(b.begin(), b.end(), std::ref(s32rng));
				229	std::vector<uint8_t> c(num_buffers * nc * mc);
				230	std::fill(c.begin(), c.end(), std::nanf(""));
				231
				232	// Note: context must be static to avoid the cost of re-creating it for each benchmark.
				233	static ruy::Context context;
				234	context.max_num_threads = threads;
				235
				236	ruy::Matrix<uint8_t> ruy_a;
				237	ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
				238	ruy_a.zero_point = 127;
				239	ruy::Matrix<uint8_t> ruy_b;
				240	ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
				241	ruy_b.data = a.data();
				242	ruy_b.zero_point = 127;
				243	ruy::Matrix<uint8_t> ruy_c;
				244	ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);
				245	ruy_c.zero_point = 127;
				246
				247	ruy::BasicSpec<int32_t, uint8_t> spec;
				248	spec.multiplier_fixedpoint = 0x40000000;
				249
				250	// ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
				251	// the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
				252	// Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
				253	// keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
				254	static std::once_flag warmup;
				255	std::call_once(warmup, [&](){
				256	auto start = std::chrono::steady_clock::now();
				257	do {
				258	ruy_a.data = k.data();
				259	ruy_c.data = c.data();
				260	spec.bias = b.data();
				261
				262	ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
				263	} while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
				264	});
				265
				266	size_t buffer_index = 0;
				267	for (auto _ : state) {
				268	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				269	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				270	// - K is not in cache (for any cache level)
				271	// - B is not in cache (for any cache level)
				272	// - C is not in cache (for any cache level)
				273	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	274	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	275	buffer_index = (buffer_index + 1) % num_buffers;
				276	state.ResumeTiming();
				277
				278	ruy_a.data = k.data() + buffer_index * nc * kc;
				279	ruy_c.data = c.data() + buffer_index * mc * nc;
				280	spec.bias = b.data() + buffer_index * nc;
				281
				282	ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
				283	}
				284
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	285	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	286	state.counters["OPS"] = benchmark::Counter(
				287	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				288	}
				289
				290	static void ruy_st(benchmark::State& state, const char* net)
				291	{
				292	RuyBenchmark(state, 1);
				293	}
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	294	#endif // BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	295
				296
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Runs GEMMBenchmark on xnn_q8_gemm_ukernel_4x8__neon with mr=4, nr=8, kr=1.
  static void q8gemm_4x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_4x8__neon, 4 /* mr */, 8 /* nr */, 1 /* kr */);
  }

  // Runs GEMMBenchmark on xnn_q8_gemm_ukernel_8x8__neon with mr=8, nr=8, kr=1.
  static void q8gemm_8x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_8x8__neon, 8 /* mr */, 8 /* nr */, 1 /* kr */);
  }

  BENCHMARK_GEMM(q8gemm_4x8__neon)
  BENCHMARK_GEMM(q8gemm_8x8__neon)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
				309
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Runs GEMMBenchmark on xnn_q8_gemm_ukernel_4x4c2__sse2 with mr=4, nr=4, kr=2.
  static void q8gemm_4x4c2__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_4x4c2__sse2, 4 /* mr */, 4 /* nr */, 2 /* kr */);
  }

  // Runs GEMMBenchmark on xnn_q8_gemm_ukernel_2x4c8__sse2 with mr=2, nr=4, kr=8.
  static void q8gemm_2x4c8__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_2x4c8__sse2, 2 /* mr */, 4 /* nr */, 8 /* kr */);
  }

  BENCHMARK_GEMM(q8gemm_4x4c2__sse2)
  BENCHMARK_GEMM(q8gemm_2x4c8__sse2)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
				322
// Register the ruy and gemmlowp baseline benchmarks only when the corresponding
// BENCHMARK_* macro is defined for the build.
#if defined(BENCHMARK_RUY)
BENCHMARK_GEMM(ruy_st)
#endif  // defined(BENCHMARK_RUY)
#if defined(BENCHMARK_GEMMLOWP)
BENCHMARK_GEMM(gemmlowp_st)
#endif  // defined(BENCHMARK_GEMMLOWP)
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	329
				330	#ifndef XNNPACK_BENCHMARK_NO_MAIN
				331	BENCHMARK_MAIN();
				332	#endif