Blame - bench/q8-gemm.cc - platform/external/XNNPACK

blob: 9a150062c1814a6e7345f9fb383b6f2d28ed51b6 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#include <algorithm>
				10	#include <cfloat>
				11	#include <chrono>
				12	#include <cmath>
				13	#include <functional>
				14	#include <mutex>
				15	#include <random>
				16	#include <vector>
				17
				18	#include <cpuinfo.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	19
				20	#include <benchmark/benchmark.h>
				21	#include "third_party/gemmlowp/public/gemmlowp.h"
				22	#include "tensorflow/lite/experimental/ruy/ruy.h"
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	23	#include "bench/gemm.h"
				24	#include "bench/utils.h"
				25	#include <xnnpack/AlignedAllocator.h>
Marat Dukhan	1dadbf7	2019-10-01 10:46:20 -0700	[diff] [blame^]	26	#include <xnnpack/common.h>
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	27	#include <xnnpack/gemm.h>
				28	#include <xnnpack/pack.h>
				29	#include <xnnpack/params.h>
				30	#include <xnnpack/requantization.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	31
				32
				33	static void GEMMBenchmark(benchmark::State& state,
				34	xnn_q8_gemm_ukernel_function q8gemm,
				35	size_t mr, size_t nr, size_t kr)
				36	{
				37	if (!cpuinfo_initialize()) {
				38	state.SkipWithError("cpuinfo initialization failed");
				39	return;
				40	}
				41
				42	const size_t mc = state.range(0);
				43	const size_t nc = state.range(1);
				44	const size_t kc = state.range(2);
				45
				46	const size_t nc_stride = benchmark::utils::roundUp(nc, nr);
				47	const size_t kc_stride = benchmark::utils::roundUp(kc, kr);
				48
				49	std::random_device random_device;
				50	auto rng = std::mt19937(random_device());
				51	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				52	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				53
				54	std::vector<uint8_t> a(mc * kc);
				55	std::generate(a.begin(), a.end(), std::ref(u8rng));
				56	std::vector<uint8_t> k(nc * kc);
				57	std::generate(k.begin(), k.end(), std::ref(u8rng));
				58	std::vector<int32_t> b(nc);
				59	std::generate(b.begin(), b.end(), std::ref(s32rng));
				60
				61	const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
				62	const size_t c_elements = mc * nc;
				63	const size_t num_buffers = 1 +
				64	benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
				65	sizeof(uint8_t) * (w_elements + c_elements));
				66
				67	std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> w(w_elements * num_buffers);
				68	std::fill(w.begin(), w.end(), 0);
				69	xnn_pack_q8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, 127, 127, k.data(), b.data(), w.data());
				70	std::vector<uint8_t> c(c_elements * num_buffers);
				71	std::fill(c.begin(), c.end(), 0xA5);
				72
				73	union xnn_q8_gemm_params quantizationParams =
				74	xnn_compute_q8_gemm_params(127, 127, 0.75f, 127, 1, 254);
				75
				76	size_t buffer_index = 0;
				77	for (auto _ : state) {
				78	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				79	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				80	// - W is not in cache (for any cache level)
				81	// - C is not in cache (for any cache level)
				82	state.PauseTiming();
				83	benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(uint8_t));
				84	buffer_index = (buffer_index + 1) % num_buffers;
				85	state.ResumeTiming();
				86
				87	for (uint32_t m = 0; m < mc; m += mr) {
				88	const uint32_t mb = min(mc - m, mr);
				89	for (uint32_t n = 0; n < nc; n += nr) {
				90	const uint32_t nb = min(nc - n, nr);
				91	q8gemm(
				92	mb, nb, kc,
				93	a.data() + m * kc, kc * sizeof(uint8_t),
				94	w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
				95	c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
				96	&quantizationParams);
				97	}
				98	}
				99	}
				100
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	101	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	102	state.counters["OPS"] = benchmark::Counter(
				103	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				104	}
				105
				106	struct GemmlowpOutputPipeline {
				107	typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
				108	typedef std::tuple<
				109	gemmlowp::OutputStageBiasAddition<ColVectorMap>,
				110	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
				111	gemmlowp::OutputStageClamp,
				112	gemmlowp::OutputStageSaturatingCastToUint8>
				113	Pipeline;
				114
				115	static Pipeline Make(
				116	const int32_t* bias_data,
				117	int output_rows,
				118	int32_t output_offset,
				119	int32_t output_multiplier,
				120	int output_shift,
				121	int32_t output_activation_min,
				122	int32_t output_activation_max)
				123	{
				124	ColVectorMap bias_vector(bias_data, output_rows);
				125	gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
				126	bias_addition_stage.bias_vector = bias_vector;
				127	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
				128	quantize_down_stage.result_offset_after_shift = output_offset;
				129	quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
				130	quantize_down_stage.result_shift = output_shift;
				131	gemmlowp::OutputStageClamp clamp_stage;
				132	clamp_stage.min = output_activation_min;
				133	clamp_stage.max = output_activation_max;
				134	gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
				135	return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
				136	}
				137	};
				138
				139	static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
				140	{
				141	if (!cpuinfo_initialize()) {
				142	state.SkipWithError("cpuinfo initialization failed");
				143	return;
				144	}
				145
				146	const size_t mc = state.range(0);
				147	const size_t nc = state.range(1);
				148	const size_t kc = state.range(2);
				149
				150	std::random_device random_device;
				151	auto rng = std::mt19937(random_device());
				152	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				153	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				154
				155	std::vector<uint8_t> a(mc * kc);
				156	std::generate(a.begin(), a.end(), std::ref(u8rng));
				157
				158	const size_t kElements = nc * kc;
				159	const size_t bElements = nc;
				160	const size_t c_elements = mc * nc;
				161	const size_t num_buffers = 1 +
				162	benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
				163	kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
				164
				165	std::vector<uint8_t> k(kElements * num_buffers);
				166	std::generate(k.begin(), k.end(), std::ref(u8rng));
				167	std::vector<int32_t> b(bElements * num_buffers);
				168	std::generate(b.begin(), b.end(), std::ref(s32rng));
				169	std::vector<uint8_t> c(c_elements * num_buffers);
				170	std::fill(c.begin(), c.end(), 0xA5);
				171
				172	gemmlowp::MultiThreadGemmContext threadingContext;
				173	threadingContext.set_max_num_threads(threads);
				174
				175	size_t buffer_index = 0;
				176	for (auto _ : state) {
				177	state.PauseTiming();
				178	benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(uint8_t));
				179	buffer_index = (buffer_index + 1) % num_buffers;
				180	state.ResumeTiming();
				181
				182	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
				183	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
				184	gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
				185	const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
				186	gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
				187	&threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
				188	}
				189
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	190	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	191	state.counters["OPS"] = benchmark::Counter(
				192	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				193	}
				194
				195	static void gemmlowp_st(benchmark::State& state, const char* net)
				196	{
				197	GemmlowpBenchmark(state, 1);
				198	}
				199
				200	static void RuyBenchmark(benchmark::State& state, size_t threads)
				201	{
				202	const size_t mc = state.range(0);
				203	const size_t nc = state.range(1);
				204	const size_t kc = state.range(2);
				205
				206	std::random_device random_device;
				207	auto rng = std::mt19937(random_device());
				208	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				209	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				210
				211	const size_t num_buffers = 1 +
				212	benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
				213	nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
				214
				215	std::vector<uint8_t> a(mc * kc);
				216	std::generate(a.begin(), a.end(), std::ref(u8rng));
				217	std::vector<uint8_t> k(num_buffers * nc * kc);
				218	std::generate(k.begin(), k.end(), std::ref(u8rng));
				219	std::vector<int32_t> b(num_buffers * nc);
				220	std::generate(b.begin(), b.end(), std::ref(s32rng));
				221	std::vector<uint8_t> c(num_buffers * nc * mc);
				222	std::fill(c.begin(), c.end(), std::nanf(""));
				223
				224	// Note: context must be static to avoid the cost of re-creating it for each benchmark.
				225	static ruy::Context context;
				226	context.max_num_threads = threads;
				227
				228	ruy::Matrix<uint8_t> ruy_a;
				229	ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, &ruy_a.layout);
				230	ruy_a.zero_point = 127;
				231	ruy::Matrix<uint8_t> ruy_b;
				232	ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, &ruy_b.layout);
				233	ruy_b.data = a.data();
				234	ruy_b.zero_point = 127;
				235	ruy::Matrix<uint8_t> ruy_c;
				236	ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, &ruy_c.layout);
				237	ruy_c.zero_point = 127;
				238
				239	ruy::BasicSpec<int32_t, uint8_t> spec;
				240	spec.multiplier_fixedpoint = 0x40000000;
				241
				242	// ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
				243	// the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
				244	// Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
				245	// keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
				246	static std::once_flag warmup;
				247	std::call_once(warmup, [&](){
				248	auto start = std::chrono::steady_clock::now();
				249	do {
				250	ruy_a.data = k.data();
				251	ruy_c.data = c.data();
				252	spec.bias = b.data();
				253
				254	ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
				255	} while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
				256	});
				257
				258	size_t buffer_index = 0;
				259	for (auto _ : state) {
				260	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				261	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				262	// - K is not in cache (for any cache level)
				263	// - B is not in cache (for any cache level)
				264	// - C is not in cache (for any cache level)
				265	state.PauseTiming();
				266	benchmark::utils::prefetchToL1(a.data(), a.size() * sizeof(uint8_t));
				267	buffer_index = (buffer_index + 1) % num_buffers;
				268	state.ResumeTiming();
				269
				270	ruy_a.data = k.data() + buffer_index * nc * kc;
				271	ruy_c.data = c.data() + buffer_index * mc * nc;
				272	spec.bias = b.data() + buffer_index * nc;
				273
				274	ruy::Mul<ruy::kAllPaths>(ruy_a, ruy_b, spec, &context, &ruy_c);
				275	}
				276
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	277	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	278	state.counters["OPS"] = benchmark::Counter(
				279	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				280	}
				281
				282	static void ruy_st(benchmark::State& state, const char* net)
				283	{
				284	RuyBenchmark(state, 1);
				285	}
				286
				287
// NEON micro-kernel benchmarks; compiled only for ARM and ARM64 targets.
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // 4x8 NEON micro-kernel (mr=4, nr=8, kr=1).
  static void q8gemm_4x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_4x8__neon, 4, 8, 1);
  }

  // 8x8 NEON micro-kernel (mr=8, nr=8, kr=1).
  static void q8gemm_8x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_8x8__neon, 8, 8, 1);
  }

  BENCHMARK_GEMM(q8gemm_4x8__neon)
  BENCHMARK_GEMM(q8gemm_8x8__neon)
#endif
				300
// SSE2 micro-kernel benchmarks; compiled only for x86 and x86-64 targets.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // 4x4c2 SSE2 micro-kernel (mr=4, nr=4, kr=2).
  static void q8gemm_4x4c2__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_4x4c2__sse2, 4, 4, 2);
  }

  // 2x4c8 SSE2 micro-kernel (mr=2, nr=4, kr=8).
  static void q8gemm_2x4c8__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_ukernel_2x4c8__sse2, 2, 4, 8);
  }

  BENCHMARK_GEMM(q8gemm_4x4c2__sse2)
  BENCHMARK_GEMM(q8gemm_2x4c8__sse2)
#endif
				313
				314	BENCHMARK_GEMM(gemmlowp_st)
				315	BENCHMARK_GEMM(ruy_st)
				316
				317	#ifndef XNNPACK_BENCHMARK_NO_MAIN
				318	BENCHMARK_MAIN();
				319	#endif
				320