Blame - bench/qu8-gemm.cc - platform/external/XNNPACK

blob: 63198ce0cbc068c4581f2a37f40ada08ed0fc607 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#include <algorithm>
				10	#include <cfloat>
				11	#include <chrono>
				12	#include <cmath>
				13	#include <functional>
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	14	#include <limits>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	15	#include <mutex>
				16	#include <random>
				17	#include <vector>
				18
				19	#include <cpuinfo.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	20
				21	#include <benchmark/benchmark.h>
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	22	#ifdef BENCHMARK_GEMMLOWP
				23	#include "gemmlowp/public/gemmlowp.h"
				24	#endif // BENCHMARK_GEMMLOWP
				25	#ifdef BENCHMARK_RUY
Benoit Jacob	b038fdc	2020-03-25 12:14:20 -0700	[diff] [blame]	26	#include "ruy/ruy.h"
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	27	#endif // BENCHMARK_RUY
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	28	#include "bench/gemm.h"
				29	#include "bench/utils.h"
				30	#include <xnnpack/AlignedAllocator.h>
Marat Dukhan	1dadbf7	2019-10-01 10:46:20 -0700	[diff] [blame]	31	#include <xnnpack/common.h>
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	32	#include <xnnpack/gemm.h>
				33	#include <xnnpack/pack.h>
Marat Dukhan	eeaa7bd	2019-10-25 17:31:25 -0700	[diff] [blame]	34	#include <xnnpack/params-init.h>
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	35	#include <xnnpack/params.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	36
				37
				38	static void GEMMBenchmark(benchmark::State& state,
Marat Dukhan	08b7a97	2020-07-14 18:17:29 -0700	[diff] [blame]	39	xnn_qu8_gemm_ukernel_function gemm,
Marat Dukhan	bc08f31	2020-07-07 16:22:04 -0700	[diff] [blame]	40	size_t mr, size_t nr, size_t kr, size_t sr)
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	41	{
				42	if (!cpuinfo_initialize()) {
				43	state.SkipWithError("cpuinfo initialization failed");
				44	return;
				45	}
				46
				47	const size_t mc = state.range(0);
				48	const size_t nc = state.range(1);
				49	const size_t kc = state.range(2);
				50
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	51	const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
				52	const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	53
				54	std::random_device random_device;
				55	auto rng = std::mt19937(random_device());
				56	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	57	auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	58
				59	std::vector<uint8_t> a(mc * kc);
				60	std::generate(a.begin(), a.end(), std::ref(u8rng));
				61	std::vector<uint8_t> k(nc * kc);
				62	std::generate(k.begin(), k.end(), std::ref(u8rng));
				63	std::vector<int32_t> b(nc);
				64	std::generate(b.begin(), b.end(), std::ref(s32rng));
				65
				66	const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
				67	const size_t c_elements = mc * nc;
				68	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	69	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	70	sizeof(uint8_t) * (w_elements + c_elements));
				71
				72	std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> w(w_elements * num_buffers);
				73	std::fill(w.begin(), w.end(), 0);
Marat Dukhan	08b7a97	2020-07-14 18:17:29 -0700	[diff] [blame]	74	const xnn_qu8_packing_params packing_params = { 127, 127 };
				75	xnn_pack_qu8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), &packing_params);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	76	std::vector<uint8_t> c(c_elements * num_buffers);
				77	std::fill(c.begin(), c.end(), 0xA5);
				78
Marat Dukhan	08b7a97	2020-07-14 18:17:29 -0700	[diff] [blame]	79	union xnn_qu8_gemm_params quantization_params =
Marat Dukhan	9199246	2020-07-30 00:06:34 -0700	[diff] [blame]	80	xnn_init_qu8_gemm_params(127, 0.75f, 127, 1, 254);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	81
				82	size_t buffer_index = 0;
				83	for (auto _ : state) {
				84	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				85	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				86	// - W is not in cache (for any cache level)
				87	// - C is not in cache (for any cache level)
				88	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	89	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	90	buffer_index = (buffer_index + 1) % num_buffers;
				91	state.ResumeTiming();
				92
				93	for (uint32_t m = 0; m < mc; m += mr) {
				94	const uint32_t mb = min(mc - m, mr);
				95	for (uint32_t n = 0; n < nc; n += nr) {
				96	const uint32_t nb = min(nc - n, nr);
Marat Dukhan	08b7a97	2020-07-14 18:17:29 -0700	[diff] [blame]	97	gemm(
Marat Dukhan	b186463	2019-11-25 16:34:17 -0800	[diff] [blame]	98	mb, nb, kc * sizeof(uint8_t),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	99	a.data() + m * kc, kc * sizeof(uint8_t),
				100	w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
				101	c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
Marat Dukhan	b42f866	2020-07-06 20:46:13 -0700	[diff] [blame]	102	&quantization_params);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	103	}
				104	}
				105	}
				106
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	107	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	108	state.counters["OPS"] = benchmark::Counter(
				109	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				110	}
				111
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	112	#ifdef BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	113	struct GemmlowpOutputPipeline {
				114	typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
				115	typedef std::tuple<
				116	gemmlowp::OutputStageBiasAddition<ColVectorMap>,
				117	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
				118	gemmlowp::OutputStageClamp,
				119	gemmlowp::OutputStageSaturatingCastToUint8>
				120	Pipeline;
				121
				122	static Pipeline Make(
				123	const int32_t* bias_data,
				124	int output_rows,
				125	int32_t output_offset,
				126	int32_t output_multiplier,
				127	int output_shift,
				128	int32_t output_activation_min,
				129	int32_t output_activation_max)
				130	{
				131	ColVectorMap bias_vector(bias_data, output_rows);
				132	gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
				133	bias_addition_stage.bias_vector = bias_vector;
				134	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
				135	quantize_down_stage.result_offset_after_shift = output_offset;
				136	quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
				137	quantize_down_stage.result_shift = output_shift;
				138	gemmlowp::OutputStageClamp clamp_stage;
				139	clamp_stage.min = output_activation_min;
				140	clamp_stage.max = output_activation_max;
				141	gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
				142	return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
				143	}
				144	};
				145
				146	static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
				147	{
				148	if (!cpuinfo_initialize()) {
				149	state.SkipWithError("cpuinfo initialization failed");
				150	return;
				151	}
				152
				153	const size_t mc = state.range(0);
				154	const size_t nc = state.range(1);
				155	const size_t kc = state.range(2);
				156
				157	std::random_device random_device;
				158	auto rng = std::mt19937(random_device());
				159	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	160	auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	161
				162	std::vector<uint8_t> a(mc * kc);
				163	std::generate(a.begin(), a.end(), std::ref(u8rng));
				164
				165	const size_t kElements = nc * kc;
				166	const size_t bElements = nc;
				167	const size_t c_elements = mc * nc;
				168	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	169	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	170	kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
				171
				172	std::vector<uint8_t> k(kElements * num_buffers);
				173	std::generate(k.begin(), k.end(), std::ref(u8rng));
				174	std::vector<int32_t> b(bElements * num_buffers);
				175	std::generate(b.begin(), b.end(), std::ref(s32rng));
				176	std::vector<uint8_t> c(c_elements * num_buffers);
				177	std::fill(c.begin(), c.end(), 0xA5);
				178
				179	gemmlowp::MultiThreadGemmContext threadingContext;
				180	threadingContext.set_max_num_threads(threads);
				181
				182	size_t buffer_index = 0;
				183	for (auto _ : state) {
				184	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	185	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	186	buffer_index = (buffer_index + 1) % num_buffers;
				187	state.ResumeTiming();
				188
				189	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
				190	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
				191	gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
				192	const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
				193	gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
				194	&threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
				195	}
				196
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	197	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	198	state.counters["OPS"] = benchmark::Counter(
				199	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				200	}
				201
				202	static void gemmlowp_st(benchmark::State& state, const char* net)
				203	{
				204	GemmlowpBenchmark(state, 1);
				205	}
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	206	#endif // BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	207
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	208
				209	#ifdef BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	210	static void RuyBenchmark(benchmark::State& state, size_t threads)
				211	{
				212	const size_t mc = state.range(0);
				213	const size_t nc = state.range(1);
				214	const size_t kc = state.range(2);
				215
				216	std::random_device random_device;
				217	auto rng = std::mt19937(random_device());
				218	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	219	auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	220
				221	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	222	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	223	nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
				224
				225	std::vector<uint8_t> a(mc * kc);
				226	std::generate(a.begin(), a.end(), std::ref(u8rng));
				227	std::vector<uint8_t> k(num_buffers * nc * kc);
				228	std::generate(k.begin(), k.end(), std::ref(u8rng));
				229	std::vector<int32_t> b(num_buffers * nc);
				230	std::generate(b.begin(), b.end(), std::ref(s32rng));
				231	std::vector<uint8_t> c(num_buffers * nc * mc);
				232	std::fill(c.begin(), c.end(), std::nanf(""));
				233
				234	// Note: context must be static to avoid the cost of re-creating it for each benchmark.
				235	static ruy::Context context;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	236	context.set_max_num_threads(threads);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	237
				238	ruy::Matrix<uint8_t> ruy_a;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	239	ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
				240	ruy_a.set_zero_point(127);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	241	ruy::Matrix<uint8_t> ruy_b;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	242	ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
				243	ruy_b.set_data(a.data());
				244	ruy_b.set_zero_point(127);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	245	ruy::Matrix<uint8_t> ruy_c;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	246	ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
				247	ruy_c.set_zero_point(127);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	248
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	249	ruy::MulParams<int32_t, uint8_t> mul_params;
				250	mul_params.set_multiplier_fixedpoint(0x40000000);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	251
				252	// ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
				253	// the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
				254	// Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
				255	// keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
				256	static std::once_flag warmup;
				257	std::call_once(warmup, [&](){
				258	auto start = std::chrono::steady_clock::now();
				259	do {
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	260	ruy_a.set_data(k.data());
				261	ruy_c.set_data(c.data());
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	262	mul_params.set_bias(b.data());
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	263
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	264	ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	265	} while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
				266	});
				267
				268	size_t buffer_index = 0;
				269	for (auto _ : state) {
				270	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				271	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				272	// - K is not in cache (for any cache level)
				273	// - B is not in cache (for any cache level)
				274	// - C is not in cache (for any cache level)
				275	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	276	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	277	buffer_index = (buffer_index + 1) % num_buffers;
				278	state.ResumeTiming();
				279
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	280	ruy_a.set_data(k.data() + buffer_index * nc * kc);
				281	ruy_c.set_data(c.data() + buffer_index * mc * nc);
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	282	mul_params.set_bias(b.data() + buffer_index * nc);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	283
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	284	ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	285	}
				286
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	287	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	288	state.counters["OPS"] = benchmark::Counter(
				289	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				290	}
				291
				292	static void ruy_st(benchmark::State& state, const char* net)
				293	{
				294	RuyBenchmark(state, 1);
				295	}
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	296	#endif // BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	297
				298
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON micro-kernel benchmarks. The trailing arguments to GEMMBenchmark are mr, nr, kr, sr.
  // `net` is not used by these wrappers.
  static void qu8_gemm_4x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_ukernel_4x8__neon, 4, 8, 1, 1);
  }

  static void qu8_gemm_8x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_ukernel_8x8__neon, 8, 8, 1, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_4x8__neon)
  BENCHMARK_GEMM(qu8_gemm_8x8__neon)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
				311
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 micro-kernel benchmarks. The trailing arguments to GEMMBenchmark are mr, nr, kr, sr.
  // `net` is not used by these wrappers.
  static void qu8_gemm_4x4c2__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2, 4, 4, 2, 1);
  }

  static void qu8_gemm_2x4c8__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_ukernel_2x4c8__sse2, 2, 4, 8, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
				324
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	325	#ifdef BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	326	BENCHMARK_GEMM(ruy_st)
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	327	#endif // BENCHMARK_RUY
				328	#ifdef BENCHMARK_GEMMLOWP
				329	BENCHMARK_GEMM(gemmlowp_st)
				330	#endif // BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	331
				332	#ifndef XNNPACK_BENCHMARK_NO_MAIN
				333	BENCHMARK_MAIN();
				334	#endif