Blame - bench/q8-gemm.cc - platform/external/XNNPACK

blob: 130ebe8ee85fb63f02a85eade158d2d2f373c760 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#include <algorithm>
				10	#include <cfloat>
				11	#include <chrono>
				12	#include <cmath>
				13	#include <functional>
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	14	#include <limits>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	15	#include <mutex>
				16	#include <random>
				17	#include <vector>
				18
				19	#include <cpuinfo.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	20
				21	#include <benchmark/benchmark.h>
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	22	#ifdef BENCHMARK_GEMMLOWP
				23	#include "gemmlowp/public/gemmlowp.h"
				24	#endif // BENCHMARK_GEMMLOWP
				25	#ifdef BENCHMARK_RUY
Benoit Jacob	b038fdc	2020-03-25 12:14:20 -0700	[diff] [blame]	26	#include "ruy/ruy.h"
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	27	#endif // BENCHMARK_RUY
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	28	#include "bench/gemm.h"
				29	#include "bench/utils.h"
				30	#include <xnnpack/AlignedAllocator.h>
Marat Dukhan	1dadbf7	2019-10-01 10:46:20 -0700	[diff] [blame]	31	#include <xnnpack/common.h>
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	32	#include <xnnpack/gemm.h>
				33	#include <xnnpack/pack.h>
Marat Dukhan	eeaa7bd	2019-10-25 17:31:25 -0700	[diff] [blame]	34	#include <xnnpack/params-init.h>
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	35	#include <xnnpack/params.h>
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	36
				37
				38	static void GEMMBenchmark(benchmark::State& state,
				39	xnn_q8_gemm_ukernel_function q8gemm,
				40	size_t mr, size_t nr, size_t kr)
				41	{
				42	if (!cpuinfo_initialize()) {
				43	state.SkipWithError("cpuinfo initialization failed");
				44	return;
				45	}
				46
				47	const size_t mc = state.range(0);
				48	const size_t nc = state.range(1);
				49	const size_t kc = state.range(2);
				50
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	51	const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
				52	const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	53
				54	std::random_device random_device;
				55	auto rng = std::mt19937(random_device());
				56	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	57	auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	58
				59	std::vector<uint8_t> a(mc * kc);
				60	std::generate(a.begin(), a.end(), std::ref(u8rng));
				61	std::vector<uint8_t> k(nc * kc);
				62	std::generate(k.begin(), k.end(), std::ref(u8rng));
				63	std::vector<int32_t> b(nc);
				64	std::generate(b.begin(), b.end(), std::ref(s32rng));
				65
				66	const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
				67	const size_t c_elements = mc * nc;
				68	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	69	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	70	sizeof(uint8_t) * (w_elements + c_elements));
				71
				72	std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> w(w_elements * num_buffers);
				73	std::fill(w.begin(), w.end(), 0);
				74	xnn_pack_q8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, 127, 127, k.data(), b.data(), w.data());
				75	std::vector<uint8_t> c(c_elements * num_buffers);
				76	std::fill(c.begin(), c.end(), 0xA5);
				77
				78	union xnn_q8_gemm_params quantizationParams =
Marat Dukhan	eeaa7bd	2019-10-25 17:31:25 -0700	[diff] [blame]	79	xnn_init_q8_gemm_params(127, 127, 0.75f, 127, 1, 254);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	80
				81	size_t buffer_index = 0;
				82	for (auto _ : state) {
				83	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				84	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				85	// - W is not in cache (for any cache level)
				86	// - C is not in cache (for any cache level)
				87	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	88	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	89	buffer_index = (buffer_index + 1) % num_buffers;
				90	state.ResumeTiming();
				91
				92	for (uint32_t m = 0; m < mc; m += mr) {
				93	const uint32_t mb = min(mc - m, mr);
				94	for (uint32_t n = 0; n < nc; n += nr) {
				95	const uint32_t nb = min(nc - n, nr);
				96	q8gemm(
Marat Dukhan	b186463	2019-11-25 16:34:17 -0800	[diff] [blame]	97	mb, nb, kc * sizeof(uint8_t),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	98	a.data() + m * kc, kc * sizeof(uint8_t),
				99	w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
				100	c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
				101	&quantizationParams);
				102	}
				103	}
				104	}
				105
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	106	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	107	state.counters["OPS"] = benchmark::Counter(
				108	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				109	}
				110
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	111	#ifdef BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	112	struct GemmlowpOutputPipeline {
				113	typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
				114	typedef std::tuple<
				115	gemmlowp::OutputStageBiasAddition<ColVectorMap>,
				116	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
				117	gemmlowp::OutputStageClamp,
				118	gemmlowp::OutputStageSaturatingCastToUint8>
				119	Pipeline;
				120
				121	static Pipeline Make(
				122	const int32_t* bias_data,
				123	int output_rows,
				124	int32_t output_offset,
				125	int32_t output_multiplier,
				126	int output_shift,
				127	int32_t output_activation_min,
				128	int32_t output_activation_max)
				129	{
				130	ColVectorMap bias_vector(bias_data, output_rows);
				131	gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
				132	bias_addition_stage.bias_vector = bias_vector;
				133	gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
				134	quantize_down_stage.result_offset_after_shift = output_offset;
				135	quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
				136	quantize_down_stage.result_shift = output_shift;
				137	gemmlowp::OutputStageClamp clamp_stage;
				138	clamp_stage.min = output_activation_min;
				139	clamp_stage.max = output_activation_max;
				140	gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
				141	return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
				142	}
				143	};
				144
				145	static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
				146	{
				147	if (!cpuinfo_initialize()) {
				148	state.SkipWithError("cpuinfo initialization failed");
				149	return;
				150	}
				151
				152	const size_t mc = state.range(0);
				153	const size_t nc = state.range(1);
				154	const size_t kc = state.range(2);
				155
				156	std::random_device random_device;
				157	auto rng = std::mt19937(random_device());
				158	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	159	auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	160
				161	std::vector<uint8_t> a(mc * kc);
				162	std::generate(a.begin(), a.end(), std::ref(u8rng));
				163
				164	const size_t kElements = nc * kc;
				165	const size_t bElements = nc;
				166	const size_t c_elements = mc * nc;
				167	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	168	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	169	kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
				170
				171	std::vector<uint8_t> k(kElements * num_buffers);
				172	std::generate(k.begin(), k.end(), std::ref(u8rng));
				173	std::vector<int32_t> b(bElements * num_buffers);
				174	std::generate(b.begin(), b.end(), std::ref(s32rng));
				175	std::vector<uint8_t> c(c_elements * num_buffers);
				176	std::fill(c.begin(), c.end(), 0xA5);
				177
				178	gemmlowp::MultiThreadGemmContext threadingContext;
				179	threadingContext.set_max_num_threads(threads);
				180
				181	size_t buffer_index = 0;
				182	for (auto _ : state) {
				183	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	184	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	185	buffer_index = (buffer_index + 1) % num_buffers;
				186	state.ResumeTiming();
				187
				188	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
				189	gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
				190	gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
				191	const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
				192	gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
				193	&threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
				194	}
				195
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	196	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	197	state.counters["OPS"] = benchmark::Counter(
				198	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				199	}
				200
				201	static void gemmlowp_st(benchmark::State& state, const char* net)
				202	{
				203	GemmlowpBenchmark(state, 1);
				204	}
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	205	#endif // BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	206
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	207
				208	#ifdef BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	209	static void RuyBenchmark(benchmark::State& state, size_t threads)
				210	{
				211	const size_t mc = state.range(0);
				212	const size_t nc = state.range(1);
				213	const size_t kc = state.range(2);
				214
				215	std::random_device random_device;
				216	auto rng = std::mt19937(random_device());
				217	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan	5ce30d9	2020-04-14 03:31:26 -0700	[diff] [blame]	218	auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	219
				220	const size_t num_buffers = 1 +
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	221	benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	222	nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
				223
				224	std::vector<uint8_t> a(mc * kc);
				225	std::generate(a.begin(), a.end(), std::ref(u8rng));
				226	std::vector<uint8_t> k(num_buffers * nc * kc);
				227	std::generate(k.begin(), k.end(), std::ref(u8rng));
				228	std::vector<int32_t> b(num_buffers * nc);
				229	std::generate(b.begin(), b.end(), std::ref(s32rng));
				230	std::vector<uint8_t> c(num_buffers * nc * mc);
				231	std::fill(c.begin(), c.end(), std::nanf(""));
				232
				233	// Note: context must be static to avoid the cost of re-creating it for each benchmark.
				234	static ruy::Context context;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	235	context.set_max_num_threads(threads);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	236
				237	ruy::Matrix<uint8_t> ruy_a;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	238	ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
				239	ruy_a.set_zero_point(127);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	240	ruy::Matrix<uint8_t> ruy_b;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	241	ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
				242	ruy_b.set_data(a.data());
				243	ruy_b.set_zero_point(127);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	244	ruy::Matrix<uint8_t> ruy_c;
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	245	ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
				246	ruy_c.set_zero_point(127);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	247
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	248	ruy::MulParams<int32_t, uint8_t> mul_params;
				249	mul_params.set_multiplier_fixedpoint(0x40000000);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	250
				251	// ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
				252	// the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
				253	// Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
				254	// keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
				255	static std::once_flag warmup;
				256	std::call_once(warmup, [&](){
				257	auto start = std::chrono::steady_clock::now();
				258	do {
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	259	ruy_a.set_data(k.data());
				260	ruy_c.set_data(c.data());
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	261	mul_params.set_bias(b.data());
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	262
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	263	ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	264	} while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
				265	});
				266
				267	size_t buffer_index = 0;
				268	for (auto _ : state) {
				269	// Use circular buffers (exceeding cache size) and prefetch to control cache state:
				270	// - A is always in L1 cache (if fits, otherwise L2, L3, etc)
				271	// - K is not in cache (for any cache level)
				272	// - B is not in cache (for any cache level)
				273	// - C is not in cache (for any cache level)
				274	state.PauseTiming();
Marat Dukhan	4232323	2019-10-23 02:09:02 -0700	[diff] [blame]	275	benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	276	buffer_index = (buffer_index + 1) % num_buffers;
				277	state.ResumeTiming();
				278
Benoit Jacob	349701a	2020-04-15 19:35:24 -0700	[diff] [blame]	279	ruy_a.set_data(k.data() + buffer_index * nc * kc);
				280	ruy_c.set_data(c.data() + buffer_index * mc * nc);
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	281	mul_params.set_bias(b.data() + buffer_index * nc);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	282
Benoit Jacob	b026e22	2020-04-16 12:30:03 -0700	[diff] [blame]	283	ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	284	}
				285
Frank Barchard	bb4c18b	2019-09-30 11:05:52 -0700	[diff] [blame]	286	state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	287	state.counters["OPS"] = benchmark::Counter(
				288	uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
				289	}
				290
				291	static void ruy_st(benchmark::State& state, const char* net)
				292	{
				293	RuyBenchmark(state, 1);
				294	}
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	295	#endif // BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	296
				297
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON micro-kernel benchmarks. The trailing arguments are the mr, nr and kr values passed to
  // GEMMBenchmark. `net` is not used by the bodies.

  // 4x8 NEON micro-kernel (mr = 4, nr = 8, kr = 1).
  static void q8gemm_4x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_minmax_ukernel_4x8__neon, 4, 8, 1);
  }

  // 8x8 NEON micro-kernel (mr = 8, nr = 8, kr = 1).
  static void q8gemm_8x8__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_minmax_ukernel_8x8__neon, 8, 8, 1);
  }

  BENCHMARK_GEMM(q8gemm_4x8__neon)
  BENCHMARK_GEMM(q8gemm_8x8__neon)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
				310
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 micro-kernel benchmarks. The trailing arguments are the mr, nr and kr values passed to
  // GEMMBenchmark. `net` is not used by the bodies.

  // 4x4c2 SSE2 micro-kernel (mr = 4, nr = 4, kr = 2).
  static void q8gemm_4x4c2__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_minmax_ukernel_4x4c2__sse2, 4, 4, 2);
  }

  // 2x4c8 SSE2 micro-kernel (mr = 2, nr = 4, kr = 8).
  static void q8gemm_2x4c8__sse2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_q8_gemm_minmax_ukernel_2x4c8__sse2, 2, 4, 8);
  }

  BENCHMARK_GEMM(q8gemm_4x4c2__sse2)
  BENCHMARK_GEMM(q8gemm_2x4c8__sse2)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
				323
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	324	#ifdef BENCHMARK_RUY
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	325	BENCHMARK_GEMM(ruy_st)
Marat Dukhan	33f0c7a	2019-10-01 13:33:08 -0700	[diff] [blame]	326	#endif // BENCHMARK_RUY
				327	#ifdef BENCHMARK_GEMMLOWP
				328	BENCHMARK_GEMM(gemmlowp_st)
				329	#endif // BENCHMARK_GEMMLOWP
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	330
				331	#ifndef XNNPACK_BENCHMARK_NO_MAIN
				332	BENCHMARK_MAIN();
				333	#endif