// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

// Returns true if x is an IEEE fp16 zero, i.e. +0.0h (0x0000) or -0.0h (0x8000).
// Doubling the value shifts the sign bit out of the low 16 bits, leaving zero
// in both cases.
static inline bool is_fp16_zero(uint16_t x) {
  const uint32_t ext_x = x;
  const uint32_t two_x = ext_x + ext_x;
  return (uint16_t) two_x == 0;
}

class SpMMMicrokernelTester {
 public:
  // Selects how the clamping parameters are initialized before the
  // micro-kernel is invoked: Native uses xnn_init_f32_output_params (the
  // layout architecture-specific kernels expect), Scalar uses
  // xnn_init_scalar_f32_output_params (the layout for portable scalar
  // kernels).
  enum class Variant {
    Native,
    Scalar,
  };

  inline SpMMMicrokernelTester& mr(size_t mr) {
    this->mr_ = mr;
    return *this;
  }

  inline size_t mr() const {
    return this->mr_;
  }

  inline SpMMMicrokernelTester& nr(size_t nr) {
    this->nr_ = nr;
    return *this;
  }

  inline size_t nr() const {
    return this->nr_;
  }

  inline SpMMMicrokernelTester& m(size_t m) {
    this->m_ = m;
    return *this;
  }

  inline size_t m() const {
    return this->m_;
  }

  inline SpMMMicrokernelTester& n(size_t n) {
    this->n_ = n;
    return *this;
  }

  inline size_t n() const {
    return this->n_;
  }

  inline SpMMMicrokernelTester& k(size_t k) {
    this->k_ = k;
    return *this;
  }

  inline size_t k() const {
    return this->k_;
  }

  inline SpMMMicrokernelTester& sparsity(float sparsity) {
    this->sparsity_ = sparsity;
    return *this;
  }

  inline float sparsity() const {
    return this->sparsity_;
  }

  inline SpMMMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline SpMMMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline SpMMMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

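  // A minimal usage sketch (the kernel symbol below is hypothetical; real
  // tests pass an actual xnn_f32_spmm_ukernel_function under test):
  //
  //   SpMMMicrokernelTester()
  //     .mr(8)
  //     .nr(1)
  //     .m(8)
  //     .n(3)
  //     .k(17)
  //     .sparsity(0.5f)
  //     .Test(xnn_f32_spmm_ukernel_8x1__example);
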
  void Test(xnn_f32_spmm_ukernel_function spmm, Variant variant = Variant::Native) const {
    ASSERT_GE(m(), 1);
    ASSERT_GE(n(), 1);
    ASSERT_GE(k(), 1);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);
    auto prng = std::bind(std::uniform_real_distribution<float>(), rng);

    std::vector<float, AlignedAllocator<float, 64>> a(k() * m());
    // Think of b as a (n/nr + n % nr) x k matrix; expansion to the full
    // n x k matrix happens later.
    const size_t ncols = n() / nr() + n() % nr();
    std::vector<float> b(ncols * k());
    std::vector<float> bias(n());
    // Number of non-zero weights per N (output channel).
    std::vector<uint32_t> nmap(n());
    // Mapping from the index of a non-zero weight to the increment of K (input channel) following that index.
    std::vector<int32_t> dmap(n() * k());
    std::vector<float> w(n() * k() + n());
    std::vector<float> c(n() * m());
    std::vector<float> c_ref(n() * m());

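    // Illustrative packed layout (assumed toy values, nr() == 1): for one
    // output channel whose only non-zeros sit at k = 1 and k = 4,
    //   w    = { bias[0], b[1], b[4] }
    //   nmap = { 2 }
    //   dmap = { (4 - 1) * m() * sizeof(float), (1 - 4) * m() * sizeof(float) }
    // i.e. dmap holds the byte increment applied to the input pointer after
    // each non-zero is consumed, and the final entry rewinds the pointer to
    // the first non-zero's position.
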
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(a.begin(), a.end(), std::ref(f32rng));
      std::generate(b.begin(), b.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(c.begin(), c.end(), nanf(""));
      std::fill(c_ref.begin(), c_ref.end(), 0.0f);
      std::fill(nmap.begin(), nmap.end(), 0);
      std::fill(dmap.begin(), dmap.end(), 0);
      std::fill(w.begin(), w.end(), 0.0f);

      for (float& b_value : b) {
        if (prng() <= sparsity()) {
          b_value = 0.0f;
        }
      }

      uint32_t nnz = 0;
      uint32_t wcnt = 0;
      size_t last_kk = 0;
      bool first_nnz = true;
      size_t first_kk = 0;
      for (size_t nn = 0; nn < n() / nr(); nn++) {
        for (size_t i = 0; i < nr(); ++i)
          w[wcnt++] = bias[nr() * nn + i];
        for (size_t kk = 0; kk < k(); kk++) {
          if (b[nn * k() + kk] != 0.0f) {
            // Every non-zero actually corresponds to nr adjacent non-zeros.
            for (size_t i = 0; i < nr(); ++i)
              w[wcnt++] = b[nn * k() + kk] + static_cast<float>(i);
            // Skip the very first non-zero weight as we record only the difference.
            if (first_nnz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(float));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nnz = false;
            nmap[nn] += 1;
          }
        }
      }

      // Now that the blocked part of the matrix is constructed, handle the
      // leftover columns, which are always processed with nr = 1.
      for (size_t nn = n() / nr(); nn < ncols; nn++) {
        w[wcnt++] = bias[(n() / nr()) * nr() + (nn - n() / nr())];
        for (size_t kk = 0; kk < k(); kk++) {
          if (b[nn * k() + kk] != 0.0f) {
            w[wcnt++] = b[nn * k() + kk];
            // Skip the very first non-zero weight as we record only the difference.
            if (first_nnz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(float));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nnz = false;
            nmap[nn] += 1;
          }
        }
      }
      // At the end, the input pointer must be rewound to its initial position.
      const int32_t increment = int32_t(first_kk - last_kk) * int32_t(m() * sizeof(float));
      dmap[nnz++] = increment;

      // Generate the expanded b which is used in the reference calculation.
      // Everywhere there is a non-zero in the original we copy it and add
      // nr - 1 adjacent non-zeros with incremented weight values.
      std::vector<float> b_full(n() * k());
      if (nr() == 1) {
        b_full = b;
      }
      else {
        for (size_t nn = 0; nn < n() / nr(); nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (b[nn * k() + kk] != 0.0f) {
              for (size_t i = 0; i < nr(); ++i)
                b_full[nr() * nn * k() + i * k() + kk] = b[nn * k() + kk] + static_cast<float>(i);
            }
          }
        }
        for (size_t nn = n() / nr(); nn < ncols; nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (b[nn * k() + kk] != 0.0f) {
              b_full[nr() * (n() / nr()) * k() + (nn - n() / nr()) * k() + kk] = b[nn * k() + kk];
            }
          }
        }
      }

      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          c_ref[oc * m() + pxb] = bias[oc];
          for (size_t ic = 0; ic < k(); ic++) {
            c_ref[oc * m() + pxb] += a[ic * m() + pxb] * b_full[oc * k() + ic];
          }
        }
      }

      // The micro-kernel may read one element beyond w and dmap for software pipelining.
      w.resize(wcnt + 1);
      dmap.resize(nnz + 1);

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
      const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
      const float c_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float c_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
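      // For example, with qmin() == 0 and qmax() == 255 (the defaults) the
      // range spans the full accumulated range and the clamp below is a
      // no-op; raising qmin() or lowering qmax() tightens [c_min, c_max]
      // proportionally, mirroring uint8 quantization limits.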

      // Clamp reference results.
      for (float& c_value : c_ref) {
        c_value = std::min(std::max(c_value, c_min), c_max);
      }

      // Prepare output parameters.
      xnn_f32_output_params output_params = { };
      switch (variant) {
        case Variant::Native:
          output_params = xnn_init_f32_output_params(c_min, c_max);
          break;
        case Variant::Scalar:
          output_params = xnn_init_scalar_f32_output_params(c_min, c_max);
          break;
      }

      spmm(m(), n(),
        a.data() + first_kk * m(), w.data(), dmap.data(), nmap.data(), c.data(),
        &output_params);

      // Validate micro-kernel outputs.
      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          ASSERT_NEAR(
              c[oc * m() + pxb],
              c_ref[oc * m() + pxb],
              std::abs(c_ref[oc * m() + pxb]) * 1.0e-6f)
            << "at " << oc << ", " << pxb
            << ": Mr x Nr = " << mr() << " x " << nr()
            << ", M x N x K = " << m() << " x " << n() << " x " << k();
        }
      }
    }
  }

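  // Same procedure as the f32 test above, but weights, inputs, and outputs
  // are IEEE fp16 values stored as uint16_t. Values are converted through
  // fp16_ieee_to_fp32_value/fp16_ieee_from_fp32_value, the reference result
  // is accumulated in fp32, and the relative tolerance is relaxed to 1.0e-2.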
  void Test(xnn_f16_spmm_ukernel_function spmm) const {
    ASSERT_GE(m(), 1);
    ASSERT_GE(n(), 1);
    ASSERT_GE(k(), 1);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
    auto prng = std::bind(std::uniform_real_distribution<float>(), rng);

    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> a(k() * m());
    // Think of b as a (n/nr + n % nr) x k matrix; expansion to the full
    // n x k matrix happens later.
    const size_t ncols = n() / nr() + n() % nr();
    std::vector<uint16_t> b(ncols * k());
    std::vector<uint16_t> bias(n());
    // Number of non-zero weights per N (output channel).
    std::vector<uint32_t> nmap(n());
    // Mapping from the index of a non-zero weight to the increment of K (input channel) following that index.
    std::vector<int32_t> dmap(n() * k());
    std::vector<uint16_t> w(n() * k() + n());
    std::vector<uint16_t> c(n() * m());
    std::vector<float> c_ref(n() * m());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(a.begin(), a.end(), std::ref(f16rng));
      std::generate(b.begin(), b.end(), std::ref(f16rng));
      std::generate(bias.begin(), bias.end(), std::ref(f16rng));
      std::fill(c.begin(), c.end(), UINT16_C(0xC000) /* fp16 -2.0, a canary overwritten by the kernel */);
      std::fill(c_ref.begin(), c_ref.end(), 0.0f);
      std::fill(nmap.begin(), nmap.end(), 0);
      std::fill(dmap.begin(), dmap.end(), 0);
      std::fill(w.begin(), w.end(), 0);

      for (uint16_t& b_value : b) {
        if (prng() <= sparsity()) {
          b_value = 0;
        }
      }

      uint32_t nnz = 0;
      uint32_t wcnt = 0;
      size_t last_kk = 0;
      bool first_nnz = true;
      size_t first_kk = 0;
      for (size_t nn = 0; nn < n() / nr(); nn++) {
        for (size_t i = 0; i < nr(); ++i)
          w[wcnt++] = bias[nr() * nn + i];
        for (size_t kk = 0; kk < k(); kk++) {
          if (!is_fp16_zero(b[nn * k() + kk])) {
            // Every non-zero actually corresponds to nr adjacent non-zeros.
            for (size_t i = 0; i < nr(); ++i)
              w[wcnt++] = fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[nn * k() + kk]) + static_cast<float>(i));
            // Skip the very first non-zero weight as we record only the difference.
            if (first_nnz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(uint16_t));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nnz = false;
            nmap[nn] += 1;
          }
        }
      }

      // Now that the blocked part of the matrix is constructed, handle the
      // leftover columns, which are always processed with nr = 1.
      for (size_t nn = n() / nr(); nn < ncols; nn++) {
        w[wcnt++] = bias[(n() / nr()) * nr() + (nn - n() / nr())];
        for (size_t kk = 0; kk < k(); kk++) {
          if (!is_fp16_zero(b[nn * k() + kk])) {
            w[wcnt++] = b[nn * k() + kk];
            // Skip the very first non-zero weight as we record only the difference.
            if (first_nnz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(uint16_t));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nnz = false;
            nmap[nn] += 1;
          }
        }
      }
      // At the end, the input pointer must be rewound to its initial position.
      const int32_t increment = int32_t(first_kk - last_kk) * int32_t(m() * sizeof(uint16_t));
      dmap[nnz++] = increment;

      // Generate the expanded b which is used in the reference calculation.
      // Everywhere there is a non-zero in the original we copy it and add
      // nr - 1 adjacent non-zeros with incremented weight values.
      std::vector<uint16_t> b_full(n() * k());
      if (nr() == 1) {
        b_full = b;
      }
      else {
        for (size_t nn = 0; nn < n() / nr(); nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (!is_fp16_zero(b[nn * k() + kk])) {
              for (size_t i = 0; i < nr(); ++i)
                b_full[nr() * nn * k() + i * k() + kk] = fp16_ieee_from_fp32_value(
                  fp16_ieee_to_fp32_value(b[nn * k() + kk]) + static_cast<float>(i));
            }
          }
        }
        for (size_t nn = n() / nr(); nn < ncols; nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (!is_fp16_zero(b[nn * k() + kk])) {
              b_full[nr() * (n() / nr()) * k() + (nn - n() / nr()) * k() + kk] = b[nn * k() + kk];
            }
          }
        }
      }

      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          c_ref[oc * m() + pxb] = fp16_ieee_to_fp32_value(bias[oc]);
          for (size_t ic = 0; ic < k(); ic++) {
            c_ref[oc * m() + pxb] += fp16_ieee_to_fp32_value(a[ic * m() + pxb]) * fp16_ieee_to_fp32_value(b_full[oc * k() + ic]);
          }
        }
      }

      // The micro-kernel may read one element beyond w and dmap for software pipelining.
      w.resize(wcnt + 1);
      dmap.resize(nnz + 1);

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
      const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
      const float c_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float c_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& c_value : c_ref) {
        c_value = std::min(std::max(c_value, c_min), c_max);
      }

      // Prepare output parameters.
      xnn_f16_output_params output_params;
      output_params.scale = UINT16_C(0x3C00) /* 1.0 */;
      output_params.max = fp16_ieee_from_fp32_value(c_max);
      output_params.min = fp16_ieee_from_fp32_value(c_min);

      spmm(m(), n(),
        a.data() + first_kk * m(), w.data(), dmap.data(), nmap.data(), c.data(),
        &output_params);

      // Validate micro-kernel outputs.
      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(c[oc * m() + pxb]),
              c_ref[oc * m() + pxb],
              std::abs(c_ref[oc * m() + pxb]) * 1.0e-2f)
            << "at " << oc << ", " << pxb
            << ": Mr x Nr = " << mr() << " x " << nr()
            << ", M x N x K = " << m() << " x " << n() << " x " << k();
        }
      }
    }
  }

 private:
  size_t mr_{1};
  size_t nr_{1};
  size_t m_{1};
  size_t n_{1};
  size_t k_{1};
  float sparsity_{0.5f};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t iterations_{1};
};