// Example code illustrating the theory exposed in doc/quantization.md
| 2 | |
| 3 | /* Command line to build and run on x86: |
| 4 | |
| 5 | c++ doc/quantization_example.cc -I . --std=c++11 -msse4.1 -lpthread \ |
| 6 | -o /tmp/quantization_example && \ |
| 7 | /tmp/quantization_example |
| 8 | |
| 9 | */ |
| 10 | |
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
#include "../public/gemmlowp.h"
#include "../public/output_stages.h"
| 20 | |
| 21 | // We will handle both float and quantized matrices, which we will |
| 22 | // represent as gemmlowp::MatrixMap. |
| 23 | // We will need to be able to print them. |
| 24 | |
| 25 | // Output a matrix to a std::ostream |
| 26 | template <typename tScalar, gemmlowp::MapOrder tOrder> |
| 27 | std::ostream& operator<<(std::ostream& s, |
| 28 | const gemmlowp::MatrixMap<tScalar, tOrder>& m) { |
| 29 | for (int i = 0; i < m.rows(); i++) { |
| 30 | for (int j = 0; j < m.cols(); j++) { |
| 31 | if (j) { |
| 32 | s << '\t'; |
| 33 | } |
| 34 | s << static_cast<float>(m(i, j)); |
| 35 | } |
| 36 | s << '\n'; |
| 37 | } |
| 38 | return s; |
| 39 | } |
| 40 | |
| 41 | // Find the min and max value in a float matrix. |
| 42 | template <gemmlowp::MapOrder tOrder> |
| 43 | void FindMinMax(const gemmlowp::MatrixMap<float, tOrder>& m, float* min, |
| 44 | float* max) { |
| 45 | *min = *max = m(0, 0); |
| 46 | for (int i = 0; i < m.rows(); i++) { |
| 47 | for (int j = 0; j < m.cols(); j++) { |
| 48 | const float val = m(i, j); |
| 49 | *min = std::min(*min, val); |
| 50 | *max = std::max(*max, val); |
| 51 | } |
| 52 | } |
| 53 | } |
| 54 | |
| 55 | // A structure to hold quantization parameters 'scale' and 'zero_point' |
| 56 | // as discussed in doc/quantization.md. As explained there, the meaning |
| 57 | // of these values is as the constants in the quantization equation |
| 58 | // |
| 59 | // real_value = scale * (quantized_value - zero_point) |
| 60 | // |
| 61 | // In other words, 'zero_point' is the quantized value that corresponds |
| 62 | // to the real value 0, and 'scale' is the difference of real values |
| 63 | // corresponding to consecutive quantized values. |
| 64 | struct QuantizationParams { |
| 65 | float scale; |
| 66 | std::uint8_t zero_point; |
| 67 | }; |
| 68 | |
| 69 | // Given the min and max values of a float array, return |
| 70 | // reasonable quantization parameters to use for this array. |
| 71 | QuantizationParams ChooseQuantizationParams(float min, float max) { |
| 72 | // We extend the [min, max] interval to ensure that it contains 0. |
| 73 | // Otherwise, we would not meet the requirement that 0 be an exactly |
| 74 | // representable value. |
| 75 | min = std::min(min, 0.f); |
| 76 | max = std::max(max, 0.f); |
| 77 | |
| 78 | // the min and max quantized values, as floating-point values |
| 79 | const float qmin = 0; |
| 80 | const float qmax = 255; |
| 81 | |
| 82 | // First determine the scale. |
| 83 | const double scale = (max - min) / (qmax - qmin); |
| 84 | |
| 85 | // Zero-point computation. |
| 86 | // First the initial floating-point computation. The zero-point can be |
| 87 | // determined from solving an affine equation for any known pair |
| 88 | // (real value, corresponding quantized value). |
| 89 | // We know two such pairs: (rmin, qmin) and (rmax, qmax). |
| 90 | // Let's use the first one here. |
| 91 | const double initial_zero_point = qmin - min / scale; |
| 92 | |
| 93 | // Now we need to nudge the zero point to be an integer |
| 94 | // (our zero points are integer, and this is motivated by the requirement |
| 95 | // to be able to represent the real value "0" exactly as a quantized value, |
| 96 | // which is required in multiple places, for example in Im2col with SAME |
| 97 | // padding). |
| 98 | std::uint8_t nudged_zero_point = 0; |
| 99 | if (initial_zero_point < qmin) { |
| 100 | nudged_zero_point = qmin; |
| 101 | } else if (initial_zero_point > qmax) { |
| 102 | nudged_zero_point = qmax; |
| 103 | } else { |
| 104 | nudged_zero_point = |
| 105 | static_cast<std::uint8_t>(std::round(initial_zero_point)); |
| 106 | } |
| 107 | |
| 108 | QuantizationParams result; |
| 109 | result.scale = scale; |
| 110 | result.zero_point = nudged_zero_point; |
| 111 | return result; |
| 112 | } |
| 113 | |
| 114 | template <gemmlowp::MapOrder tLhsOrder, gemmlowp::MapOrder tRhsOrder, |
| 115 | gemmlowp::MapOrder tResultOrder> |
| 116 | void FloatMatrixMultiplication( |
| 117 | const gemmlowp::MatrixMap<const float, tLhsOrder>& lhs, |
| 118 | const gemmlowp::MatrixMap<const float, tRhsOrder>& rhs, |
| 119 | gemmlowp::MatrixMap<float, tResultOrder>* result) { |
| 120 | assert(lhs.cols() == rhs.rows()); |
| 121 | assert(lhs.rows() == result->rows()); |
| 122 | assert(rhs.cols() == result->cols()); |
| 123 | for (int i = 0; i < lhs.rows(); i++) { |
| 124 | for (int k = 0; k < rhs.cols(); k++) { |
| 125 | (*result)(i, k) = 0; |
| 126 | for (int j = 0; j < lhs.cols(); j++) { |
| 127 | (*result)(i, k) += lhs(i, j) * rhs(j, k); |
| 128 | } |
| 129 | } |
| 130 | } |
| 131 | } |
| 132 | |
| 133 | void Quantize(const QuantizationParams& qparams, const std::vector<float>& src, |
| 134 | std::vector<std::uint8_t>* dst) { |
| 135 | assert(src.size() == dst->size()); |
| 136 | for (std::size_t i = 0; i < src.size(); i++) { |
| 137 | const float real_val = src[i]; |
| 138 | const float transformed_val = qparams.zero_point + real_val / qparams.scale; |
| 139 | const float clamped_val = std::max(0.f, std::min(255.f, transformed_val)); |
| 140 | (*dst)[i] = static_cast<std::uint8_t>(std::round(clamped_val)); |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | void Dequantize(const QuantizationParams& qparams, |
| 145 | const std::vector<std::uint8_t>& src, std::vector<float>* dst) { |
| 146 | assert(src.size() == dst->size()); |
| 147 | for (std::size_t i = 0; i < src.size(); i++) { |
| 148 | const std::uint8_t quantized_val = src[i]; |
| 149 | (*dst)[i] = qparams.scale * (quantized_val - qparams.zero_point); |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | template <typename tScalar, gemmlowp::MapOrder tOrder> |
| 154 | class MatrixWithStorage { |
| 155 | public: |
| 156 | MatrixWithStorage(int rows, int cols) |
| 157 | : storage(rows * cols), matrix_map(storage.data(), rows, cols) {} |
| 158 | void MakeRandom() { |
| 159 | static std::mt19937 random_engine; |
| 160 | std::uniform_real_distribution<float> distribution(-1, 1); |
| 161 | for (auto& x : storage) { |
| 162 | x = static_cast<tScalar>(distribution(random_engine)); |
| 163 | } |
| 164 | } |
| 165 | gemmlowp::MatrixMap<const tScalar, tOrder> ConstMap() const { |
| 166 | return gemmlowp::MatrixMap<const tScalar, tOrder>( |
| 167 | storage.data(), matrix_map.rows(), matrix_map.cols()); |
| 168 | } |
| 169 | gemmlowp::MatrixMap<tScalar, tOrder> Map() { |
| 170 | return gemmlowp::MatrixMap<tScalar, tOrder>( |
| 171 | storage.data(), matrix_map.rows(), matrix_map.cols()); |
| 172 | } |
| 173 | const std::vector<tScalar>& Storage() const { return storage; } |
| 174 | std::vector<tScalar>& Storage() { return storage; } |
| 175 | |
| 176 | private: |
| 177 | std::vector<tScalar> storage; |
| 178 | gemmlowp::MatrixMap<tScalar, tOrder> matrix_map; |
| 179 | }; |
| 180 | |
| 181 | template <typename tScalar, gemmlowp::MapOrder tOrder> |
| 182 | std::ostream& operator<<(std::ostream& s, |
| 183 | const MatrixWithStorage<tScalar, tOrder>& m) { |
| 184 | return s << m.ConstMap(); |
| 185 | } |
| 186 | |
// Given a real_multiplier in the interval (0, 1),
// produces a pair (quantized_multiplier, right_shift) where
// quantized_multiplier is an int32 representing a fixed-point value
// in the interval [-1, 1) (in practice we only produce positive values)
// and right_shift is an amount to shift right by, so that the
// floating-point multiplication of some int32 input value by real_multiplier,
//
//   return static_cast<int32>(int32_value * real_multiplier);
//
// is best approximated by the integer-arithmetic-only code
//
//   return RoundingRightShift(
//       FixedPointMultiplication(int32_value, quantized_multiplier),
//       right_shift);
//
// This is how to obtain the fixed-point multiplier and right shift
// parameters to pass to OutputStageQuantizeDownInt32ByFixedPoint.
//
// Note: all this code only needs to run offline to generate the quantized
// neural network workload, not at runtime on the device on which quantized
// neural networks need to run. So it's not performance-critical at all.
void QuantizeMultiplierSmallerThanOne(float real_multiplier,
                                      std::int32_t* quantized_multiplier,
                                      int* right_shift) {
  assert(real_multiplier > 0.f);
  assert(real_multiplier < 1.f);
  // Bring the multiplier into [1/2, 1) by repeated doubling (each
  // doubling of a binary float is exact), counting the doublings so
  // they can be compensated by an equal right shift.
  int shift = 0;
  for (; real_multiplier < 0.5f; ++shift) {
    real_multiplier *= 2.0f;
  }
  // Express the normalized multiplier as a Q0.31 fixed-point integer.
  std::int64_t fixed_point =
      static_cast<std::int64_t>(std::round(real_multiplier * (1ll << 31)));
  assert(fixed_point <= (1ll << 31));
  // A multiplier that rounded up to exactly 1.0 is not representable in
  // Q0.31: halve it and shift one bit less instead.
  if (fixed_point == (1ll << 31)) {
    fixed_point /= 2;
    --shift;
  }
  assert(shift >= 0);
  assert(fixed_point <= std::numeric_limits<std::int32_t>::max());
  *quantized_multiplier = static_cast<std::int32_t>(fixed_point);
  *right_shift = shift;
}
| 242 | |
// Demo driver: multiplies two random float matrices in plain float
// arithmetic (the reference result), then reproduces that product with
// gemmlowp's quantized uint8 pipeline, and finally prints the
// element-wise difference between the two results.
int main() {
  std::cout.precision(3);

  // Example shapes: LHS is rows x depth, RHS is depth x cols, so the
  // product is rows x cols.
  const int rows = 2;
  const int depth = 4;
  const int cols = 3;
  const auto kOrder = gemmlowp::MapOrder::ColMajor;

  std::cout << "First, let us make some float matrices LHS and RHS, "
            << "and compute their product.\n"
            << std::endl;
  MatrixWithStorage<float, kOrder> float_lhs(rows, depth);
  float_lhs.MakeRandom();
  MatrixWithStorage<float, kOrder> float_rhs(depth, cols);
  float_rhs.MakeRandom();
  // Compute the reference product with the naive float implementation.
  MatrixWithStorage<float, kOrder> reference_float_result(rows, cols);
  auto reference_float_result_map = reference_float_result.Map();
  FloatMatrixMultiplication(float_lhs.ConstMap(), float_rhs.ConstMap(),
                            &reference_float_result_map);
  std::cout << "Here is the float LHS matrix:\n" << float_lhs << std::endl;
  std::cout << "Here is the float RHS matrix:\n" << float_rhs << std::endl;
  std::cout << "Here is the float product (LHS * RHS) matrix obtained by "
            << "ordinary float matrix multiplication, i.e. as far as we are "
            << "concerned, the REFERENCE RESULT:\n"
            << reference_float_result << std::endl;

  std::cout
      << "Now we embark on reproducing this result using "
      << "quantized arithmetic. The code below splits into two parts: "
      << "quantization code that only needs to run offline (e.g. to "
      << "generate a quantized neural network workload), and actual "
      << "runtime quantized code, which is typically performance-critical "
      << "and where we typically do not want to use any floating-point "
      << "arithmetic. We want to clearly distinguish between the two.\n"
      << std::endl;

  std::cout << "The below is OFFLINE QUANTIZATION CODE. We still use some "
            << "floating-point arithmetic in the process of generating the "
            << "quantized workload to be run on-device.\n"
            << std::endl;

  std::cout
      << "Now, let us choose quantization parameters for these matrices. "
      << "You might ask, what good is quantization if we need to pick "
      << "quantization parameters for the result before we can run the "
      << "quantized computation to obtain the result? The idea is that we "
      << "target applications such as neural networks, where unknown results "
      << "are only allowed to vary within preexisting bounds. In practice, the "
      << "bounds for the results are typically learned during the neural "
         "network "
      << "training process. The min and max of the result do not have to be "
      << "exact. If they are too broad, we just get lower quantization "
         "accuracy. "
      << "If they are too narrow, we just get clamping at the bounds.\n"
      << std::endl;

  // Derive per-matrix quantization parameters from the observed
  // min/max of each matrix (including the reference result, which
  // stands in for bounds that a real application would have learned
  // ahead of time, as explained in the message above).
  float lhs_min, lhs_max, rhs_min, rhs_max, result_min, result_max;
  FindMinMax(float_lhs.Map(), &lhs_min, &lhs_max);
  FindMinMax(float_rhs.Map(), &rhs_min, &rhs_max);
  FindMinMax(reference_float_result.Map(), &result_min, &result_max);
  const auto lhs_qparams = ChooseQuantizationParams(lhs_min, lhs_max);
  const auto rhs_qparams = ChooseQuantizationParams(rhs_min, rhs_max);
  const auto result_qparams = ChooseQuantizationParams(result_min, result_max);

  std::cout << "For LHS, we have min = " << lhs_min << ", max = " << lhs_max
            << ", scale = " << lhs_qparams.scale
            << ", zero_point = " << static_cast<float>(lhs_qparams.zero_point)
            << std::endl;
  std::cout << "For RHS, we have min = " << rhs_min << ", max = " << rhs_max
            << ", scale = " << rhs_qparams.scale
            << ", zero_point = " << static_cast<float>(rhs_qparams.zero_point)
            << std::endl;
  std::cout << "For the result, we have min = " << result_min
            << ", max = " << result_max << ", scale = " << result_qparams.scale
            << ", zero_point = "
            << static_cast<float>(result_qparams.zero_point) << std::endl;

  std::cout << std::endl;

  MatrixWithStorage<std::uint8_t, kOrder> uint8_lhs(rows, depth);
  MatrixWithStorage<std::uint8_t, kOrder> uint8_rhs(depth, cols);
  MatrixWithStorage<std::uint8_t, kOrder> actual_uint8_result(rows, cols);

  // Quantize the float inputs into uint8 matrices, each with its own
  // quantization parameters.
  Quantize(lhs_qparams, float_lhs.Storage(), &uint8_lhs.Storage());
  Quantize(rhs_qparams, float_rhs.Storage(), &uint8_rhs.Storage());

  std::cout << "Quantized uint8 LHS matrix:\n" << uint8_lhs << std::endl;
  std::cout << "Quantized uint8 RHS matrix:\n" << uint8_rhs << std::endl;

  // Offsets derived from the zero points (see doc/quantization.md for
  // the derivation): the input offsets are the negated zero points,
  // while the result offset keeps its sign — it is applied after the
  // shift in the output stage configured below.
  const int lhs_offset = -lhs_qparams.zero_point;
  const int rhs_offset = -rhs_qparams.zero_point;
  const int result_offset = result_qparams.zero_point;

  // Combined rescale factor taking the int32 accumulators to the
  // result's quantized scale, encoded as a fixed-point multiplier plus
  // a right shift for the integer-only runtime path.
  const float real_multiplier =
      lhs_qparams.scale * rhs_qparams.scale / result_qparams.scale;
  std::int32_t quantized_multiplier;
  int right_shift;
  QuantizeMultiplierSmallerThanOne(real_multiplier, &quantized_multiplier,
                                   &right_shift);

  std::cout << "End of OFFLINE QUANTIZATION CODE.\n" << std::endl;

  std::cout << "The below is ON-DEVICE RUNTIME QUANTIZED CODE. "
            << "This is the part that is performance-critical and may only "
            << "use quantized arithmetic.\n"
            << std::endl;

  // Output pipeline: first scale the int32 accumulators down into the
  // result's quantized domain (fixed-point multiply, shift, then add
  // the result offset), then saturating-cast to uint8.
  gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint
      quantize_down_stage;
  quantize_down_stage.result_offset_after_shift = result_offset;
  quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
  quantize_down_stage.result_shift = right_shift;
  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
  const auto& output_pipeline =
      std::make_tuple(quantize_down_stage, saturating_cast_stage);

  // Run the quantized matrix multiplication through gemmlowp.
  auto actual_uint8_result_map = actual_uint8_result.Map();
  gemmlowp::GemmContext gemm_context;
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                   gemmlowp::DefaultL8R8BitDepthParams>(
      &gemm_context, uint8_lhs.ConstMap(), uint8_rhs.ConstMap(),
      &actual_uint8_result_map, lhs_offset, rhs_offset, output_pipeline);

  std::cout << "Quantized uint8 result matrix obtained by quantized "
            << "multiplication:\n"
            << actual_uint8_result << std::endl;

  std::cout << "End of ON-DEVICE RUNTIME QUANTIZED CODE.\n" << std::endl;

  // Dequantize the uint8 result back to float so it can be compared
  // against the float reference.
  MatrixWithStorage<float, kOrder> actual_float_result(rows, cols);
  Dequantize(result_qparams, actual_uint8_result.Storage(),
             &actual_float_result.Storage());
  std::cout
      << "Here is the actual float product (LHS * RHS) matrix obtained by "
      << "dequantizing the above uint8 result, i.e. "
      << "as far as we are concerned, the ACTUAL RESULT:\n"
      << actual_float_result << std::endl;

  // Element-wise difference: actual minus reference.
  MatrixWithStorage<float, kOrder> diff_float_result(rows, cols);
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      diff_float_result.Map()(i, j) =
          actual_float_result.Map()(i, j) - reference_float_result.Map()(i, j);
    }
  }

  std::cout << "Difference between ACTUAL and REFERENCE float results:\n"
            << diff_float_result << std::endl;
}