// Example code illustrating the theory exposed in doc/quantization.md
| 2 | |
| 3 | /* Command line to build and run on x86: |
| 4 | |
| 5 | c++ doc/quantization_example.cc -I . --std=c++11 -msse4.1 -lpthread \ |
| 6 | -o /tmp/quantization_example && \ |
| 7 | /tmp/quantization_example |
| 8 | |
| 9 | */ |
| 10 | |
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
#include "../public/gemmlowp.h"
#include "../public/output_stages.h"
| 20 | |
| 21 | // We will handle both float and quantized matrices, which we will |
| 22 | // represent as gemmlowp::MatrixMap. |
| 23 | // We will need to be able to print them. |
| 24 | |
| 25 | // Output a matrix to a std::ostream |
| 26 | template <typename tScalar, gemmlowp::MapOrder tOrder> |
| 27 | std::ostream& operator<<(std::ostream& s, |
| 28 | const gemmlowp::MatrixMap<tScalar, tOrder>& m) { |
| 29 | for (int i = 0; i < m.rows(); i++) { |
| 30 | for (int j = 0; j < m.cols(); j++) { |
| 31 | if (j) { |
| 32 | s << '\t'; |
| 33 | } |
| 34 | s << static_cast<float>(m(i, j)); |
| 35 | } |
| 36 | s << '\n'; |
| 37 | } |
| 38 | return s; |
| 39 | } |
| 40 | |
| 41 | // Find the min and max value in a float matrix. |
| 42 | template <gemmlowp::MapOrder tOrder> |
| 43 | void FindMinMax(const gemmlowp::MatrixMap<float, tOrder>& m, float* min, |
| 44 | float* max) { |
| 45 | *min = *max = m(0, 0); |
| 46 | for (int i = 0; i < m.rows(); i++) { |
| 47 | for (int j = 0; j < m.cols(); j++) { |
| 48 | const float val = m(i, j); |
| 49 | *min = std::min(*min, val); |
| 50 | *max = std::max(*max, val); |
| 51 | } |
| 52 | } |
| 53 | } |
| 54 | |
| 55 | // A structure to hold quantization parameters 'scale' and 'zero_point' |
| 56 | // as discussed in doc/quantization.md. As explained there, the meaning |
| 57 | // of these values is as the constants in the quantization equation |
| 58 | // |
| 59 | // real_value = scale * (quantized_value - zero_point) |
| 60 | // |
| 61 | // In other words, 'zero_point' is the quantized value that corresponds |
| 62 | // to the real value 0, and 'scale' is the difference of real values |
| 63 | // corresponding to consecutive quantized values. |
| 64 | struct QuantizationParams { |
| 65 | float scale; |
| 66 | std::uint8_t zero_point; |
| 67 | }; |
| 68 | |
| 69 | // Given the min and max values of a float array, return |
| 70 | // reasonable quantization parameters to use for this array. |
| 71 | QuantizationParams ChooseQuantizationParams(float min, float max) { |
| 72 | // We extend the [min, max] interval to ensure that it contains 0. |
| 73 | // Otherwise, we would not meet the requirement that 0 be an exactly |
| 74 | // representable value. |
| 75 | min = std::min(min, 0.f); |
| 76 | max = std::max(max, 0.f); |
| 77 | |
| 78 | // the min and max quantized values, as floating-point values |
| 79 | const float qmin = 0; |
| 80 | const float qmax = 255; |
| 81 | |
| 82 | // First determine the scale. |
| 83 | const double scale = (max - min) / (qmax - qmin); |
| 84 | |
| 85 | // Zero-point computation. |
| 86 | // First the initial floating-point computation. The zero-point can be |
| 87 | // determined from solving an affine equation for any known pair |
| 88 | // (real value, corresponding quantized value). |
| 89 | // We know two such pairs: (rmin, qmin) and (rmax, qmax). |
| 90 | // Let's use the first one here. |
| 91 | const double initial_zero_point = qmin - min / scale; |
| 92 | |
| 93 | // Now we need to nudge the zero point to be an integer |
| 94 | // (our zero points are integer, and this is motivated by the requirement |
| 95 | // to be able to represent the real value "0" exactly as a quantized value, |
| 96 | // which is required in multiple places, for example in Im2col with SAME |
| 97 | // padding). |
| 98 | std::uint8_t nudged_zero_point = 0; |
| 99 | if (initial_zero_point < qmin) { |
| 100 | nudged_zero_point = qmin; |
| 101 | } else if (initial_zero_point > qmax) { |
| 102 | nudged_zero_point = qmax; |
| 103 | } else { |
| 104 | nudged_zero_point = |
| 105 | static_cast<std::uint8_t>(std::round(initial_zero_point)); |
| 106 | } |
| 107 | |
| 108 | QuantizationParams result; |
| 109 | result.scale = scale; |
| 110 | result.zero_point = nudged_zero_point; |
| 111 | return result; |
| 112 | } |
| 113 | |
| 114 | template <gemmlowp::MapOrder tLhsOrder, gemmlowp::MapOrder tRhsOrder, |
| 115 | gemmlowp::MapOrder tResultOrder> |
| 116 | void FloatMatrixMultiplication( |
| 117 | const gemmlowp::MatrixMap<const float, tLhsOrder>& lhs, |
| 118 | const gemmlowp::MatrixMap<const float, tRhsOrder>& rhs, |
| 119 | gemmlowp::MatrixMap<float, tResultOrder>* result) { |
| 120 | assert(lhs.cols() == rhs.rows()); |
| 121 | assert(lhs.rows() == result->rows()); |
| 122 | assert(rhs.cols() == result->cols()); |
| 123 | for (int i = 0; i < lhs.rows(); i++) { |
| 124 | for (int k = 0; k < rhs.cols(); k++) { |
| 125 | (*result)(i, k) = 0; |
| 126 | for (int j = 0; j < lhs.cols(); j++) { |
| 127 | (*result)(i, k) += lhs(i, j) * rhs(j, k); |
| 128 | } |
| 129 | } |
| 130 | } |
| 131 | } |
| 132 | |
| 133 | void Quantize(const QuantizationParams& qparams, const std::vector<float>& src, |
| 134 | std::vector<std::uint8_t>* dst) { |
| 135 | assert(src.size() == dst->size()); |
| 136 | for (std::size_t i = 0; i < src.size(); i++) { |
| 137 | const float real_val = src[i]; |
| 138 | const float transformed_val = qparams.zero_point + real_val / qparams.scale; |
| 139 | const float clamped_val = std::max(0.f, std::min(255.f, transformed_val)); |
| 140 | (*dst)[i] = static_cast<std::uint8_t>(std::round(clamped_val)); |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | void Dequantize(const QuantizationParams& qparams, |
| 145 | const std::vector<std::uint8_t>& src, std::vector<float>* dst) { |
| 146 | assert(src.size() == dst->size()); |
| 147 | for (std::size_t i = 0; i < src.size(); i++) { |
| 148 | const std::uint8_t quantized_val = src[i]; |
| 149 | (*dst)[i] = qparams.scale * (quantized_val - qparams.zero_point); |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | template <typename tScalar, gemmlowp::MapOrder tOrder> |
| 154 | class MatrixWithStorage { |
| 155 | public: |
| 156 | MatrixWithStorage(int rows, int cols) |
| 157 | : storage(rows * cols), matrix_map(storage.data(), rows, cols) {} |
| 158 | void MakeRandom() { |
| 159 | static std::mt19937 random_engine; |
| 160 | std::uniform_real_distribution<float> distribution(-1, 1); |
| 161 | for (auto& x : storage) { |
| 162 | x = static_cast<tScalar>(distribution(random_engine)); |
| 163 | } |
| 164 | } |
| 165 | gemmlowp::MatrixMap<const tScalar, tOrder> ConstMap() const { |
| 166 | return gemmlowp::MatrixMap<const tScalar, tOrder>( |
| 167 | storage.data(), matrix_map.rows(), matrix_map.cols()); |
| 168 | } |
| 169 | gemmlowp::MatrixMap<tScalar, tOrder> Map() { |
| 170 | return gemmlowp::MatrixMap<tScalar, tOrder>( |
| 171 | storage.data(), matrix_map.rows(), matrix_map.cols()); |
| 172 | } |
| 173 | const std::vector<tScalar>& Storage() const { return storage; } |
| 174 | std::vector<tScalar>& Storage() { return storage; } |
| 175 | |
| 176 | private: |
| 177 | std::vector<tScalar> storage; |
| 178 | gemmlowp::MatrixMap<tScalar, tOrder> matrix_map; |
| 179 | }; |
| 180 | |
| 181 | template <typename tScalar, gemmlowp::MapOrder tOrder> |
| 182 | std::ostream& operator<<(std::ostream& s, |
| 183 | const MatrixWithStorage<tScalar, tOrder>& m) { |
| 184 | return s << m.ConstMap(); |
| 185 | } |
| 186 | |
// Given a real_multiplier in the interval (0, 1),
// produces a pair (quantized_multiplier, right_shift) where
// quantized_multiplier is an int32 representing a fixed-point value
// in the interval [-1, 1) (in practice we only produce positive values)
// and right_shift is an amount to shift right by, so that the
// floating-point multiplication of some int32 input value by real_multiplier,
//
//   return static_cast<int32>(int32_value * real_multiplier);
//
// is best approximated by the integer-arithmetic-only code
//
//   return RoundingRightShift(
//       FixedPointMultiplication(int32_value, quantized_multiplier),
//       right_shift);
//
// This is how to obtain the fixed-point multiplier and right shift
// parameters to pass to OutputStageQuantizeDownInt32ByFixedPoint.
//
// Note: all this code only needs to run offline to generate the quantized
// neural network workload, not at runtime on the device on which quantized
// neural networks need to run. So it's not performance-critical at all.
void QuantizeMultiplierSmallerThanOne(float real_multiplier,
                                      std::int32_t* quantized_multiplier,
                                      int* right_shift) {
  assert(real_multiplier > 0.f);
  assert(real_multiplier < 1.f);
  // Bring the multiplier into [1/2, 1) by repeated doubling (each
  // doubling of a binary float is exact), counting the doublings so
  // they can be compensated by an equal right shift.
  int shift = 0;
  for (; real_multiplier < 0.5f; ++shift) {
    real_multiplier *= 2.0f;
  }
  // Express the normalized multiplier as a Q0.31 fixed-point integer.
  std::int64_t fixed_point =
      static_cast<std::int64_t>(std::round(real_multiplier * (1ll << 31)));
  assert(fixed_point <= (1ll << 31));
  // A multiplier that rounded up to exactly 1.0 is not representable in
  // Q0.31: halve it and shift one bit less instead.
  if (fixed_point == (1ll << 31)) {
    fixed_point /= 2;
    --shift;
  }
  assert(shift >= 0);
  assert(fixed_point <= std::numeric_limits<std::int32_t>::max());
  *quantized_multiplier = static_cast<std::int32_t>(fixed_point);
  *right_shift = shift;
}
| 242 | |
// Demo driver: multiplies two random float matrices in plain float
// arithmetic (the reference result), then reproduces that product with
// gemmlowp's quantized uint8 pipeline, and finally prints the
// element-wise difference between the two results.
int main() {
  std::cout.precision(3);

  // Example shapes: LHS is rows x depth, RHS is depth x cols, so the
  // product is rows x cols.
  const int rows = 2;
  const int depth = 4;
  const int cols = 3;
  const auto kOrder = gemmlowp::MapOrder::ColMajor;

  std::cout << "First, let us make some float matrices LHS and RHS, "
            << "and compute their product.\n"
            << std::endl;
  MatrixWithStorage<float, kOrder> float_lhs(rows, depth);
  float_lhs.MakeRandom();
  MatrixWithStorage<float, kOrder> float_rhs(depth, cols);
  float_rhs.MakeRandom();
  // Compute the reference product with the naive float implementation.
  MatrixWithStorage<float, kOrder> reference_float_result(rows, cols);
  auto reference_float_result_map = reference_float_result.Map();
  FloatMatrixMultiplication(float_lhs.ConstMap(), float_rhs.ConstMap(),
                            &reference_float_result_map);
  std::cout << "Here is the float LHS matrix:\n" << float_lhs << std::endl;
  std::cout << "Here is the float RHS matrix:\n" << float_rhs << std::endl;
  std::cout << "Here is the float product (LHS * RHS) matrix obtained by "
            << "ordinary float matrix multiplication, i.e. as far as we are "
            << "concerned, the REFERENCE RESULT:\n"
            << reference_float_result << std::endl;

  std::cout
      << "Now we embark on reproducing this result using "
      << "quantized arithmetic. The code below splits into two parts: "
      << "quantization code that only needs to run offline (e.g. to "
      << "generate a quantized neural network workload), and actual "
      << "runtime quantized code, which is typically performance-critical "
      << "and where we typically do not want to use any floating-point "
      << "arithmetic. We want to clearly distinguish between the two.\n"
      << std::endl;

  std::cout << "The below is OFFLINE QUANTIZATION CODE. We still use some "
            << "floating-point arithmetic in the process of generating the "
            << "quantized workload to be run on-device.\n"
            << std::endl;

  std::cout
      << "Now, let us choose quantization parameters for these matrices. "
      << "You might ask, what good is quantization if we need to pick "
      << "quantization parameters for the result before we can run the "
      << "quantized computation to obtain the result? The idea is that we "
      << "target applications such as neural networks, where unknown results "
      << "are only allowed to vary within preexisting bounds. In practice, the "
      << "bounds for the results are typically learned during the neural "
         "network "
      << "training process. The min and max of the result do not have to be "
      << "exact. If they are too broad, we just get lower quantization "
         "accuracy. "
      << "If they are too narrow, we just get clamping at the bounds.\n"
      << std::endl;

  // Derive per-matrix quantization parameters from the observed
  // min/max of each matrix (including the reference result, which
  // stands in for bounds that a real application would have learned
  // ahead of time, as explained in the message above).
  float lhs_min, lhs_max, rhs_min, rhs_max, result_min, result_max;
  FindMinMax(float_lhs.Map(), &lhs_min, &lhs_max);
  FindMinMax(float_rhs.Map(), &rhs_min, &rhs_max);
  FindMinMax(reference_float_result.Map(), &result_min, &result_max);
  const auto lhs_qparams = ChooseQuantizationParams(lhs_min, lhs_max);
  const auto rhs_qparams = ChooseQuantizationParams(rhs_min, rhs_max);
  const auto result_qparams = ChooseQuantizationParams(result_min, result_max);

  std::cout << "For LHS, we have min = " << lhs_min << ", max = " << lhs_max
            << ", scale = " << lhs_qparams.scale
            << ", zero_point = " << static_cast<float>(lhs_qparams.zero_point)
            << std::endl;
  std::cout << "For RHS, we have min = " << rhs_min << ", max = " << rhs_max
            << ", scale = " << rhs_qparams.scale
            << ", zero_point = " << static_cast<float>(rhs_qparams.zero_point)
            << std::endl;
  std::cout << "For the result, we have min = " << result_min
            << ", max = " << result_max << ", scale = " << result_qparams.scale
            << ", zero_point = "
            << static_cast<float>(result_qparams.zero_point) << std::endl;

  std::cout << std::endl;

  MatrixWithStorage<std::uint8_t, kOrder> uint8_lhs(rows, depth);
  MatrixWithStorage<std::uint8_t, kOrder> uint8_rhs(depth, cols);
  MatrixWithStorage<std::uint8_t, kOrder> actual_uint8_result(rows, cols);

  // Quantize the float inputs into uint8 matrices, each with its own
  // quantization parameters.
  Quantize(lhs_qparams, float_lhs.Storage(), &uint8_lhs.Storage());
  Quantize(rhs_qparams, float_rhs.Storage(), &uint8_rhs.Storage());

  std::cout << "Quantized uint8 LHS matrix:\n" << uint8_lhs << std::endl;
  std::cout << "Quantized uint8 RHS matrix:\n" << uint8_rhs << std::endl;

  // Offsets derived from the zero points (see doc/quantization.md for
  // the derivation): the input offsets are the negated zero points,
  // while the result offset keeps its sign — it is applied after the
  // shift in the output stage configured below.
  const int lhs_offset = -lhs_qparams.zero_point;
  const int rhs_offset = -rhs_qparams.zero_point;
  const int result_offset = result_qparams.zero_point;

  // Combined rescale factor taking the int32 accumulators to the
  // result's quantized scale, encoded as a fixed-point multiplier plus
  // a right shift for the integer-only runtime path.
  const float real_multiplier =
      lhs_qparams.scale * rhs_qparams.scale / result_qparams.scale;
  std::int32_t quantized_multiplier;
  int right_shift;
  QuantizeMultiplierSmallerThanOne(real_multiplier, &quantized_multiplier,
                                   &right_shift);

  std::cout << "End of OFFLINE QUANTIZATION CODE.\n" << std::endl;

  std::cout << "The below is ON-DEVICE RUNTIME QUANTIZED CODE. "
            << "This is the part that is performance-critical and may only "
            << "use quantized arithmetic.\n"
            << std::endl;

  // Output pipeline: first scale the int32 accumulators down into the
  // result's quantized domain (fixed-point multiply, shift, then add
  // the result offset), then saturating-cast to uint8.
  gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint
      quantize_down_stage;
  quantize_down_stage.result_offset_after_shift = result_offset;
  quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
  quantize_down_stage.result_shift = right_shift;
  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
  const auto& output_pipeline =
      std::make_tuple(quantize_down_stage, saturating_cast_stage);

  // Run the quantized matrix multiplication through gemmlowp.
  auto actual_uint8_result_map = actual_uint8_result.Map();
  gemmlowp::GemmContext gemm_context;
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                   gemmlowp::DefaultL8R8BitDepthParams>(
      &gemm_context, uint8_lhs.ConstMap(), uint8_rhs.ConstMap(),
      &actual_uint8_result_map, lhs_offset, rhs_offset, output_pipeline);

  std::cout << "Quantized uint8 result matrix obtained by quantized "
            << "multiplication:\n"
            << actual_uint8_result << std::endl;

  std::cout << "End of ON-DEVICE RUNTIME QUANTIZED CODE.\n" << std::endl;

  // Dequantize the uint8 result back to float so it can be compared
  // against the float reference.
  MatrixWithStorage<float, kOrder> actual_float_result(rows, cols);
  Dequantize(result_qparams, actual_uint8_result.Storage(),
             &actual_float_result.Storage());
  std::cout
      << "Here is the actual float product (LHS * RHS) matrix obtained by "
      << "dequantizing the above uint8 result, i.e. "
      << "as far as we are concerned, the ACTUAL RESULT:\n"
      << actual_float_result << std::endl;

  // Element-wise difference: actual minus reference.
  MatrixWithStorage<float, kOrder> diff_float_result(rows, cols);
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      diff_float_result.Map()(i, j) =
          actual_float_result.Map()(i, j) - reference_float_result.Map()(i, j);
    }
  }

  std::cout << "Difference between ACTUAL and REFERENCE float results:\n"
            << diff_float_result << std::endl;
}