| /* Copyright 2019 Google LLC. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_TEST_H_ |
| #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_TEST_H_ |
| |
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <initializer_list>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
| |
| #include "testing/base/public/gunit.h" |
| #include "platform.h" |
| #include "pmu.h" |
| #include "ruy.h" |
| #include "ruy_advanced.h" |
| #include "time.h" |
| |
| #ifdef RUY_TEST_EXTERNAL_PATHS |
| #define EIGEN_USE_THREADS |
| #define EIGEN_USE_CUSTOM_THREAD_POOL |
| #include "third_party/eigen3/Eigen/Core" |
| #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
| #include "third_party/gemmlowp/public/gemmlowp.h" |
| #include "third_party/lapack/blas.h" |
| #endif |
| |
| #ifdef GEMMLOWP_PROFILING |
| #include "third_party/gemmlowp/profiling/profiler.h" |
| #endif |
| |
| namespace ruy { |
| |
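// Fraction of the output values, at each end of their sorted distribution,
// that MakeSpecClampFields (below) deliberately places outside the clamp
// bounds, so that clamping is actually exercised.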
| const float kClampRatio = 0.1f; |
| |
| enum class ExternalPath { kNone, kGemmlowp, kEigen, kEigenTensor, kOpenBlas }; |
| |
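// Global registry of the names of the code paths covered so far, reported at
// program exit by LogCoveredPathsOnDestruction.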
| inline std::vector<std::string>* CoveredPaths() { |
| static std::vector<std::string> covered_paths; |
| return &covered_paths; |
| } |
| |
| const char* PathName(Path path) { |
| #define RUY_PATHNAME_CASE(NAME) \ |
| case Path::NAME: \ |
| return #NAME; |
| switch (path) { |
| RUY_PATHNAME_CASE(kReference) |
| RUY_PATHNAME_CASE(kStandardCpp) |
| #if RUY_PLATFORM(NEON) |
| RUY_PATHNAME_CASE(kNeon) |
| RUY_PATHNAME_CASE(kNeonDotprod) |
| #elif RUY_PLATFORM(AVX512) |
| RUY_PATHNAME_CASE(kAvx512) |
| #endif |
| default: |
| RUY_CHECK(false); |
| return nullptr; |
| } |
| #undef RUY_PATHNAME_CASE |
| } |
| |
| const char* TuningName(Tuning tuning) { |
| #define RUY_SUBPATHNAME_CASE(NAME) \ |
| case Tuning::NAME: \ |
| return #NAME; |
| switch (tuning) { |
| RUY_SUBPATHNAME_CASE(kInOrder) |
| RUY_SUBPATHNAME_CASE(kOutOfOrder) |
| default: |
| RUY_CHECK(false); |
| return nullptr; |
| } |
| #undef RUY_SUBPATHNAME_CASE |
| } |
| |
| const char* PathName(ExternalPath path) { |
| #define RUY_PATHNAME_CASE(NAME) \ |
| case ExternalPath::NAME: \ |
| return #NAME; |
| switch (path) { |
| RUY_PATHNAME_CASE(kGemmlowp) |
| RUY_PATHNAME_CASE(kEigen) |
| RUY_PATHNAME_CASE(kEigenTensor) |
| RUY_PATHNAME_CASE(kOpenBlas) |
| default: |
| RUY_CHECK(false); |
| return nullptr; |
| } |
| #undef RUY_PATHNAME_CASE |
| } |
| |
| std::ostream& operator<<(std::ostream& stream, Path path) { |
| return stream << PathName(path); |
| } |
| |
| std::ostream& operator<<(std::ostream& stream, ExternalPath external_path) { |
| return stream << PathName(external_path); |
| } |
| |
| template <typename ContainerType> |
| std::string Join(const ContainerType& container) { |
| if (container.empty()) { |
| return "<empty>"; |
| } |
| std::ostringstream stream; |
| auto it = container.begin(); |
| stream << *it++; |
| for (; it != container.end(); ++it) { |
| stream << ", "; |
| stream << *it; |
| } |
| return stream.str(); |
| } |
| |
| struct LogCoveredPathsOnDestruction final { |
| ~LogCoveredPathsOnDestruction() { |
| std::cerr << "Covered paths: " << Join(*CoveredPaths()) << std::endl; |
| } |
| static void Singleton() { static LogCoveredPathsOnDestruction singleton; } |
| }; |
| |
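// Ranges from which random test data is drawn. kAvoidMinValue excludes the
// integer type's most negative value (it is used for LHS data); see the
// RandomRangeBounds specializations below for the exact bounds chosen for
// floating-point and integer types.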
| enum class RandomRange { |
| kGeneral, |
| kAvoidMinValue, |
| kReasonableSrcZeroPoint, |
| kReasonableDstZeroPoint, |
| kBias |
| }; |
| |
| template <typename Scalar, |
| bool IsFloatingPoint = std::is_floating_point<Scalar>::value> |
| struct RandomRangeBounds {}; |
| |
| template <typename Scalar> |
| struct RandomRangeBounds<Scalar, true> { |
| static Scalar GetMinBound(RandomRange range) { |
| switch (range) { |
| case RandomRange::kGeneral: |
| return -1; |
| case RandomRange::kAvoidMinValue: |
| return -1; |
| case RandomRange::kReasonableSrcZeroPoint: |
| return 0; |
| case RandomRange::kReasonableDstZeroPoint: |
| return 0; |
| case RandomRange::kBias: |
| return -1; |
| default: |
| RUY_CHECK(false); |
| return 0; |
| } |
| } |
| static Scalar GetMaxBound(RandomRange range) { |
| switch (range) { |
| case RandomRange::kGeneral: |
| return 1; |
| case RandomRange::kAvoidMinValue: |
| return 1; |
| case RandomRange::kReasonableSrcZeroPoint: |
| return 0; |
| case RandomRange::kReasonableDstZeroPoint: |
| return 0; |
| case RandomRange::kBias: |
| return 1; |
| default: |
| RUY_CHECK(false); |
| return 0; |
| } |
| } |
| }; |
| |
| template <typename Scalar> |
| Scalar WeightedSum(Scalar s1, float weight1, Scalar s2, float weight2) { |
| float sum = s1 * weight1 + s2 * weight2; |
| float clamped = std::min<float>( |
| std::numeric_limits<Scalar>::max(), |
| std::max<float>(std::numeric_limits<Scalar>::lowest(), sum)); |
| return static_cast<Scalar>(clamped); |
| } |
| |
| template <typename Scalar> |
| Scalar Parametrized(float param) { |
| return WeightedSum(std::numeric_limits<Scalar>::max(), param, |
| std::numeric_limits<Scalar>::lowest(), 1 - param); |
| } |
| |
| template <typename Scalar> |
| struct RandomRangeBounds<Scalar, false> { |
| static Scalar GetMinBound(RandomRange range) { |
| switch (range) { |
| case RandomRange::kGeneral: |
| return std::numeric_limits<Scalar>::lowest(); |
| case RandomRange::kAvoidMinValue: |
| return 1 + std::numeric_limits<Scalar>::lowest(); |
| case RandomRange::kReasonableSrcZeroPoint: |
| return std::numeric_limits<Scalar>::lowest(); |
| case RandomRange::kReasonableDstZeroPoint: |
| return Parametrized<Scalar>(0.4); |
| case RandomRange::kBias: |
| return std::is_same<Scalar, std::int32_t>::value |
| ? static_cast<Scalar>(-10000) |
| : 0; |
| default: |
| RUY_CHECK(false); |
| return 0; |
| } |
| } |
| static Scalar GetMaxBound(RandomRange range) { |
| switch (range) { |
| case RandomRange::kGeneral: |
| return std::numeric_limits<Scalar>::max(); |
| case RandomRange::kAvoidMinValue: |
| return std::numeric_limits<Scalar>::max(); |
| case RandomRange::kReasonableSrcZeroPoint: |
| return std::numeric_limits<Scalar>::max(); |
| case RandomRange::kReasonableDstZeroPoint: |
| return Parametrized<Scalar>(0.6); |
| case RandomRange::kBias: |
| return std::is_same<Scalar, std::int32_t>::value |
| ? static_cast<Scalar>(10000) |
| : 0; |
| default: |
| RUY_CHECK(false); |
| return 0; |
| } |
| } |
| }; |
| |
| inline std::default_random_engine& global_random_engine() { |
| static std::default_random_engine engine; |
| return engine; |
| } |
| |
| template <typename Scalar> |
| struct UniformRandomDistribution { |
  explicit UniformRandomDistribution(RandomRange range)
| : dist(RandomRangeBounds<Scalar>::GetMinBound(range), |
| RandomRangeBounds<Scalar>::GetMaxBound(range)) {} |
| Scalar Get() { return dist(global_random_engine()); } |
| // std::uniform_int_distribution is specified not to support char types, |
| // only short and wider types. MSVC actually generates an error on |
| // std::uniform_int_distribution<std::int8_t>. |
| using StdDistType = typename std::conditional< |
| std::is_floating_point<Scalar>::value, |
| std::uniform_real_distribution<Scalar>, |
| std::uniform_int_distribution<std::int32_t>>::type; |
| StdDistType dist; |
| }; |
| |
| template <typename Scalar> |
| void MakeRandomScalar(UniformRandomDistribution<Scalar>* uniform_dist, |
| Scalar* dst) { |
| *dst = uniform_dist->Get(); |
| } |
| |
| template <typename Scalar> |
| void MakeRandomVector(UniformRandomDistribution<Scalar>* uniform_dist, int size, |
| std::vector<Scalar>* dst) { |
| dst->resize(size); |
| for (auto& x : *dst) { |
| MakeRandomScalar(uniform_dist, &x); |
| } |
| } |
| |
| template <typename Scalar> |
| void MakeRandomScalar(RandomRange range, Scalar* dst) { |
| UniformRandomDistribution<Scalar> dist(range); |
| *dst = dist.Get(); |
| if (range == RandomRange::kReasonableDstZeroPoint || |
| range == RandomRange::kReasonableSrcZeroPoint) { |
| if (global_random_engine()() & 1) { |
| *dst = SymmetricZeroPoint<Scalar>(); |
| } |
| } |
| } |
| |
| template <typename Scalar> |
| void MakeRandomVector(RandomRange range, int size, std::vector<Scalar>* dst) { |
| UniformRandomDistribution<Scalar> dist(range); |
| dst->resize(size); |
| for (auto& x : *dst) { |
| MakeRandomScalar(&dist, &x); |
| } |
| } |
| |
| enum class LayoutStyle { kPackedLinear, kLinear }; |
| |
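// Fills *layout with the given shape and order. kPackedLinear uses the tight
// stride; kLinear uses that stride plus 1, so that non-packed strides are
// also exercised.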
| void MakeLayout(int rows, int cols, Order order, LayoutStyle layout_style, |
| Layout* layout) { |
| layout->rows = rows; |
| layout->cols = cols; |
| layout->order = order; |
| |
| const int packed_stride = order == Order::kColMajor ? rows : cols; |
| |
| RUY_CHECK(layout_style == LayoutStyle::kPackedLinear || |
| layout_style == LayoutStyle::kLinear); |
| if (layout_style == LayoutStyle::kPackedLinear) { |
| layout->stride = packed_stride; |
| } else { |
| layout->stride = packed_stride + 1; |
| } |
| } |
| |
| template <typename Scalar> |
| struct StorageMatrix { |
| StorageMatrix() = default; |
| StorageMatrix(const StorageMatrix&) = delete; |
| void operator=(const StorageMatrix&) = delete; |
| std::vector<Scalar> data; |
| Matrix<Scalar> matrix; |
| }; |
| |
| template <typename Scalar> |
| void VerifyConsistentFields(const StorageMatrix<Scalar>& storage_matrix) { |
| if (storage_matrix.data.empty()) { |
| RUY_CHECK_EQ(storage_matrix.matrix.data.get(), nullptr); |
| RUY_CHECK_EQ(storage_matrix.matrix.layout.rows, 0); |
| RUY_CHECK_EQ(storage_matrix.matrix.layout.cols, 0); |
| } else { |
| RUY_CHECK_EQ(storage_matrix.matrix.data.get(), storage_matrix.data.data()); |
| RUY_CHECK_EQ(FlatSize(storage_matrix.matrix.layout), |
| storage_matrix.data.size()); |
| } |
| } |
| |
| template <typename Scalar> |
| void MakeRandom(int rows, int cols, Order order, Scalar zero_point, |
| LayoutStyle layout_style, RandomRange range, |
| StorageMatrix<Scalar>* storage_matrix) { |
| MakeLayout(rows, cols, order, layout_style, &storage_matrix->matrix.layout); |
| storage_matrix->matrix.zero_point = zero_point; |
| UniformRandomDistribution<Scalar> data_dist(range); |
| MakeRandomVector(&data_dist, FlatSize(storage_matrix->matrix.layout), |
| &storage_matrix->data); |
| storage_matrix->matrix.data = storage_matrix->data.data(); |
| VerifyConsistentFields(*storage_matrix); |
| } |
| |
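// Holds the output of one evaluation of the test's multiplication: the
// destination matrix and its storage, the ruy Path or external library that
// produced it, and performance measurements (latency and PMU event rates).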
| template <typename Scalar> |
| struct TestResult { |
| void operator=(const TestResult&) = delete; |
| void operator=(const TestResult&&) = delete; |
| StorageMatrix<Scalar> storage_matrix; |
| Path path = Path::kNone; |
| Tuning tuning = Tuning::kAuto; |
| ExternalPath external_path = ExternalPath::kNone; |
| float latency; |
| float l1_refill_rate; |
| float l2_refill_rate; |
| float l3_refill_rate; |
| float l1tlb_refill_rate; |
| float l2tlb_refill_rate; |
| float mispred_rate; |
| float frontend_stall_rate; |
| float backend_stall_rate; |
| |
| // Per-path data for pre-packing. |
| // This is not used by external paths or by Path::kReference. |
| Allocator allocator; |
| PrepackedMatrix prepacked_lhs; |
| PrepackedMatrix prepacked_rhs; |
| bool use_prepacked_lhs = false; |
| bool use_prepacked_rhs = false; |
| }; |
| |
| template <typename Scalar> |
| std::string PathName(const TestResult<Scalar>& result) { |
| std::string pathname; |
| if (result.path != Path::kNone) { |
| pathname.assign(PathName(result.path)); |
| } else if (result.external_path != ExternalPath::kNone) { |
| pathname.assign(PathName(result.external_path)); |
| } else { |
| RUY_CHECK(false); |
| } |
| if (result.tuning != Tuning::kAuto) { |
| pathname.append("/"); |
| pathname.append(TuningName(result.tuning)); |
| } |
| return pathname; |
| } |
| |
| enum class ExpectedOutcome { kSuccess, kDeath }; |
| |
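// Drives one complete test case: generates random zero points, LHS/RHS data
// and Spec fields, enumerates the ruy Paths (and, when enabled, external
// libraries) to run, evaluates them all, and verifies that the results agree.
// The LifeStage enum tracks progress through Run() so that the steps can only
// be performed in order.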
| template <typename tLhsScalar, typename tRhsScalar, typename SpecType> |
| struct TestSet final { |
| using LhsScalar = tLhsScalar; |
| using RhsScalar = tRhsScalar; |
| using AccumScalar = typename SpecType::AccumScalar; |
| using DstScalar = typename SpecType::DstScalar; |
| using Spec = SpecType; |
| using TestResultType = TestResult<DstScalar>; |
| |
| void Run() { |
| MakeZeroPoints(); |
| MakeLhsRhs(); |
| MakeSpec(); |
| MakeOtherParams(); |
| MakeResultPaths(); |
| MakePrepackedMatrices(); |
| Eval(); |
| Verify(); |
| } |
| |
| private: |
| void MakeZeroPoints(); |
| void MakeLhsRhs(); |
| void MakeSpec(); |
| void MakeResultPaths(); |
| void MakePrepackedMatrices(); |
| void MakeOtherParams(); |
| void EvalAndVerify(); |
| void Eval(); |
| void Verify(); |
| |
| void EvalResult(TestResultType* result); |
| void EvalRuy(TestResultType* result); |
| void DoMul(TestResultType* result); |
| void Benchmark(TestResultType* result); |
| void VerifyTestResults() const; |
| void VerifyNonTrivial() const; |
| |
| public: |
| enum class LifeStage { |
| kInitial, |
| kHasZeroPoints, |
| kHasLhsRhs, |
| kHasSpec, |
| kHasOtherParams, |
| kHasResultPaths, |
| kHasPrepackedMatrices, |
| kEvaluated, |
| kFinal |
| }; |
| |
| ~TestSet() { |
| RUY_CHECK(life_stage == LifeStage::kFinal); |
| LogCoveredPathsOnDestruction::Singleton(); |
| } |
| |
| LifeStage life_stage = LifeStage::kInitial; |
| |
| int rows = 0; |
| int cols = 0; |
| int depth = 0; |
| Order lhs_order = Order::kRowMajor; |
| Order rhs_order = Order::kColMajor; |
| Order dst_order = Order::kColMajor; |
| LayoutStyle layout_style = LayoutStyle::kPackedLinear; |
| ExpectedOutcome expected_outcome = ExpectedOutcome::kSuccess; |
| |
| bool use_specified_zero_points = false; |
| LhsScalar lhs_zero_point = 0; |
| RhsScalar rhs_zero_point = 0; |
| DstScalar dst_zero_point = 0; |
| |
| std::vector<AccumScalar> per_channel_multiplier_fixedpoint; |
| std::vector<int> per_channel_multiplier_exponent; |
| |
| StorageMatrix<LhsScalar> lhs; |
| StorageMatrix<RhsScalar> rhs; |
| Spec spec; |
| std::vector<AccumScalar> bias_data; |
| std::vector<std::unique_ptr<TestResultType>> results; |
| |
| std::vector<Path> paths; |
| std::vector<ExternalPath> external_paths; |
| |
| bool benchmark = false; |
| bool perchannel = false; |
| int max_num_threads = 0; |
| bool benchmark_prepack_lhs = false; |
| bool benchmark_prepack_rhs = false; |
| }; |
| |
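// Single ruy::Context shared by all ruy evaluations in this test binary.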
| Context& GlobalContext() { |
| static Context context; |
| return context; |
| } |
| |
| #if defined(__has_feature) |
| #if __has_feature(thread_sanitizer) |
| #define RUY_TSAN |
| #endif |
| #if __has_feature(address_sanitizer) |
| #define RUY_ASAN |
| #endif |
| #endif // defined(__has_feature) |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::DoMul(TestResultType* result) { |
| Context* context = &GlobalContext(); |
| |
| if (!result->use_prepacked_lhs && !result->use_prepacked_rhs) { |
| Mul<kAllPaths>(lhs.matrix, rhs.matrix, spec, context, |
| &result->storage_matrix.matrix); |
| return; |
| } |
| |
| // If we prepacked an input matrix, null out its data pointer to check |
| // that we don't access any data through it. |
| Matrix<LhsScalar> null_data_lhs = lhs.matrix; |
| Matrix<RhsScalar> null_data_rhs = rhs.matrix; |
| if (result->use_prepacked_lhs) { |
| null_data_lhs.data = nullptr; |
| } |
| if (result->use_prepacked_rhs) { |
| null_data_rhs.data = nullptr; |
| } |
| |
| // Do the multiplication with pre-packed matrices. |
| PrepackedMatrix* prepacked_lhs_ptr = |
| result->use_prepacked_lhs ? &result->prepacked_lhs : nullptr; |
| PrepackedMatrix* prepacked_rhs_ptr = |
| result->use_prepacked_rhs ? &result->prepacked_rhs : nullptr; |
| MulWithPrepacked<kAllPaths>(null_data_lhs, null_data_rhs, spec, context, |
| &result->storage_matrix.matrix, prepacked_lhs_ptr, |
| prepacked_rhs_ptr); |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::EvalRuy(TestResultType* result) { |
| GlobalContext().explicit_tuning = result->tuning; |
| if (max_num_threads) { |
| GlobalContext().max_num_threads = max_num_threads; |
| } else if (benchmark) { |
| GlobalContext().max_num_threads = 1; |
| } else { |
| GlobalContext().max_num_threads = 1 + global_random_engine()() % 8; |
| } |
| GlobalContext().SetRuntimeEnabledPaths(result->path); |
| if (expected_outcome == ExpectedOutcome::kSuccess) { |
| DoMul(result); |
| RUY_CHECK(GlobalContext().last_taken_path == result->path); |
| } else if (expected_outcome == ExpectedOutcome::kDeath) { |
| // TODO(benoitjacob) TSan and ASan seem to be breaking ASSERT_DEATH. |
| // Report a bug? |
| #if (!defined NDEBUG) && (!defined RUY_ASAN) && (!defined RUY_TSAN) |
| ASSERT_DEATH(DoMul(result), ""); |
| #endif |
| } else { |
| RUY_CHECK(false); |
| } |
| GlobalContext().explicit_tuning = Tuning::kAuto; |
| GlobalContext().max_num_threads = 1; |
| } |
| |
| #ifdef RUY_TEST_EXTERNAL_PATHS |
| |
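// Adapters wrapping a ruy::Matrix as a gemmlowp::MatrixMap with the matching
// storage order, for the gemmlowp comparison path.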
| template <typename Scalar, gemmlowp::MapOrder tOrder> |
| void WrapGemmlowp(const Matrix<Scalar>& src, |
| gemmlowp::MatrixMap<const Scalar, tOrder>* dst) { |
| RUY_CHECK(src.layout.order == (tOrder == gemmlowp::MapOrder::ColMajor |
| ? Order::kColMajor |
| : Order::kRowMajor)); |
| *dst = gemmlowp::MatrixMap<const Scalar, tOrder>( |
| src.data.get(), src.layout.rows, src.layout.cols, src.layout.stride); |
| } |
| |
| template <typename Scalar, gemmlowp::MapOrder tOrder> |
| void WrapGemmlowpMutable(Matrix<Scalar>* src, |
| gemmlowp::MatrixMap<Scalar, tOrder>* dst) { |
| RUY_CHECK(src->layout.order == (tOrder == gemmlowp::MapOrder::ColMajor |
| ? Order::kColMajor |
| : Order::kRowMajor)); |
| *dst = gemmlowp::MatrixMap<Scalar, tOrder>( |
| src->data.get(), src->layout.rows, src->layout.cols, src->layout.stride); |
| } |
| |
| template <Order tOrder> |
| struct GemmlowpOrder {}; |
| |
| template <> |
| struct GemmlowpOrder<Order::kColMajor> { |
| static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::ColMajor; |
| }; |
| |
| template <> |
| struct GemmlowpOrder<Order::kRowMajor> { |
| static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::RowMajor; |
| }; |
| |
| gemmlowp::GemmContext& GlobalGemmlowpContext() { |
| static gemmlowp::GemmContext context; |
| return context; |
| } |
| |
| template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar, |
| typename RhsScalar, typename DstScalar, typename Spec> |
| void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, |
| const Spec& spec, int max_num_threads, |
| Matrix<DstScalar>* dst) { |
| static constexpr gemmlowp::MapOrder kGemmlowpLhsOrder = |
| GemmlowpOrder<LhsOrder>::kValue; |
| static constexpr gemmlowp::MapOrder kGemmlowpRhsOrder = |
| GemmlowpOrder<RhsOrder>::kValue; |
| static constexpr gemmlowp::MapOrder kGemmlowpDstOrder = |
| GemmlowpOrder<DstOrder>::kValue; |
| gemmlowp::MatrixMap<const LhsScalar, kGemmlowpLhsOrder> gemmlowp_lhs; |
| gemmlowp::MatrixMap<const RhsScalar, kGemmlowpRhsOrder> gemmlowp_rhs; |
| gemmlowp::MatrixMap<DstScalar, kGemmlowpDstOrder> gemmlowp_dst; |
| WrapGemmlowp(lhs, &gemmlowp_lhs); |
| WrapGemmlowp(rhs, &gemmlowp_rhs); |
| WrapGemmlowpMutable(dst, &gemmlowp_dst); |
| |
| gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; |
| quantize_down_stage.result_offset_after_shift = dst->zero_point; |
| quantize_down_stage.result_fixedpoint_multiplier = spec.multiplier_fixedpoint; |
| quantize_down_stage.result_exponent = spec.multiplier_exponent; |
| gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC< |
| gemmlowp::VectorShape::Col> |
| quantize_down_stage_pc; |
| quantize_down_stage_pc.result_offset_after_shift = dst->zero_point; |
| using ColVectorMap = |
| gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>; |
| quantize_down_stage_pc.result_fixedpoint_multiplier = |
| ColVectorMap(spec.multiplier_fixedpoint_perchannel, lhs.layout.rows); |
| quantize_down_stage_pc.result_exponent = |
| ColVectorMap(spec.multiplier_exponent_perchannel, lhs.layout.rows); |
| |
| gemmlowp::OutputStageClamp clamp_stage; |
| clamp_stage.min = spec.clamp_min; |
| clamp_stage.max = spec.clamp_max; |
| using OutputStageSaturatingCast = typename std::conditional< |
| std::is_same<DstScalar, std::uint8_t>::value, |
| gemmlowp::OutputStageSaturatingCastToUint8, |
| gemmlowp::OutputStageSaturatingCastToInt16>::type; |
| OutputStageSaturatingCast saturating_cast_stage; |
| |
| GlobalGemmlowpContext().set_max_num_threads(max_num_threads ? max_num_threads |
| : 1); |
| if (spec.bias) { |
| using ColVectorMap = |
| gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>; |
| gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_add_stage; |
| bias_add_stage.bias_vector = ColVectorMap(spec.bias, dst->layout.rows); |
| #ifndef GEMMLOWP_SSE4 // gemmlowp perchannel stuff does not build on SSE |
| if (spec.multiplier_exponent_perchannel) { |
| const auto& output_pipeline = |
| std::make_tuple(bias_add_stage, quantize_down_stage_pc, clamp_stage, |
| saturating_cast_stage); |
| gemmlowp::GemmWithOutputPipeline< |
| LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, |
| -lhs.zero_point, -rhs.zero_point, output_pipeline); |
| } else // NOLINT[readability/braces] |
| #endif |
| { |
| const auto& output_pipeline = |
| std::make_tuple(bias_add_stage, quantize_down_stage, clamp_stage, |
| saturating_cast_stage); |
| gemmlowp::GemmWithOutputPipeline< |
| LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, |
| -lhs.zero_point, -rhs.zero_point, output_pipeline); |
| } |
| } else { |
| #ifndef GEMMLOWP_SSE4 // gemmlowp perchannel stuff does not build on SSE |
| if (spec.multiplier_exponent_perchannel) { |
| const auto& output_pipeline = std::make_tuple( |
| quantize_down_stage_pc, clamp_stage, saturating_cast_stage); |
| gemmlowp::GemmWithOutputPipeline< |
| LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, |
| -lhs.zero_point, -rhs.zero_point, output_pipeline); |
| } else // NOLINT[readability/braces] |
| #endif |
| { |
| const auto& output_pipeline = std::make_tuple( |
| quantize_down_stage, clamp_stage, saturating_cast_stage); |
| gemmlowp::GemmWithOutputPipeline< |
| LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, |
| -lhs.zero_point, -rhs.zero_point, output_pipeline); |
| } |
| } |
| } |
| |
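// Encodes the three storage orders as a 3-bit index so that a runtime
// (lhs, rhs, dst) order combination can be switch-dispatched to the matching
// template instantiation.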
| inline constexpr int Mash(Order LhsOrder, Order RhsOrder, Order DstOrder) { |
| return (LhsOrder == Order::kRowMajor ? 4 : 0) + |
| (RhsOrder == Order::kRowMajor ? 2 : 0) + |
| (DstOrder == Order::kRowMajor ? 1 : 0); |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename DstScalar, |
| typename Spec> |
| void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, |
| const Spec& spec, int max_num_threads, |
| Matrix<DstScalar>* dst) { |
| int index = Mash(lhs.layout.order, rhs.layout.order, dst->layout.order); |
| switch (index) { |
| #define EVALGEMMLOWP_CASE3(LHS, RHS, DST) \ |
| case Mash(LHS, RHS, DST): \ |
| return EvalGemmlowp<LHS, RHS, DST>(lhs, rhs, spec, max_num_threads, dst); |
| #define EVALGEMMLOWP_CASE2(LHS, RHS) \ |
| EVALGEMMLOWP_CASE3(LHS, RHS, Order::kColMajor) \ |
| EVALGEMMLOWP_CASE3(LHS, RHS, Order::kRowMajor) |
| #define EVALGEMMLOWP_CASE1(LHS) \ |
| EVALGEMMLOWP_CASE2(LHS, Order::kColMajor) \ |
| EVALGEMMLOWP_CASE2(LHS, Order::kRowMajor) |
| |
| EVALGEMMLOWP_CASE1(Order::kColMajor) |
| EVALGEMMLOWP_CASE1(Order::kRowMajor) |
| |
| #undef EVALGEMMLOWP_CASE1 |
| #undef EVALGEMMLOWP_CASE2 |
| #undef EVALGEMMLOWP_CASE3 |
| |
| default: |
| RUY_CHECK(false); |
| } |
| } |
| |
| template <Order tOrder> |
| struct EigenOrder {}; |
| |
| template <> |
| struct EigenOrder<Order::kColMajor> { |
| static constexpr int kValue = Eigen::ColMajor; |
| }; |
| |
| template <> |
| struct EigenOrder<Order::kRowMajor> { |
| static constexpr int kValue = Eigen::RowMajor; |
| }; |
| |
| template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar, |
| typename RhsScalar, typename DstScalar, typename Spec> |
| void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, |
| const Spec& spec, int max_num_threads, Matrix<DstScalar>* dst) { |
| RUY_CHECK_EQ(lhs.zero_point, 0); |
| RUY_CHECK_EQ(rhs.zero_point, 0); |
| RUY_CHECK_EQ(dst->zero_point, 0); |
| RUY_CHECK_EQ(spec.multiplier_fixedpoint, 0); |
| RUY_CHECK_EQ(spec.multiplier_exponent, 0); |
| |
| static constexpr int kEigenLhsOrder = EigenOrder<LhsOrder>::kValue; |
| static constexpr int kEigenRhsOrder = EigenOrder<RhsOrder>::kValue; |
| static constexpr int kEigenDstOrder = EigenOrder<DstOrder>::kValue; |
| |
| using EigenLhsType = typename Eigen::Matrix<LhsScalar, Eigen::Dynamic, |
| Eigen::Dynamic, kEigenLhsOrder>:: |
| template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; |
| using EigenRhsType = typename Eigen::Matrix<RhsScalar, Eigen::Dynamic, |
| Eigen::Dynamic, kEigenRhsOrder>:: |
| template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; |
| using EigenDstType = typename Eigen::Matrix<DstScalar, Eigen::Dynamic, |
| Eigen::Dynamic, kEigenDstOrder>:: |
| template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; |
| using EigenBiasType = |
| typename Eigen::Matrix<DstScalar, Eigen::Dynamic, 1>::ConstMapType; |
| |
| EigenLhsType eigen_lhs(lhs.data.get(), lhs.layout.rows, lhs.layout.cols, |
| Eigen::OuterStride<Eigen::Dynamic>(lhs.layout.stride)); |
| EigenRhsType eigen_rhs(rhs.data.get(), rhs.layout.rows, rhs.layout.cols, |
| Eigen::OuterStride<Eigen::Dynamic>(rhs.layout.stride)); |
| EigenDstType eigen_dst( |
| dst->data.get(), dst->layout.rows, dst->layout.cols, |
| Eigen::OuterStride<Eigen::Dynamic>(dst->layout.stride)); |
| Eigen::setNbThreads(max_num_threads ? max_num_threads : 1); |
| |
| if (spec.bias) { |
| EigenBiasType eigen_bias(spec.bias, dst->layout.rows); |
| if (spec.clamp_max == std::numeric_limits<DstScalar>::infinity() && |
| spec.clamp_min == -std::numeric_limits<DstScalar>::infinity()) { |
| eigen_dst.noalias() = (eigen_lhs * eigen_rhs).colwise() + eigen_bias; |
| } else { |
| eigen_dst.noalias() = ((eigen_lhs * eigen_rhs).colwise() + eigen_bias) |
| .cwiseMin(spec.clamp_max) |
| .cwiseMax(spec.clamp_min); |
| } |
| } else { |
| if (spec.clamp_max == std::numeric_limits<DstScalar>::infinity() && |
| spec.clamp_min == -std::numeric_limits<DstScalar>::infinity()) { |
| eigen_dst.noalias() = eigen_lhs * eigen_rhs; |
| } else { |
| eigen_dst.noalias() = (eigen_lhs * eigen_rhs) |
| .cwiseMin(spec.clamp_max) |
| .cwiseMax(spec.clamp_min); |
| } |
| } |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename DstScalar, |
| typename Spec> |
| void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, |
| const Spec& spec, int max_num_threads, Matrix<DstScalar>* dst) { |
| int index = Mash(lhs.layout.order, rhs.layout.order, dst->layout.order); |
| switch (index) { |
| #define EVALEIGEN_CASE3(LHS, RHS, DST) \ |
| case Mash(LHS, RHS, DST): \ |
| return EvalEigen<LHS, RHS, DST>(lhs, rhs, spec, max_num_threads, dst); |
| #define EVALEIGEN_CASE2(LHS, RHS) \ |
| EVALEIGEN_CASE3(LHS, RHS, Order::kColMajor) \ |
| EVALEIGEN_CASE3(LHS, RHS, Order::kRowMajor) |
| #define EVALEIGEN_CASE1(LHS) \ |
| EVALEIGEN_CASE2(LHS, Order::kColMajor) \ |
| EVALEIGEN_CASE2(LHS, Order::kRowMajor) |
| |
| EVALEIGEN_CASE1(Order::kColMajor) |
| EVALEIGEN_CASE1(Order::kRowMajor) |
| |
| #undef EVALEIGEN_CASE1 |
| #undef EVALEIGEN_CASE2 |
| #undef EVALEIGEN_CASE3 |
| |
| default: |
| RUY_CHECK(false); |
| } |
| } |
| |
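// Evaluates the product with an Eigen Tensor contraction. A row-major
// destination is handled by computing the transposed product, i.e. by
// swapping the roles of LHS and RHS (the 'tr' flag below).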
| template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename Scalar, |
| typename Spec> |
| void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs, |
| const Spec& spec, int max_num_threads, |
| Matrix<Scalar>* dst) { |
| RUY_CHECK_EQ(lhs.zero_point, 0); |
| RUY_CHECK_EQ(rhs.zero_point, 0); |
| RUY_CHECK_EQ(dst->zero_point, 0); |
| RUY_CHECK_EQ(spec.multiplier_fixedpoint, 0); |
| RUY_CHECK_EQ(spec.multiplier_exponent, 0); |
| |
| // Eigen::TensorMap only supports packed layouts |
| RUY_CHECK(IsPacked(lhs.layout)); |
| RUY_CHECK(IsPacked(rhs.layout)); |
| RUY_CHECK(IsPacked(dst->layout)); |
| |
| using TensorLhsType = |
| Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>; |
| using TensorRhsType = |
| Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>; |
| using TensorDstType = |
| Eigen::TensorMap<Eigen::Tensor<Scalar, 2, Eigen::ColMajor>>; |
| using TensorBiasType = |
| Eigen::TensorMap<Eigen::Tensor<const Scalar, 1, Eigen::ColMajor>>; |
| |
| const bool tr = DstOrder == Order::kRowMajor; |
| const auto& contract_lhs = tr ? rhs : lhs; |
| const auto& contract_rhs = tr ? lhs : rhs; |
| |
| TensorLhsType tensor_lhs( |
| contract_lhs.data.get(), |
| LhsOrder == Order::kColMajor ? contract_lhs.layout.rows |
| : contract_lhs.layout.cols, |
| LhsOrder == Order::kColMajor ? contract_lhs.layout.cols |
| : contract_lhs.layout.rows); |
| TensorRhsType tensor_rhs( |
| contract_rhs.data.get(), |
| RhsOrder == Order::kColMajor ? contract_rhs.layout.rows |
| : contract_rhs.layout.cols, |
| RhsOrder == Order::kColMajor ? contract_rhs.layout.cols |
| : contract_rhs.layout.rows); |
| TensorDstType tensor_dst( |
| dst->data.get(), |
| DstOrder == Order::kColMajor ? dst->layout.rows : dst->layout.cols, |
| DstOrder == Order::kColMajor ? dst->layout.cols : dst->layout.rows); |
| using DimPair = |
| typename Eigen::Tensor<Scalar, 1, 0, Eigen::Index>::DimensionPair; |
| Eigen::array<DimPair, 1> contract_dims( |
| {DimPair((LhsOrder == Order::kColMajor) ? 1 : 0, |
| (RhsOrder == Order::kColMajor) ? 0 : 1)}); |
| Eigen::array<int, 2> shuffle(DstOrder == Order::kColMajor ? 0 : 1, |
| DstOrder == Order::kColMajor ? 1 : 0); |
| static Eigen::ThreadPool pool(max_num_threads ? max_num_threads : 1); |
| static Eigen::ThreadPoolDevice device(&pool, pool.NumThreads()); |
| if (spec.bias) { |
| TensorBiasType tensor_bias(spec.bias, dst->layout.rows); |
| Eigen::array<int, 2> bias_2d_shape(tr ? 1 : dst->layout.rows, |
| tr ? dst->layout.rows : 1); |
| Eigen::array<int, 2> bcast(tr ? dst->layout.cols : 1, |
| tr ? 1 : dst->layout.cols); |
| if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() && |
| spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) { |
      tensor_dst.device(device) =
          tensor_lhs.contract(tensor_rhs, contract_dims) +
          tensor_bias.reshape(bias_2d_shape).broadcast(bcast);
| } else { |
| tensor_dst.device(device) = |
| (tensor_lhs.contract(tensor_rhs, contract_dims) + |
| tensor_bias.reshape(bias_2d_shape).broadcast(bcast)) |
| .cwiseMin(spec.clamp_max) |
| .cwiseMax(spec.clamp_min); |
| } |
| } else { |
| if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() && |
| spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) { |
| tensor_dst.device(device) = |
| tensor_lhs.contract(tensor_rhs, contract_dims); |
| } else { |
| tensor_dst.device(device) = tensor_lhs.contract(tensor_rhs, contract_dims) |
| .cwiseMin(spec.clamp_max) |
| .cwiseMax(spec.clamp_min); |
| } |
| } |
| } |
| |
| template <typename Scalar, typename Spec> |
| void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs, |
| const Spec& spec, int max_num_threads, |
| Matrix<Scalar>* dst) { |
| int index = Mash(lhs.layout.order, rhs.layout.order, dst->layout.order); |
| switch (index) { |
| #define EVALEIGENTENSOR_CASE3(LHS, RHS, DST) \ |
| case Mash(LHS, RHS, DST): \ |
| return EvalEigenTensor<LHS, RHS, DST>(lhs, rhs, spec, max_num_threads, dst); |
| #define EVALEIGENTENSOR_CASE2(LHS, RHS) \ |
| EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kColMajor) \ |
| EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kRowMajor) |
| #define EVALEIGENTENSOR_CASE1(LHS) \ |
| EVALEIGENTENSOR_CASE2(LHS, Order::kColMajor) \ |
| EVALEIGENTENSOR_CASE2(LHS, Order::kRowMajor) |
| |
| EVALEIGENTENSOR_CASE1(Order::kColMajor) |
| EVALEIGENTENSOR_CASE1(Order::kRowMajor) |
| |
| #undef EVALEIGENTENSOR_CASE1 |
| #undef EVALEIGENTENSOR_CASE2 |
| #undef EVALEIGENTENSOR_CASE3 |
| |
| default: |
| RUY_CHECK(false); |
| } |
| } |
| |
| template <typename Scalar> |
| struct GenericBlasGemm {}; |
| |
| template <> |
| struct GenericBlasGemm<lapack::doublereal> { |
| static void Run(char* transa, char* transb, lapack::integer* m, |
| lapack::integer* n, lapack::integer* k, |
| lapack::doublereal* alpha, lapack::doublereal* a, |
| lapack::integer* lda, lapack::doublereal* b, |
| lapack::integer* ldb, lapack::doublereal* beta, |
| lapack::doublereal* c, lapack::integer* ldc) { |
| dgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); |
| } |
| }; |
| |
| template <> |
| struct GenericBlasGemm<lapack::real> { |
| static void Run(char* transa, char* transb, lapack::integer* m, |
| lapack::integer* n, lapack::integer* k, lapack::real* alpha, |
| lapack::real* a, lapack::integer* lda, lapack::real* b, |
| lapack::integer* ldb, lapack::real* beta, lapack::real* c, |
| lapack::integer* ldc) { |
| sgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); |
| } |
| }; |
| |
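// Evaluates the product through a generic BLAS gemm call. BLAS only handles
// column-major storage, so row-major operands are first reduced to the
// all-column-major case by cheap pointer-level transposition; bias addition
// and clamping are then applied with Eigen.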
| template <typename Scalar, typename Spec> |
| void EvalOpenBlas(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs, |
| const Spec& spec, int max_num_threads, Matrix<Scalar>* dst) { |
| RUY_CHECK_EQ(lhs.zero_point, 0); |
| RUY_CHECK_EQ(rhs.zero_point, 0); |
| RUY_CHECK_EQ(dst->zero_point, 0); |
| RUY_CHECK_EQ(spec.multiplier_fixedpoint, 0); |
| RUY_CHECK_EQ(spec.multiplier_exponent, 0); |
| |
| Matrix<Scalar> gemm_lhs; |
| Matrix<Scalar> gemm_rhs; |
| Matrix<Scalar> gemm_dst; |
| gemm_dst = *dst; |
| |
| // Use Transpose to reduce to the all-column-major case. |
| // Notice that ruy::Matrix merely holds a pointer, does not own data, |
| // so Transpose is cheap -- no actual matrix data is being transposed here. |
| if (dst->layout.order == Order::kColMajor) { |
| gemm_lhs = lhs; |
| gemm_rhs = rhs; |
| } else { |
| gemm_lhs = rhs; |
| gemm_rhs = lhs; |
| Transpose(&gemm_lhs); |
| Transpose(&gemm_rhs); |
| Transpose(&gemm_dst); |
| } |
| bool transposed_lhs = false; |
| bool transposed_rhs = false; |
| |
| if (gemm_lhs.layout.order == Order::kRowMajor) { |
| Transpose(&gemm_lhs); |
| transposed_lhs = true; |
| } |
| if (gemm_rhs.layout.order == Order::kRowMajor) { |
| Transpose(&gemm_rhs); |
| transposed_rhs = true; |
| } |
| |
| RUY_CHECK(gemm_lhs.layout.order == Order::kColMajor); |
| RUY_CHECK(gemm_rhs.layout.order == Order::kColMajor); |
| RUY_CHECK(gemm_dst.layout.order == Order::kColMajor); |
| |
| char transa = transposed_lhs ? 'T' : 'N'; |
| char transb = transposed_rhs ? 'T' : 'N'; |
  // Use the lapack typedefs and the Scalar type so that the pointers passed
  // below match GenericBlasGemm<Scalar>::Run for both float and double.
  lapack::integer m = gemm_lhs.layout.rows;
  lapack::integer n = gemm_rhs.layout.cols;
  lapack::integer k = gemm_lhs.layout.cols;
  Scalar alpha = 1;
  Scalar* a = gemm_lhs.data.get();
  lapack::integer lda = gemm_lhs.layout.stride;
  Scalar* b = gemm_rhs.data.get();
  lapack::integer ldb = gemm_rhs.layout.stride;
  Scalar beta = 0;
  Scalar* c = gemm_dst.data.get();
  lapack::integer ldc = gemm_dst.layout.stride;
| GenericBlasGemm<Scalar>::Run(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, |
| &ldb, &beta, c, &ldc); |
| |
| // BLAS does not allow us to express the bias-addition and clamping, so |
| // we use Eigen for that. |
| |
| using EigenDstType = |
| typename Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>:: |
| template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; |
| using EigenBiasType = |
| typename Eigen::Matrix<Scalar, Eigen::Dynamic, 1>::ConstMapType; |
| |
| EigenDstType eigen_dst( |
| gemm_dst.data.get(), gemm_dst.layout.rows, gemm_dst.layout.cols, |
| Eigen::OuterStride<Eigen::Dynamic>(gemm_dst.layout.stride)); |
| Eigen::setNbThreads(max_num_threads ? max_num_threads : 1); |
| |
| if (spec.bias) { |
| EigenBiasType eigen_bias(spec.bias, dst->layout.rows); |
| if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() && |
| spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) { |
| eigen_dst.noalias() = eigen_dst.colwise() + eigen_bias; |
| } else { |
| eigen_dst.noalias() = (eigen_dst.colwise() + eigen_bias) |
| .cwiseMin(spec.clamp_max) |
| .cwiseMax(spec.clamp_min); |
| } |
| } else { |
| if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() && |
| spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) { |
| } else { |
| eigen_dst.noalias() = |
| eigen_dst.cwiseMin(spec.clamp_max).cwiseMax(spec.clamp_min); |
| } |
| } |
| } |
| |
| template <typename TestSetType> |
| struct SupportsGemmlowp { |
| static constexpr bool kValue = |
| std::is_same<typename TestSetType::LhsScalar, std::uint8_t>::value && |
| std::is_same<typename TestSetType::RhsScalar, std::uint8_t>::value; |
| }; |
| |
| template <typename TestSetType> |
| struct UsesSingleScalarType { |
| static constexpr bool kValue = |
| std::is_same<typename TestSetType::DstScalar, |
| typename TestSetType::LhsScalar>::value && |
| std::is_same<typename TestSetType::DstScalar, |
| typename TestSetType::RhsScalar>::value && |
| std::is_same<typename TestSetType::DstScalar, |
| typename TestSetType::AccumScalar>::value; |
| }; |
| |
| template <typename TestSetType, |
| bool IsFloatingPoint = |
| std::is_floating_point<typename TestSetType::AccumScalar>::value, |
| bool EnableGemmlowp = SupportsGemmlowp<TestSetType>::kValue, |
| bool SingleScalarType = UsesSingleScalarType<TestSetType>::kValue> |
| struct EvalExternalPathImpl { |
| using DstScalar = typename TestSetType::DstScalar; |
| static void Run(TestSetType*, TestResult<DstScalar>*) { RUY_CHECK(false); } |
| }; |
| |
| template <typename TestSetType> |
| struct EvalExternalPathImpl<TestSetType, true, false, true> { |
| using DstScalar = typename TestSetType::DstScalar; |
| static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) { |
| if (test_result->external_path == ExternalPath::kEigen) { |
| EvalEigen(test_set->lhs.matrix, test_set->rhs.matrix, test_set->spec, |
| test_set->max_num_threads, &test_result->storage_matrix.matrix); |
| } else if (test_result->external_path == ExternalPath::kEigenTensor) { |
| EvalEigenTensor(test_set->lhs.matrix, test_set->rhs.matrix, |
| test_set->spec, test_set->max_num_threads, |
| &test_result->storage_matrix.matrix); |
| } else if (test_result->external_path == ExternalPath::kOpenBlas) { |
| EvalOpenBlas(test_set->lhs.matrix, test_set->rhs.matrix, test_set->spec, |
| test_set->max_num_threads, |
| &test_result->storage_matrix.matrix); |
| } else { |
| RUY_CHECK(false); |
| } |
| } |
| }; |
| |
| template <typename TestSetType, bool SingleScalarType> |
| struct EvalExternalPathImpl<TestSetType, false, true, SingleScalarType> { |
| using DstScalar = typename TestSetType::DstScalar; |
| static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) { |
| if (test_result->external_path == ExternalPath::kGemmlowp) { |
| EvalGemmlowp(test_set->lhs.matrix, test_set->rhs.matrix, test_set->spec, |
| test_set->max_num_threads, |
| &test_result->storage_matrix.matrix); |
| } else { |
| RUY_CHECK(false); |
| } |
| } |
| }; |
| |
| template <typename TestSetType> |
| void EvalExternalPath( |
| TestSetType* test_set, |
| TestResult<typename TestSetType::DstScalar>* test_result) { |
| EvalExternalPathImpl<TestSetType>::Run(test_set, test_result); |
| } |
| |
| #endif // RUY_TEST_EXTERNAL_PATHS |
| |
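// Returns whether the two matrices agree elementwise within tolerances. For
// floating-point, both the max and the mean absolute difference are bounded
// relative to the largest entry magnitude and to sqrt(depth); for integer
// results with native rounding enabled, off-by-one differences are tolerated;
// otherwise agreement must be exact.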
| template <typename Scalar> |
| bool Agree(const Matrix<Scalar>& matrix1, const Matrix<Scalar>& matrix2, |
| int depth) { |
| RUY_CHECK_EQ(matrix1.layout.rows, matrix2.layout.rows); |
| RUY_CHECK_EQ(matrix1.layout.cols, matrix2.layout.cols); |
| RUY_CHECK_EQ(matrix1.zero_point, matrix2.zero_point); |
| const int size = matrix1.layout.rows * matrix1.layout.cols; |
| double tolerated_max_diff = 0; |
| double tolerated_mean_diff = 0; |
| if (std::is_floating_point<Scalar>::value) { |
    // Tolerate absolute errors that scale with the largest entry magnitude
    // and grow roughly like sqrt(depth), per the central limit theorem.
| double max_abs_val = 0; |
| for (int row = 0; row < matrix1.layout.rows; row++) { |
| for (int col = 0; col < matrix1.layout.cols; col++) { |
| max_abs_val = |
| std::max(max_abs_val, |
| std::abs(static_cast<double>(Element(matrix1, row, col)))); |
| max_abs_val = |
| std::max(max_abs_val, |
| std::abs(static_cast<double>(Element(matrix2, row, col)))); |
| } |
| } |
| tolerated_max_diff = max_abs_val * std::numeric_limits<Scalar>::epsilon() * |
| 4 * std::sqrt(static_cast<float>(depth)); |
| tolerated_mean_diff = tolerated_max_diff / std::sqrt(size); |
| } else if (RUY_OPT_ENABLED(RUY_OPT_NATIVE_ROUNDING)) { |
| tolerated_max_diff = 1; |
| // totally empirical |
| tolerated_mean_diff = std::min(1.0, 2.0 * std::pow(size, -0.2)); |
| } |
| double sum_diff = 0; |
| for (int row = 0; row < matrix1.layout.rows; row++) { |
| for (int col = 0; col < matrix1.layout.cols; col++) { |
| double elem1 = Element(matrix1, row, col); |
| double elem2 = Element(matrix2, row, col); |
| double diff = elem1 - elem2; |
| |
| sum_diff += diff; |
| // Test (std::abs(diff) > tolerated_max_diff), but also true if diff is |
| // NaN. |
| if (!(std::abs(diff) <= tolerated_max_diff)) { |
| return false; |
| } |
| } |
| } |
| double mean_diff = sum_diff / size; |
| if (std::abs(mean_diff) > tolerated_mean_diff) { |
| return false; |
| } |
| return true; |
| } |
| |
| template <typename Scalar> |
| bool Agree(const StorageMatrix<Scalar>& storage_matrix1, |
| const StorageMatrix<Scalar>& storage_matrix2, int depth) { |
| VerifyConsistentFields(storage_matrix1); |
| VerifyConsistentFields(storage_matrix2); |
| return Agree(storage_matrix1.matrix, storage_matrix2.matrix, depth); |
| } |
| |
| template <typename Scalar> |
| bool Agree(const TestResult<Scalar>& result1, const TestResult<Scalar>& result2, |
| int depth) { |
| return Agree(result1.storage_matrix, result2.storage_matrix, depth); |
| } |
| |
| struct Stats { |
| double median; |
| double mean; |
| double min; |
| double max; |
| }; |
| |
| std::string StatsAsString(const Stats& stats) { |
| char buf[256]; |
| snprintf(buf, sizeof(buf), "(median = %g, mean = %g, min = %g, max = %g)", |
| stats.median, stats.mean, stats.min, stats.max); |
| return std::string(buf); |
| } |
| |
| template <typename Scalar> |
| void GetMatrixStats(const Matrix<Scalar>& matrix, Stats* stats) { |
| double min = std::numeric_limits<double>::infinity(); |
| double max = -std::numeric_limits<double>::infinity(); |
| double sum = 0; |
| std::vector<double> allvals; |
| for (int row = 0; row < matrix.layout.rows; row++) { |
| for (int col = 0; col < matrix.layout.cols; col++) { |
| double val = Element(matrix, row, col); |
| min = std::min(min, val); |
| max = std::max(max, val); |
| sum += val; |
| allvals.push_back(val); |
| } |
| } |
| std::sort(allvals.begin(), allvals.end()); |
| stats->min = min; |
| stats->max = max; |
| stats->mean = sum / allvals.size(); |
| stats->median = allvals[allvals.size() / 2]; |
| } |
| |
| struct ErrorAnalysis { |
| Stats stats_good; |
| Stats stats_bad; |
| // The below is to help document departure from bit exactness. It's probably |
| // not going to be relevant to floating-point. |
| std::set<int> error_rows; |
| std::set<int> error_cols; |
| int row_of_first_error = 0; |
| int col_of_first_error = 0; |
| double first_error_good_value = 0; |
| double first_error_bad_value = 0; |
| }; |
| |
| template <typename TestSetType> |
| void AnalyzeTestError(const TestSetType& test_set, int first_bad_result_index, |
| ErrorAnalysis* error_analysis) { |
| const auto& good_matrix = test_set.results[0]->storage_matrix.matrix; |
| const auto& bad_matrix = |
| test_set.results[first_bad_result_index]->storage_matrix.matrix; |
| GetMatrixStats(good_matrix, &error_analysis->stats_good); |
| GetMatrixStats(bad_matrix, &error_analysis->stats_bad); |
| bool found_first_error = false; |
| for (int row = 0; row < good_matrix.layout.rows; row++) { |
| for (int col = 0; col < good_matrix.layout.cols; col++) { |
| if (Element(good_matrix, row, col) != Element(bad_matrix, row, col)) { |
| if (!found_first_error) { |
| found_first_error = true; |
| error_analysis->row_of_first_error = row; |
| error_analysis->col_of_first_error = col; |
| error_analysis->first_error_good_value = |
| Element(good_matrix, row, col); |
| error_analysis->first_error_bad_value = Element(bad_matrix, row, col); |
| } |
| error_analysis->error_rows.insert(row); |
| error_analysis->error_cols.insert(col); |
| } |
| } |
| } |
| } |
| |
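// Runs the reference path with an accumulator-typed destination to find the
// minimum and maximum accumulator values (after bias addition, before any
// output multiplier is applied).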
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void ComputeAccumRangeBeforeMultiplier( |
| const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, |
| const SpecType& spec, typename SpecType::AccumScalar* accum_min, |
| typename SpecType::AccumScalar* accum_max) { |
| Context context; |
| context.SetRuntimeEnabledPaths(Path::kReference); |
| using AccumScalar = typename SpecType::AccumScalar; |
| Matrix<AccumScalar> dst_before_multiplier; |
| MakeSimpleLayout(lhs.layout.rows, rhs.layout.cols, Order::kColMajor, |
| &dst_before_multiplier.layout); |
| const int size = FlatSize(dst_before_multiplier.layout); |
| std::vector<AccumScalar> dst_before_multiplier_data(size); |
| dst_before_multiplier.data = dst_before_multiplier_data.data(); |
| ruy::BasicSpec<AccumScalar, AccumScalar> spec_before_multiplier; |
| spec_before_multiplier.bias = spec.bias; |
| Mul<Path::kReference>(lhs, rhs, spec_before_multiplier, &context, |
| &dst_before_multiplier); |
| *accum_min = *std::min_element(dst_before_multiplier_data.begin(), |
| dst_before_multiplier_data.end()); |
| *accum_max = *std::max_element(dst_before_multiplier_data.begin(), |
| dst_before_multiplier_data.end()); |
| } |
| |
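// Picks an output multiplier that maps the observed accumulator range onto
// the destination type's range around dst_zero_point. Returns 0 for
// floating-point or raw std::int32_t destinations, where no multiplier
// applies.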
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void ComputeReasonableMultiplier(const Matrix<LhsScalar>& lhs, |
| const Matrix<RhsScalar>& rhs, |
| typename SpecType::DstScalar dst_zero_point, |
| const SpecType& spec, double* multiplier) { |
| using AccumScalar = typename SpecType::AccumScalar; |
| using DstScalar = typename SpecType::DstScalar; |
| if (std::is_floating_point<DstScalar>::value || |
| std::is_same<DstScalar, std::int32_t>::value) { |
| *multiplier = 0; |
| return; |
| } |
| if (getenv("QUICK_BENCHMARK")) { |
| *multiplier = static_cast<double>(std::numeric_limits<DstScalar>::max()) / |
| (static_cast<double>(lhs.layout.cols) * |
| std::numeric_limits<LhsScalar>::max() * |
| std::numeric_limits<RhsScalar>::max()); |
| return; |
| } |
| AccumScalar accum_min; |
| AccumScalar accum_max; |
| ComputeAccumRangeBeforeMultiplier(lhs, rhs, spec, &accum_min, &accum_max); |
| accum_min = std::min(accum_min, 0); |
| accum_max = std::max(accum_max, 0); |
| const double dst_pos_range_width = |
| static_cast<double>(std::numeric_limits<DstScalar>::max()) - |
| dst_zero_point; |
| const double dst_neg_range_width = |
| dst_zero_point - |
| static_cast<double>(std::numeric_limits<DstScalar>::lowest()); |
| if (accum_max == 0 && accum_min == 0) { |
| *multiplier = 1; |
| } else if (std::abs(accum_max) * dst_pos_range_width > |
| std::abs(accum_min) * dst_neg_range_width) { |
| *multiplier = dst_pos_range_width / accum_max; |
| } else { |
| *multiplier = dst_neg_range_width / -accum_min; |
| } |
| RUY_CHECK_GT(*multiplier, 0.0); |
| } |
| |
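// Decomposes a positive multiplier into the Q31 fixed-point mantissa and
// power-of-two exponent representation used by the quantized output pipeline.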
| void QuantizeMultiplier(double multiplier_double, |
| std::int32_t* multiplier_fixedpoint, |
| int* multiplier_exponent) { |
| RUY_CHECK_GT(multiplier_double, 0); |
| const double q = std::frexp(multiplier_double, multiplier_exponent); |
| auto q_fixed = static_cast<std::int64_t>(std::round(q * (1ll << 31))); |
| RUY_CHECK_LE(q_fixed, (1ll << 31)); |
| if (q_fixed == (1ll << 31)) { |
| q_fixed /= 2; |
| ++*multiplier_exponent; |
| } |
| RUY_CHECK_LE(q_fixed, std::numeric_limits<std::int32_t>::max()); |
| *multiplier_fixedpoint = static_cast<std::int32_t>(q_fixed); |
| } |
| |
| template <typename TestSetType> |
| void SwitchMultiplierToPerChannel(TestSetType* test_set) { |
| test_set->per_channel_multiplier_fixedpoint.resize(test_set->rows); |
| test_set->per_channel_multiplier_exponent.resize(test_set->rows); |
| for (int i = 0; i < test_set->rows; i++) { |
| // multipliers typically range in [2^30 ; 2^31 - 1]. |
| // Values in [0, 2^30 - 1] are normally unused, but harmless. |
| // Thus a good way to randomize multipliers is to subtract from them |
| // a random value smaller than 2^30 but still significant compared to it. |
| std::int32_t nudged_multiplier = test_set->spec.multiplier_fixedpoint - |
| (global_random_engine()() % (1 << 26)); |
| int nudged_exponent = |
| test_set->spec.multiplier_exponent - 1 + (global_random_engine()() % 4); |
| test_set->per_channel_multiplier_fixedpoint[i] = nudged_multiplier; |
| test_set->per_channel_multiplier_exponent[i] = nudged_exponent; |
| } |
| test_set->spec.multiplier_fixedpoint_perchannel = |
| test_set->per_channel_multiplier_fixedpoint.data(); |
| test_set->spec.multiplier_exponent_perchannel = |
| test_set->per_channel_multiplier_exponent.data(); |
| test_set->spec.multiplier_fixedpoint = 0; |
| test_set->spec.multiplier_exponent = 0; |
| } |
| |
| template < |
| typename TestSetType, |
| bool IsApplicable = |
| std::is_same<typename TestSetType::AccumScalar, std::int32_t>::value && |
| !std::is_same<typename TestSetType::DstScalar, std::int32_t>::value> |
| struct MakeSpecMultiplierFieldsImpl {}; |
| |
| template <typename TestSetType> |
| struct MakeSpecMultiplierFieldsImpl<TestSetType, true> { |
| static void Run(TestSetType* test_set) { |
| double multiplier; |
| ComputeReasonableMultiplier(test_set->lhs.matrix, test_set->rhs.matrix, |
| test_set->dst_zero_point, test_set->spec, |
| &multiplier); |
| QuantizeMultiplier(multiplier, &test_set->spec.multiplier_fixedpoint, |
| &test_set->spec.multiplier_exponent); |
| if (!test_set->benchmark) { |
| test_set->perchannel = global_random_engine()() & 1; |
| } |
| if (test_set->perchannel) { |
| SwitchMultiplierToPerChannel(test_set); |
| } |
| } |
| }; |
| |
| template <typename TestSetType> |
| struct MakeSpecMultiplierFieldsImpl<TestSetType, false> { |
| static void Run(TestSetType* test_set) { |
| test_set->spec.multiplier_fixedpoint = 0; |
| test_set->spec.multiplier_exponent = 0; |
| } |
| }; |
| |
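// Chooses spec->clamp_min/clamp_max by first computing the unclamped output
// with the reference path, then placing the bounds so that a kClampRatio
// fraction of the entries at each end of the sorted output distribution gets
// clamped. The benchmark-only environment variables bypass this with trivial
// bounds.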
| template <typename LhsScalar, typename RhsScalar, typename Spec> |
| void MakeSpecClampFields(const Matrix<LhsScalar>& lhs, |
| const Matrix<RhsScalar>& rhs, |
| typename Spec::DstScalar dst_zero_point, Spec* spec) { |
| using AccumScalar = typename Spec::AccumScalar; |
| using DstScalar = typename Spec::DstScalar; |
| |
| if (getenv("BENCHMARK_ONLY_MATMUL")) { |
| spec->clamp_min = -std::numeric_limits<DstScalar>::infinity(); |
| spec->clamp_max = std::numeric_limits<DstScalar>::infinity(); |
| return; |
| } |
| |
| if (getenv("QUICK_BENCHMARK")) { |
| spec->clamp_min = std::numeric_limits<DstScalar>::lowest() + 1; |
| spec->clamp_max = std::numeric_limits<DstScalar>::max() - 1; |
| return; |
| } |
| Context context; |
| context.SetRuntimeEnabledPaths(Path::kReference); |
| Matrix<DstScalar> unclamped_dst; |
| MakeSimpleLayout(lhs.layout.rows, rhs.layout.cols, Order::kColMajor, |
| &unclamped_dst.layout); |
| unclamped_dst.zero_point = dst_zero_point; |
| const int size = FlatSize(unclamped_dst.layout); |
| std::vector<DstScalar> unclamped_dst_data(size); |
| unclamped_dst.data = unclamped_dst_data.data(); |
| ruy::BasicSpec<AccumScalar, DstScalar> spec_unclamped; |
| spec_unclamped.bias = spec->bias; |
| spec_unclamped.multiplier_fixedpoint = spec->multiplier_fixedpoint; |
| spec_unclamped.multiplier_exponent = spec->multiplier_exponent; |
| spec_unclamped.multiplier_fixedpoint_perchannel = |
| spec->multiplier_fixedpoint_perchannel; |
| spec_unclamped.multiplier_exponent_perchannel = |
| spec->multiplier_exponent_perchannel; |
| Mul<Path::kReference>(lhs, rhs, spec_unclamped, &context, &unclamped_dst); |
| // If dst is std::int32_t, no need to set the clamp min/max. |
| if (!std::is_same<typename Spec::DstScalar, std::int32_t>::value) { |
| std::sort(unclamped_dst_data.begin(), unclamped_dst_data.end()); |
| const int clamp_count = static_cast<int>(std::floor(kClampRatio * size)); |
| RUY_CHECK_LT(clamp_count, size); |
| spec->clamp_min = unclamped_dst_data[clamp_count]; |
| spec->clamp_max = unclamped_dst_data[size - 1 - clamp_count]; |
| } |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::MakeZeroPoints() { |
| RUY_CHECK(life_stage == LifeStage::kInitial); |
| if (!use_specified_zero_points) { |
| MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &lhs_zero_point); |
| MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &rhs_zero_point); |
| // If destination is std::int32_t, no dst_zero_point is necessary. |
| if (std::is_same<DstScalar, std::int32_t>::value) { |
| dst_zero_point = 0; |
| } else { |
| MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point); |
| } |
| } |
| life_stage = LifeStage::kHasZeroPoints; |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::MakeLhsRhs() { |
| RUY_CHECK(life_stage == LifeStage::kHasZeroPoints); |
| MakeRandom(rows, depth, lhs_order, lhs_zero_point, layout_style, |
| RandomRange::kAvoidMinValue, &lhs); |
| MakeRandom(depth, cols, rhs_order, rhs_zero_point, layout_style, |
| RandomRange::kGeneral, &rhs); |
| life_stage = LifeStage::kHasLhsRhs; |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::MakeSpec() { |
| RUY_CHECK(life_stage == LifeStage::kHasLhsRhs); |
| |
| if (!getenv("BENCHMARK_ONLY_MATMUL") && (global_random_engine()() & 1)) { |
| MakeRandomVector(RandomRange::kBias, rows, &bias_data); |
| spec.bias = bias_data.data(); |
| } |
| MakeSpecMultiplierFieldsImpl<TestSet>::Run(this); |
| MakeSpecClampFields(lhs.matrix, rhs.matrix, dst_zero_point, &spec); |
| life_stage = LifeStage::kHasSpec; |
| } |
| |
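// Helpers reading test parameters from environment variables (e.g. THREADS,
// PATHS).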
| inline int GetIntEnvVarOrZero(const char* name) { |
| const char* val = getenv(name); |
| if (!val) { |
| return 0; |
| } |
| return std::stoi(val); |
| } |
| |
| inline float GetFloatEnvVarOrZero(const char* name) { |
| const char* val = getenv(name); |
| if (!val) { |
| return 0; |
| } |
| return std::stof(val); |
| } |
| |
| inline int GetHexIntEnvVarOrZero(const char* name) { |
| const char* val = getenv(name); |
| if (!val) { |
| return 0; |
| } |
| return std::stoi(val, 0, 16); |
| } |
| |
| inline bool GetBoolEnvVarOrFalse(const char* name) { |
| return static_cast<bool>(GetIntEnvVarOrZero(name)); |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::MakeOtherParams() { |
| RUY_CHECK(life_stage == LifeStage::kHasSpec); |
| if (max_num_threads == 0) { |
| max_num_threads = GetIntEnvVarOrZero("THREADS"); |
| } |
| life_stage = LifeStage::kHasOtherParams; |
| } |
| |
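// Expands a Path bitfield into the list of the individual paths whose bits
// are set.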
| std::vector<Path> PathsBitfieldAsVector(Path paths_bitfield) { |
| std::vector<Path> result; |
| std::uint32_t remaining_paths = static_cast<std::uint32_t>(paths_bitfield); |
| std::uint32_t test_bit = 1; |
| while (remaining_paths) { |
| if (remaining_paths & test_bit) { |
| result.push_back(static_cast<Path>(test_bit)); |
| } |
| remaining_paths &= ~test_bit; |
| test_bit <<= 1; |
| } |
| return result; |
| } |
| |
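// Returns the tunings to cover for a given path: the NEON paths are
// additionally tested with explicit in-order and out-of-order tunings;
// benchmarks always use kAuto.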
| std::vector<Tuning> EnumerateTuningsForPath(Path path, bool benchmark) { |
| if (benchmark) { |
| return {Tuning::kAuto}; |
| } |
| if (path == Path::kNeon || path == Path::kNeonDotprod) { |
| return {Tuning::kInOrder, Tuning::kOutOfOrder, Tuning::kAuto}; |
| } |
| return {Tuning::kAuto}; |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::MakePrepackedMatrices() { |
| RUY_CHECK(life_stage == LifeStage::kHasResultPaths); |
| |
| // Prepacked matrices are Path-dependent, so create them for each test result. |
| for (auto& result : results) { |
| // If this result uses an external path, then skip this entirely. |
| if (result->path == Path::kNone) { |
| continue; |
| } |
| // Pre-packing doesn't make sense for Path::kReference. |
| // TODO(silvasean): Make Path::kReference an ExternalPath? |
| if (result->path == Path::kReference) { |
| continue; |
| } |
| |
| // Determine whether we should create/use prepacked matrices. |
| if (benchmark) { |
| // For benchmarking, do as requested. |
| result->use_prepacked_lhs = benchmark_prepack_lhs; |
| result->use_prepacked_rhs = benchmark_prepack_rhs; |
| } else { |
|       // When testing, pre-pack at random for roughly 1 in 8 results, to get |
|       // coverage of the prepacked code paths without dominating test time. |
| result->use_prepacked_lhs = (global_random_engine()() & 7) == 0; |
| result->use_prepacked_rhs = (global_random_engine()() & 7) == 0; |
| } |
| |
| // Create the pre-packed matrices. |
| PrepackedMatrix* prepacked_lhs_ptr = |
| result->use_prepacked_lhs ? &result->prepacked_lhs : nullptr; |
| PrepackedMatrix* prepacked_rhs_ptr = |
| result->use_prepacked_rhs ? &result->prepacked_rhs : nullptr; |
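|     // Allocation callback for pre-packed buffers: allocate from this |
|     // result's Allocator so the storage stays valid for the result's |
|     // lifetime. |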
| auto alloc_fn = [&result](std::size_t num_bytes) { |
| return result->allocator.AllocateBytes(num_bytes); |
| }; |
| // Use a dst with a null data pointer to check that the pre-packing |
| // invocation doesn't write into it. |
| Matrix<DstScalar> null_data_dst = result->storage_matrix.matrix; |
| null_data_dst.data = nullptr; |
| GlobalContext().SetRuntimeEnabledPaths(result->path); |
| PrePackForMul<kAllPaths>(lhs.matrix, rhs.matrix, spec, &GlobalContext(), |
| &null_data_dst, prepacked_lhs_ptr, |
| prepacked_rhs_ptr, alloc_fn); |
| RUY_CHECK(GlobalContext().last_taken_path == result->path); |
| } |
| |
| life_stage = LifeStage::kHasPrepackedMatrices; |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() { |
| RUY_CHECK(life_stage == LifeStage::kHasOtherParams); |
| |
| Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("PATHS")); |
| |
| if (paths_bitfield == Path::kNone) { |
| // Use a dummy Context just to perform the resolution of specific runtime |
| // enabled paths. |
| Context context; |
| paths_bitfield = context.GetRuntimeEnabledPaths(); |
| } |
| |
|   // Trim bits that don't correspond to a compiled path, so that e.g. |
|   // PATHS=ffff means 'all compiled paths' regardless of whether every one |
|   // of those bits exists as an actual path. |
| paths_bitfield = paths_bitfield & kAllPaths; |
| RUY_CHECK(paths_bitfield != Path::kNone); |
| paths = PathsBitfieldAsVector(paths_bitfield); |
| |
| #ifdef RUY_TEST_EXTERNAL_PATHS |
| |
| using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>; |
| |
| if (!getenv("NOEXT")) { |
| if (SupportsGemmlowp<TestSetType>::kValue) { |
| #ifdef GEMMLOWP_SSE4 |
| const bool gemmlowp_supported = !spec.multiplier_fixedpoint_perchannel; |
| #else |
| const bool gemmlowp_supported = true; |
| #endif |
| if (gemmlowp_supported) { |
| external_paths.push_back(ExternalPath::kGemmlowp); |
| } |
| } |
| if (UsesSingleScalarType<TestSetType>::kValue && |
| std::is_floating_point<AccumScalar>::value) { |
| external_paths.push_back(ExternalPath::kEigen); |
| if (layout_style == LayoutStyle::kPackedLinear) { |
| external_paths.push_back(ExternalPath::kEigenTensor); |
| } |
| // We link against a generic BLAS target that only maps to OpenBLAS on specific |
| // architectures. |
| #if RUY_PLATFORM(ARM_32) || RUY_PLATFORM(ARM_64) |
| // OpenBLAS multi-threading is disabled, so avoid mixing single-threaded |
| // and multi-threaded benchmark results. |
| if (max_num_threads == 1) { |
| external_paths.push_back(ExternalPath::kOpenBlas); |
| } |
| #endif |
| } |
| } |
| |
| #endif // RUY_TEST_EXTERNAL_PATHS |
| |
| for (Path path : paths) { |
| for (Tuning tuning : EnumerateTuningsForPath(path, benchmark)) { |
| results.emplace_back(new TestResultType); |
| TestResultType& result = *results.back(); |
| result.path = path; |
| result.tuning = tuning; |
| MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, |
| RandomRange::kGeneral, &result.storage_matrix); |
| } |
| } |
| |
| for (ExternalPath external_path : external_paths) { |
| results.emplace_back(new TestResultType); |
| TestResultType& result = *results.back(); |
| result.external_path = external_path; |
| MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, |
| RandomRange::kGeneral, &result.storage_matrix); |
| } |
| |
| life_stage = LifeStage::kHasResultPaths; |
| } |
| |
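| // Evaluates a single TestResult, dispatching either to ruy or to an |
| // external path, and records the path's name in the global coverage list. |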
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::EvalResult( |
| TestResult<typename SpecType::DstScalar>* result) { |
| RUY_CHECK(result->path != Path::kNone || |
| result->external_path != ExternalPath::kNone); |
| if (result->path != Path::kNone) { |
| EvalRuy(result); |
| } else { |
| #ifdef RUY_TEST_EXTERNAL_PATHS |
| using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>; |
| EvalExternalPath(this, result); |
| #endif |
| } |
| const std::string& pathname = PathName(*result); |
| if (std::find(CoveredPaths()->begin(), CoveredPaths()->end(), pathname) == |
| CoveredPaths()->end()) { |
| CoveredPaths()->push_back(pathname); |
| } |
| } |
| |
| using f32 = float; |
| using f64 = double; |
| using u8 = std::uint8_t; |
| using i8 = std::int8_t; |
| using u16 = std::uint16_t; |
| using i16 = std::int16_t; |
| using u32 = std::uint32_t; |
| using i32 = std::int32_t; |
| using u64 = std::uint64_t; |
| using i64 = std::int64_t; |
| |
| template <typename Scalar> |
| const char* TypeName() { |
| return nullptr; |
| } |
| |
| #define RUY_TYPENAME(TYPE) \ |
| template <> \ |
| const char* TypeName<TYPE>() { \ |
| return #TYPE; \ |
| } |
| |
| RUY_TYPENAME(f32) |
| RUY_TYPENAME(f64) |
| RUY_TYPENAME(u8) |
| RUY_TYPENAME(i8) |
| RUY_TYPENAME(u16) |
| RUY_TYPENAME(i16) |
| RUY_TYPENAME(u32) |
| RUY_TYPENAME(i32) |
| RUY_TYPENAME(u64) |
| RUY_TYPENAME(i64) |
| |
| #undef RUY_TYPENAME |
| |
| template <typename Scalar> |
| const char* SymmetryName(const Matrix<Scalar>& matrix) { |
| if (matrix.zero_point == SymmetricZeroPoint<Scalar>()) { |
| return "symm"; |
| } else { |
| return "asymm"; |
| } |
| } |
| |
| template <typename Scalar> |
| int StorageSize(const Matrix<Scalar>& matrix) { |
| return sizeof(Scalar) * FlatSize(matrix.layout); |
| } |
| |
| // Helper that replicates a buffer and gives out pointers to the replicas. |
| // This is useful when one wants to traverse data so that it is cold in cache. |
| // By having a sufficiently large value of num_repeats, one can ensure that the |
| // working set covered by the replicas is greater than the cache size. |
| template <typename T> |
| class RepeatedBuffer { |
| public: |
| RepeatedBuffer() = default; |
| void Init(const T* elems, std::size_t num_elems, int num_repeats) { |
| buffers_.clear(); |
| allocator_.FreeAll(); |
| for (int i = 0; i < num_repeats; i++) { |
| T* p; |
| allocator_.Allocate(num_elems, &p); |
| memcpy(p, elems, num_elems * sizeof(T)); |
| buffers_.push_back(p); |
| } |
| } |
| T* Next() { |
| T* ret = buffers_[current_]; |
| current_ = (current_ + 1) % buffers_.size(); |
| return ret; |
| } |
| |
| private: |
| Allocator allocator_; |
| std::vector<T*> buffers_; |
| int current_ = 0; |
| }; |
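| |
| // Illustrative use of RepeatedBuffer (src_data, src_size, num_iters and |
| // Consume are hypothetical names): |
| //   RepeatedBuffer<float> cold_src; |
| //   cold_src.Init(src_data, src_size, /*num_repeats=*/16); |
| //   for (int i = 0; i < num_iters; i++) { |
| //     Consume(cold_src.Next());  // each call returns a different replica |
| //   } |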
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( |
| TestResult<typename SpecType::DstScalar>* result) { |
| using DstScalar = typename SpecType::DstScalar; |
| |
|   const bool cold = getenv("RUY_BENCHMARK_COLD") != nullptr; |
| LhsScalar* orig_lhs_data = lhs.matrix.data.get(); |
| RhsScalar* orig_rhs_data = rhs.matrix.data.get(); |
| DstScalar* orig_dst_data = result->storage_matrix.matrix.data.get(); |
| void* orig_prepacked_lhs_data = result->prepacked_lhs.data; |
| void* orig_prepacked_rhs_data = result->prepacked_rhs.data; |
| |
| int num_matmul_sets = 0; |
| |
| RepeatedBuffer<LhsScalar> cold_lhs; |
| RepeatedBuffer<RhsScalar> cold_rhs; |
| RepeatedBuffer<DstScalar> cold_dst; |
| RepeatedBuffer<char> cold_prepacked_lhs; |
| RepeatedBuffer<char> cold_prepacked_rhs; |
| |
| if (cold) { |
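|     // Replicate the matrices enough times that the combined working set |
|     // (~100 MiB) comfortably exceeds typical CPU cache sizes, so each |
|     // iteration touches cold data. |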
| const int kWorkingSetSize = 100 << 20; |
| const int each_matmul_set_size = StorageSize(lhs.matrix) + |
| StorageSize(rhs.matrix) + |
| StorageSize(result->storage_matrix.matrix); |
| num_matmul_sets = |
| (kWorkingSetSize + each_matmul_set_size - 1) / each_matmul_set_size; |
| |
| cold_lhs.Init(lhs.matrix.data.get(), FlatSize(lhs.matrix.layout), |
| num_matmul_sets); |
| cold_rhs.Init(rhs.matrix.data.get(), FlatSize(rhs.matrix.layout), |
| num_matmul_sets); |
| cold_dst.Init(result->storage_matrix.matrix.data.get(), |
| FlatSize(result->storage_matrix.matrix.layout), |
| num_matmul_sets); |
| if (benchmark_prepack_lhs) { |
| cold_prepacked_lhs.Init(static_cast<char*>(result->prepacked_lhs.data), |
| result->prepacked_lhs.data_size, num_matmul_sets); |
| } |
| if (benchmark_prepack_rhs) { |
| cold_prepacked_rhs.Init(static_cast<char*>(result->prepacked_rhs.data), |
| result->prepacked_rhs.data_size, num_matmul_sets); |
| } |
| } |
| const bool record_pmu = GetBoolEnvVarOrFalse("RUY_BENCHMARK_PMU"); |
| int repeats = GetIntEnvVarOrZero("RUY_BENCHMARK_REPEATS"); |
| if (!repeats) { |
| repeats = 4; |
| } |
| float benchmark_min_secs = GetFloatEnvVarOrZero("RUY_BENCHMARK_MIN_SECS"); |
| if (!benchmark_min_secs) { |
|     benchmark_min_secs = 0.5f; |
| } |
| #ifdef GEMMLOWP_PROFILING |
| const char* lhstype = TypeName<LhsScalar>(); |
| const char* lhssymm = SymmetryName(lhs.matrix); |
| const char* rhstype = TypeName<RhsScalar>(); |
| const char* rhssymm = SymmetryName(rhs.matrix); |
| |
| printf("Profiling path=%s shape=(%dx%dx%d) lhs=(%s,%s) rhs=(%s,%s)\n", |
| PathName(*result).c_str(), rows, depth, cols, lhstype, lhssymm, |
| rhstype, rhssymm); |
| gemmlowp::RegisterCurrentThreadForProfiling(); |
| gemmlowp::StartProfiling(); |
| #endif |
| |
| float latency = std::numeric_limits<float>::infinity(); |
| float l1_refill_rate = std::numeric_limits<float>::infinity(); |
| float l2_refill_rate = std::numeric_limits<float>::infinity(); |
| float l3_refill_rate = std::numeric_limits<float>::infinity(); |
| float l1tlb_refill_rate = std::numeric_limits<float>::infinity(); |
| float l2tlb_refill_rate = std::numeric_limits<float>::infinity(); |
| float mispred_rate = std::numeric_limits<float>::infinity(); |
| float frontend_stall_rate = std::numeric_limits<float>::infinity(); |
| float backend_stall_rate = std::numeric_limits<float>::infinity(); |
| |
| for (int repeat = 0; repeat < repeats; repeat++) { |
| PmuEvents pmu_events; |
| if (record_pmu) { |
| pmu_events.StartRecording(); |
| } |
| TimePoint time_start = Now(); |
| TimePoint t = time_start; |
| int iters = 0; |
| int iters_at_a_time = 1; |
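|     // Run batches of iterations, doubling the batch size after each pass, |
|     // until the minimum benchmark duration has elapsed. Doubling amortizes |
|     // the cost of the clock reads over more and more iterations. |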
| while (ToFloatSeconds(t - time_start) < benchmark_min_secs) { |
| for (int i = 0; i < iters_at_a_time; i++) { |
| if (cold) { |
| lhs.matrix.data = cold_lhs.Next(); |
| rhs.matrix.data = cold_rhs.Next(); |
| result->storage_matrix.matrix.data = cold_dst.Next(); |
| if (benchmark_prepack_lhs) { |
| result->prepacked_lhs.data = cold_prepacked_lhs.Next(); |
| } |
| if (benchmark_prepack_rhs) { |
| result->prepacked_rhs.data = cold_prepacked_rhs.Next(); |
| } |
| } |
| EvalResult(result); |
| iters++; |
| } |
| iters_at_a_time *= 2; |
| t = Now(); |
| } |
| latency = std::min( |
| latency, static_cast<float>(ToFloatSeconds(t - time_start) / iters)); |
| if (record_pmu) { |
| pmu_events.StopRecording(); |
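|       // Normalize raw PMU counts by the number of multiply-accumulate |
|       // operations performed (iters * rows * cols * depth), giving |
|       // per-MAC event rates. |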
| const float normalization_factor = |
| 1.0f / (static_cast<float>(iters) * rows * cols * depth); |
| l1_refill_rate = std::min( |
| l1_refill_rate, pmu_events.L1RefillCount() * normalization_factor); |
| l2_refill_rate = std::min( |
| l2_refill_rate, pmu_events.L2RefillCount() * normalization_factor); |
| l3_refill_rate = std::min( |
| l3_refill_rate, pmu_events.L3RefillCount() * normalization_factor); |
| l1tlb_refill_rate = |
| std::min(l1tlb_refill_rate, |
| pmu_events.L1TLBRefillCount() * normalization_factor); |
| l2tlb_refill_rate = |
| std::min(l2tlb_refill_rate, |
| pmu_events.L2TLBRefillCount() * normalization_factor); |
| mispred_rate = |
| std::min(mispred_rate, pmu_events.BranchMispredictionCount() * |
| normalization_factor); |
| frontend_stall_rate = |
| std::min(frontend_stall_rate, |
| pmu_events.FrontendStallCount() * normalization_factor); |
| backend_stall_rate = |
| std::min(backend_stall_rate, |
| pmu_events.BackendStallCount() * normalization_factor); |
| } |
| } |
| result->latency = latency; |
| if (record_pmu) { |
| result->l1_refill_rate = l1_refill_rate; |
| result->l2_refill_rate = l2_refill_rate; |
| result->l3_refill_rate = l3_refill_rate; |
| result->l1tlb_refill_rate = l1tlb_refill_rate; |
| result->l2tlb_refill_rate = l2tlb_refill_rate; |
| result->mispred_rate = mispred_rate; |
| result->frontend_stall_rate = frontend_stall_rate; |
| result->backend_stall_rate = backend_stall_rate; |
| } |
| |
| #ifdef GEMMLOWP_PROFILING |
| gemmlowp::FinishProfiling(); |
| fflush(stdout); |
| #endif |
| |
| if (cold) { |
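|     // Restore the original data pointers, copying the dst contents back |
|     // from the last replica used so the computed result is preserved. |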
| lhs.matrix.data = orig_lhs_data; |
| rhs.matrix.data = orig_rhs_data; |
| memcpy(orig_dst_data, result->storage_matrix.matrix.data.get(), |
| StorageSize(result->storage_matrix.matrix)); |
| result->storage_matrix.matrix.data = orig_dst_data; |
| result->prepacked_lhs.data = orig_prepacked_lhs_data; |
| result->prepacked_rhs.data = orig_prepacked_rhs_data; |
| } |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::Eval() { |
| RUY_CHECK(life_stage == LifeStage::kHasPrepackedMatrices); |
| for (auto& result : results) { |
| if (benchmark) { |
| Benchmark(result.get()); |
| } else { |
| EvalResult(result.get()); |
| } |
| } |
| life_stage = LifeStage::kEvaluated; |
| } |
| |
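| // Returns a printable dump of the matrix entries within a radius of 20 |
| // entries around (center_row, center_col). Used by the error report below. |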
| template <typename Scalar> |
| std::string DumpRegion(const Matrix<Scalar>& matrix, int center_row, |
| int center_col) { |
| static constexpr int kRadius = 20; |
| int first_row = std::max(0, center_row - kRadius); |
| int last_row = std::min(matrix.layout.rows - 1, center_row + kRadius); |
| int first_col = std::max(0, center_col - kRadius); |
| int last_col = std::min(matrix.layout.cols - 1, center_col + kRadius); |
| std::ostringstream stream; |
| for (int row = first_row; row <= last_row; row++) { |
| for (int col = first_col; col <= last_col; col++) { |
| stream << static_cast<double>(Element(matrix, row, col)) << " "; |
| } |
| stream << "\n"; |
| } |
| return stream.str(); |
| } |
| |
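| // Checks that all result paths agree with each other. On a mismatch, prints |
| // a detailed error report, including the regions of the good and bad result |
| // matrices around the first differing entry, then fails the test. |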
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::VerifyTestResults() const { |
| const int depth = lhs.matrix.layout.cols; |
|   for (int i = 0; i + 1 < static_cast<int>(results.size()); i++) { |
| if (!Agree(*results[i], *results[i + 1], depth)) { |
| std::string paths_in_agreement; |
| paths_in_agreement.append(PathName(*results[0])); |
| for (int j = 1; j <= i; j++) { |
| paths_in_agreement.append(", "); |
| paths_in_agreement.append(PathName(*results[j])); |
| } |
| ErrorAnalysis error_analysis; |
| AnalyzeTestError(*this, i + 1, &error_analysis); |
| std::cerr << "Error: path (" << PathName(*results[i + 1]) |
| << ") disagrees with the other paths (" << paths_in_agreement |
| << "), which agree with each other." << std::endl; |
| std::cerr << "Shape: rows = " << rows << ", cols = " << cols |
| << ", depth = " << depth << std::endl; |
| std::cerr << "Stats of the good result matrix: " |
| << StatsAsString(error_analysis.stats_good) << std::endl; |
| std::cerr << "Stats of the bad result matrix: " |
| << StatsAsString(error_analysis.stats_bad) << std::endl; |
| if (error_analysis.error_rows.size() < rows) { |
| std::cerr << "Rows containing errors: " |
| << Join(error_analysis.error_rows) << std::endl; |
| } else { |
| std::cerr << "Errors found in ALL rows." << std::endl; |
| } |
| if (error_analysis.error_cols.size() < cols) { |
| std::cerr << "Cols containing errors: " |
| << Join(error_analysis.error_cols) << std::endl; |
| } else { |
| std::cerr << "Errors found in ALL cols." << std::endl; |
| } |
| std::cerr << "The first error occurs at row " |
| << error_analysis.row_of_first_error << ", col " |
| << error_analysis.col_of_first_error << std::endl; |
| std::cerr << "Good value: " << error_analysis.first_error_good_value |
| << std::endl; |
| std::cerr << "Bad value : " << error_analysis.first_error_bad_value |
| << std::endl; |
| std::cerr << "Region of Good result matrix around first error:\n\n" |
| << DumpRegion(results[0]->storage_matrix.matrix, |
| error_analysis.row_of_first_error, |
| error_analysis.col_of_first_error) |
| << std::endl; |
| std::cerr << "Region of Bad result matrix around first error:\n\n" |
| << DumpRegion(results[i + 1]->storage_matrix.matrix, |
| error_analysis.row_of_first_error, |
| error_analysis.col_of_first_error) |
| << std::endl; |
| RUY_CHECK(false); |
| } |
| } |
| } |
| |
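| // Guards against trivially-passing tests. When the first result came from |
| // the reference path, re-runs it without clamping and checks that clamping |
| // affected at most a fraction of about 2 * kClampRatio of the entries, and |
| // that the result matrix is not constant. |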
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::VerifyNonTrivial() const { |
| if (getenv("QUICK_BENCHMARK")) { |
| return; |
| } |
| if (results.front()->path != Path::kReference) { |
| return; |
| } |
| Context context; |
| context.SetRuntimeEnabledPaths(Path::kReference); |
| const auto& dst_storage = results.front()->storage_matrix; |
| const Matrix<DstScalar>& dst = dst_storage.matrix; |
| Matrix<DstScalar> unclamped_dst; |
| unclamped_dst.layout = dst.layout; |
| unclamped_dst.zero_point = dst.zero_point; |
| const int size = FlatSize(unclamped_dst.layout); |
| std::vector<DstScalar> unclamped_dst_data(size); |
| unclamped_dst.data = unclamped_dst_data.data(); |
| ruy::BasicSpec<AccumScalar, DstScalar> spec_unclamped; |
| spec_unclamped.bias = spec.bias; |
| spec_unclamped.multiplier_fixedpoint = spec.multiplier_fixedpoint; |
| spec_unclamped.multiplier_exponent = spec.multiplier_exponent; |
| Mul<Path::kReference>(lhs.matrix, rhs.matrix, spec_unclamped, &context, |
| &unclamped_dst); |
| int count_clamped = 0; |
| bool found_distinct_values = false; |
| for (int row = 0; row < dst.layout.rows; row++) { |
| for (int col = 0; col < dst.layout.cols; col++) { |
| count_clamped += |
| (Element(dst, row, col) != Element(unclamped_dst, row, col)); |
| found_distinct_values |= (Element(dst, row, col) != Element(dst, 0, 0)); |
| } |
| } |
| if (!spec.multiplier_exponent_perchannel) { |
| RUY_CHECK_LE(count_clamped, std::floor(2 * kClampRatio * size)); |
| if (size > 10) { |
| RUY_CHECK(found_distinct_values); |
| } |
| } |
| } |
| |
| template <typename LhsScalar, typename RhsScalar, typename SpecType> |
| void TestSet<LhsScalar, RhsScalar, SpecType>::Verify() { |
| RUY_CHECK(life_stage == LifeStage::kEvaluated); |
| if (expected_outcome == ExpectedOutcome::kSuccess) { |
| VerifyTestResults(); |
| VerifyNonTrivial(); |
| } |
| life_stage = LifeStage::kFinal; |
| } |
| |
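| // Convenience test entry points. 'RCC' names the storage orders of the three |
| // matrices: Row-major LHS, Col-major RHS, Col-major destination. |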
| template <typename TestSetType> |
| void TestRCC(int rows, int depth, int cols, ExpectedOutcome expected_outcome) { |
| TestSetType test_set; |
| test_set.rows = rows; |
| test_set.depth = depth; |
| test_set.cols = cols; |
| test_set.lhs_order = Order::kRowMajor; |
| test_set.rhs_order = Order::kColMajor; |
| test_set.dst_order = Order::kColMajor; |
| test_set.layout_style = LayoutStyle::kPackedLinear; |
| test_set.expected_outcome = expected_outcome; |
| test_set.Run(); |
| } |
| |
| template <typename TestSetType> |
| void TestRCC(int rows, int depth, int cols) { |
| TestRCC<TestSetType>(rows, depth, cols, ExpectedOutcome::kSuccess); |
| } |
| |
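| // Like TestRCC, but with a column-major LHS, i.e. not the 'RCC' combination. |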
| template <typename TestSetType> |
| void TestNonRCC(int rows, int depth, int cols, |
| ExpectedOutcome expected_outcome) { |
| TestSetType test_set; |
| test_set.rows = rows; |
| test_set.depth = depth; |
| test_set.cols = cols; |
| test_set.lhs_order = Order::kColMajor; |
| test_set.rhs_order = Order::kColMajor; |
| test_set.dst_order = Order::kColMajor; |
| test_set.layout_style = LayoutStyle::kPackedLinear; |
| test_set.expected_outcome = expected_outcome; |
| test_set.Run(); |
| } |
| |
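| // Exercises all 8 combinations of row-major/column-major storage orders for |
| // the LHS, RHS and destination, using LayoutStyle::kLinear rather than |
| // kPackedLinear. |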
| template <typename TestSetType> |
| void TestLinearAllOrders(int rows, int depth, int cols, |
| ExpectedOutcome expected_outcome) { |
| const std::vector<Order> orders{Order::kColMajor, Order::kRowMajor}; |
| |
| for (Order lhs_order : orders) { |
| for (Order rhs_order : orders) { |
| for (Order dst_order : orders) { |
| TestSetType test_set; |
| test_set.rows = rows; |
| test_set.depth = depth; |
| test_set.cols = cols; |
| test_set.lhs_order = lhs_order; |
| test_set.rhs_order = rhs_order; |
| test_set.dst_order = dst_order; |
| test_set.layout_style = LayoutStyle::kLinear; |
| test_set.expected_outcome = expected_outcome; |
| test_set.Run(); |
| } |
| } |
| } |
| } |
| |
| template <typename TestSetType> |
| void TestLinearAllOrders(int rows, int depth, int cols) { |
| TestLinearAllOrders<TestSetType>(rows, depth, cols, |
| ExpectedOutcome::kSuccess); |
| } |
| |
| } // namespace ruy |
| |
| #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_TEST_H_ |