// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// common.h: contains definitions that are used throughout gemmlowp
// and should always be available.
#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_
#include <pthread.h>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <algorithm>
#include "../profiling/instrumentation.h"
#ifdef GEMMLOWP_PROFILING
#include <set>
#include <cstdio>
#include <cstring>
#endif
// Detect NEON. It's important to check for both tokens: depending on the
// toolchain, only one of __ARM_NEON (the ACLE-standard spelling) or the
// older __ARM_NEON__ may be defined.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#ifdef __arm__
#define GEMMLOWP_NEON32
#endif
#ifdef __aarch64__
#define GEMMLOWP_NEON64
#endif
#endif
// Detect SSE.
#if defined __SSE4_2__  // at the moment, our SSE code assumes SSE 4.2
#define GEMMLOWP_SSE
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_SSE32
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_SSE64
#endif
#endif
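// The GEMMLOWP_* tokens defined above are what the rest of gemmlowp's
// internal code checks (e.g. #ifdef GEMMLOWP_NEON) to select the optimized
// packing and kernel paths; when none of them is defined, the generic
// C++ paths are used.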
namespace gemmlowp {
// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime; however,
// 64-byte cache lines are the vast majority, and even if the value is
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;
// Default L1 and L2 data cache sizes. On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
// Nexus 5: L1 16k, L2 1M
// Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than those, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, they would be too low for typical x86 CPUs where we would want
// to set the L2 value to (L3 cache size / number of cores) at least.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
// Note: with less-than-8-bit bit depths, requantization makes packing more
// expensive. We lowered this value from 0.9 to 0.75 when that expensive
// requantization was introduced; this resulted in much higher performance
// on 1000x1000 matrices, though the exact reason for that is not understood.
// In any case, we will eventually need better heuristics than these
// constant parameters.
const float kDefaultL2RhsFactor = 0.75f;
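// For instance, with the default L2 size above, this factor budgets about
// 0.75 * 384 KB = 288 KB of L2 for the packed RHS block, with the remaining
// L2 budget available for the LHS block.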
// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be loaded into registers efficiently, allowing packing code to
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;
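// For instance, kRegisterSize == 16 bytes corresponds to one 128-bit NEON
// or SSE register, i.e. sixteen uint8 cells per register-wide load.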
// The threshold on the depth dimension at which we switch to
// probabilistic rounding instead of rounding-to-nearest when
// requantizing input data. Indeed, both statistical theory and
// empirical measurements show that for given input data and bit depth,
// probabilistic rounding gives more accurate results for large enough
// depth, while rounding-to-nearest does for smaller depth. This threshold
// is naively determined from some experiments with Inception at 7-bit
// weights / 5-bit activations on a set of 10,000 images:
//
// 7 bit weights, 5 bit activations, switch at 64: 59.82% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 128: 59.58% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 192: 63.37% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 256: 63.47% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 320: 63.71% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 384: 63.71% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 448: 63.58% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 512: 64.10% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 640: 62.49% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 768: 62.49% top-1 accuracy
// 7 bit weights, 5 bit activations, switch at 1024: 58.96% top-1 accuracy
//
// So here, 384 looks comfortably in the middle of a plateau of good values,
// and it's a roundish number (3/2 * 256), so let's stick with that for now.
// It would be nice to work out the theory of this, and understand how this
// should depend on the distribution of inputs and the bit depth.
const int kProbabilisticRoundingThreshold = 384;
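// For example, with this threshold, a GEMM with depth 256 requantizes with
// rounding-to-nearest, while one with depth 1024 uses probabilistic rounding.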
// Hints to the CPU that it should prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#ifdef __GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch.
__builtin_prefetch(ptr);
#else
(void)ptr;
#endif
}
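// Typical usage is to prefetch one or a few cache lines ahead of a streaming
// access pattern, e.g. calling Prefetch(src + kDefaultCacheLineSize) on a
// byte pointer src shortly before the data at src itself is read.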
// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <int Modulus>
int RoundDown(int i) {
return i - (i % Modulus);
}
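// Example: RoundDown<8>(13) == 8 and RoundDown<8>(16) == 16.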
// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <int Modulus>
int RoundUp(int i) {
return RoundDown<Modulus>(i + Modulus - 1);
}
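// Example: RoundUp<8>(13) == 16 and RoundUp<8>(16) == 16.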
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
return (a + b - 1) / b;
}
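// Example: CeilQuotient(13, 8) == 2 and CeilQuotient(16, 8) == 2.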
// Returns the argument rounded up to the nearest power of two.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
Integer i = n - 1;
i |= i >> 1;
i |= i >> 2;
i |= i >> 4;
i |= i >> 8;
i |= i >> 16;
return i + 1;
}
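// Example: RoundUpToPowerOfTwo(17) == 32 and RoundUpToPowerOfTwo(16) == 16.
// Note that the bit-smearing above stops at a 16-bit shift, so this is only
// meant for values that fit in 32 bits.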
template <int N>
struct IsPowerOfTwo {
static const bool value = !(N & (N - 1));
};
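// Example: IsPowerOfTwo<16>::value is true, IsPowerOfTwo<24>::value is false.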
} // namespace gemmlowp
#endif // GEMMLOWP_INTERNAL_COMMON_H_