Blame - internal/common.h - platform/external/gemmlowp

blob: cd382e4d50f7bf28ad142557c99c97b6e439dee8 [file] [log] [blame]

Benoit Jacob	321f694	2015-07-06 18:11:19 -0400	[diff] [blame]	1	// Copyright 2015 Google Inc. All Rights Reserved.
Benoit Jacob	75c4ec0	2015-06-25 15:50:59 -0400	[diff] [blame]	2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	// common.h: contains stuff that's used throughout gemmlowp
				16	// and should always be available.
				17
				18	#ifndef GEMMLOWP_INTERNAL_COMMON_H_
				19	#define GEMMLOWP_INTERNAL_COMMON_H_
				20
				21	#include <pthread.h>
Miao Wang	93754b5	2015-07-09 14:02:08 -0700	[diff] [blame]	22
Benoit Jacob	75c4ec0	2015-06-25 15:50:59 -0400	[diff] [blame]	23	#include <cassert>
				24	#include <cmath>
				25	#include <cstdlib>
				26	#include <algorithm>
				27
Miao Wang	544690c	2015-07-16 15:27:57 -0700	[diff] [blame]	28	#include "../profiling/instrumentation.h"
Benoit Jacob	75c4ec0	2015-06-25 15:50:59 -0400	[diff] [blame]	29
				30	#ifdef GEMMLOWP_PROFILING
				31	#include <set>
				32	#include <cstdio>
				33	#include <cstring>
				34	#endif
				35
				// Detect NEON. It's important to check for both tokens
				// (__ARM_NEON and __ARM_NEON__).
				#if defined(__ARM_NEON) || defined(__ARM_NEON__)
				#define GEMMLOWP_NEON
				// Additionally record whether this is a 32-bit (__arm__) or a
				// 64-bit (__aarch64__) ARM build.
				#ifdef __arm__
				#define GEMMLOWP_NEON32
				#endif
				#ifdef __aarch64__
				#define GEMMLOWP_NEON64
				#endif
				#endif
Benoit Jacob	75c4ec0	2015-06-25 15:50:59 -0400	[diff] [blame]	46
Miao Wang	0a70f98	2015-09-14 15:39:13 -0700	[diff] [blame^]	47	// Detect SSE.
				// At the moment, our SSE code assumes SSE 4.something; the paths are
				// enabled only when the compiler advertises SSE 4.2 (__SSE4_2__).
				#if defined(__SSE4_2__)
				#define GEMMLOWP_SSE
				// Additionally record whether this is a 32-bit or a 64-bit x86 build.
				// Several macro spellings are checked for each case.
				#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
				#define GEMMLOWP_SSE32
				#endif
				#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
				#define GEMMLOWP_SSE64
				#endif
				#endif
				57
Benoit Jacob	75c4ec0	2015-06-25 15:50:59 -0400	[diff] [blame]	58	namespace gemmlowp {
				59
				// Assumed cache line size, in bytes, used to tune alignment and
				// prefetching. Querying the real value at runtime would be ideal, but
				// the vast majority of devices have 64-byte lines, and on a device where
				// this guess is wrong it will be off by no more than a factor of 2,
				// which should be acceptable.
				const int kDefaultCacheLineSize = 64;
				66
				// Default sizes, in bytes, of the L1 and L2 data caches. On x86 these
				// should ideally be queried at runtime. On ARM, the instruction that
				// reports them is privileged and Android kernels do not expose it to
				// userspace. Fortunately, most ARM devices have roughly comparable
				// caches:
				//   Nexus 5:     L1 16k, L2 1M
				//   Android One: L1 32k, L2 512k
				// The defaults below are equal to or somewhat lower than those figures
				// and were found to perform well on both the Nexus 5 and Android One.
				// They would be too low for typical x86 CPUs, where the L2 value should
				// be at least (L3 cache size / number of cores).
				const int kDefaultL1CacheSize = 16 * 1024;
				const int kDefaultL2CacheSize = 384 * 1024;
Benoit Jacob	75c4ec0	2015-06-25 15:50:59 -0400	[diff] [blame]	79
				// Fraction of the cache that we aim to fill with RHS blocks. It should
				// lie between 0 and 1 and is typically close to 1, since we generally
				// want most of the L2 cache to hold one large RHS block.
				// Note: at bit depths below 8, requantization makes packing more
				// expensive. When that expensive requantization was introduced, this
				// value was lowered from 0.9 to 0.75, which gives much higher
				// performance for 1000x1000 matrices; the exact reason is not
				// understood. Clearly, better heuristics than these constant parameters
				// will eventually be needed.
				const float kDefaultL2RhsFactor = 0.75f;
				91
				// Size, in bytes, of a SIMD register. The dimensions of
				// PackingRegisterBlock are derived from it so that such blocks load
				// efficiently into registers and packing code can work within registers
				// as much as possible.
				// The non-SIMD generic fallback code treats this purely as an array
				// size, so any value would work there. Platforms may choose different
				// values, provided their own optimized packing paths stay consistent
				// with the value chosen.
				const int kRegisterSize = 16;
				101
				// Depth (size of the depth dimension) at which requantization of input
				// data switches from rounding-to-nearest to probabilistic rounding.
				// Both statistical theory and empirical measurements show that, for
				// given input data and bit depth, probabilistic rounding is the more
				// accurate choice once the depth is large enough, whereas
				// rounding-to-nearest wins at smaller depths. The value was picked
				// naively from experiments with Inception at 7bit/5bit on a set of
				// 10,000 images:
				//
				// 7 bit weights, 5 bit activations, switch at 64: 59.82% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 128: 59.58% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 192: 63.37% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 256: 63.47% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 320: 63.71% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 384: 63.71% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 448: 63.58% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 512: 64.10% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 640: 62.49% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 768: 62.49% top-1 accuracy
				// 7 bit weights, 5 bit activations, switch at 1024: 58.96% top-1 accuracy
				//
				// 384 sits comfortably in the middle of a plateau of good values and is
				// a roundish number (3/2 * 256), so we stick with it for now. It would
				// be nice to work out the theory behind this and to understand how the
				// threshold should depend on the input distribution and the bit depth.
				const int kProbabilisticRoundingThreshold = 384;
Benoit Jacob	75c4ec0	2015-06-25 15:50:59 -0400	[diff] [blame]	128
				// Asks the CPU to prefetch the cache line that contains ptr. On
				// compilers that do not define __GNUC__ this is a no-op.
				inline void Prefetch(const void* ptr) {
				#ifndef __GNUC__
				  // No prefetch builtin assumed here; just mark the parameter as used.
				  (void)ptr;
				#else
				  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
				  __builtin_prefetch(ptr);
				#endif
				}
				137
				// Returns the runtime argument rounded down (towards negative infinity)
				// to the nearest multiple of the fixed Modulus. Modulus is expected to
				// be positive.
				template <int Modulus>
				int RoundDown(int i) {
				  int remainder = i % Modulus;
				  // Integer division truncates towards zero, so for negative i the
				  // remainder is <= 0. Shift it into [0, Modulus) so that subtracting it
				  // really rounds down instead of rounding towards zero.
				  if (remainder < 0) {
				    remainder += Modulus;
				  }
				  return i - remainder;
				}

				// Returns the runtime argument rounded up (towards positive infinity)
				// to the nearest multiple of the fixed Modulus. Modulus is expected to
				// be positive.
				template <int Modulus>
				int RoundUp(int i) {
				  // Derive the result from RoundDown rather than from i + Modulus - 1:
				  // that sum can overflow int even when the rounded-up value itself is
				  // representable.
				  const int floor_multiple = RoundDown<Modulus>(i);
				  return floor_multiple == i ? i : floor_multiple + Modulus;
				}
				151
				// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
				// b is expected to be positive.
				template <typename Integer>
				Integer CeilQuotient(Integer a, Integer b) {
				  // a / b truncates towards zero, which is already the ceiling whenever
				  // the remainder is zero or negative; only a positive remainder needs a
				  // bump. Unlike (a + b - 1) / b, this cannot overflow or wrap around
				  // when a is close to the maximum value of Integer.
				  return a / b + (a % b > 0 ? 1 : 0);
				}
				157
				// Returns the argument rounded up to the nearest power of two. A value
				// that already is a power of two is returned unchanged. n is expected
				// to be positive.
				template <typename Integer>
				Integer RoundUpToPowerOfTwo(Integer n) {
				  Integer i = n - 1;
				  // Copy the highest set bit of i into every lower bit position. The
				  // shift distance doubles each step until it covers the full width of
				  // Integer (assuming 8-bit bytes), so this also works for 64-bit types,
				  // where stopping at a shift of 16 would leave low bits unset.
				  const int kBits = static_cast<int>(sizeof(Integer) * 8);
				  for (int shift = 1; shift < kBits; shift *= 2) {
				    i |= i >> shift;
				  }
				  // i is now of the form 2^k - 1, so adding one gives the power of two.
				  return i + 1;
				}
				169
				// Compile-time predicate: IsPowerOfTwo<N>::value is true if and only if
				// N is a positive power of two.
				template <int N>
				struct IsPowerOfTwo {
				  // A power of two has exactly one bit set, so N & (N - 1), which clears
				  // the lowest set bit, is zero. The N > 0 guard rejects zero (which
				  // would otherwise pass the bit test) and negative values.
				  static const bool value = (N > 0) && !(N & (N - 1));
				};
				174
				175	} // namespace gemmlowp
				176
				177	#endif // GEMMLOWP_INTERNAL_COMMON_H_