blob: cd382e4d50f7bf28ad142557c99c97b6e439dee8 [file] [log] [blame]
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.
17
18#ifndef GEMMLOWP_INTERNAL_COMMON_H_
19#define GEMMLOWP_INTERNAL_COMMON_H_
20
#include <pthread.h>

#include <algorithm>
#include <cassert>
#include <climits>
#include <cmath>
#include <cstdlib>

#include "../profiling/instrumentation.h"

#ifdef GEMMLOWP_PROFILING
#include <cstdio>
#include <cstring>
#include <set>
#endif
35
// Detect NEON. It's important to check for both tokens.
// GEMMLOWP_NEON is defined whenever either NEON token is present;
// GEMMLOWP_NEON32 is additionally defined when __arm__ is defined, and
// GEMMLOWP_NEON64 when __aarch64__ is defined.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#ifdef __arm__
#define GEMMLOWP_NEON32
#endif
#ifdef __aarch64__
#define GEMMLOWP_NEON64
#endif
#endif
Benoit Jacob75c4ec02015-06-25 15:50:59 -040046
// Detect SSE.
// GEMMLOWP_SSE is defined when the compiler advertises SSE 4.2;
// GEMMLOWP_SSE32 / GEMMLOWP_SSE64 are additionally defined when one of the
// 32-bit / 64-bit x86 target macros checked below is present.
#if defined __SSE4_2__  // at the moment, our SSE code assumes SSE 4.something
#define GEMMLOWP_SSE
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_SSE32
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_SSE64
#endif
#endif
57
Benoit Jacob75c4ec02015-06-25 15:50:59 -040058namespace gemmlowp {
59
// Standard cache line size, in bytes. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime; however,
// 64-byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;
66
// Default L1 and L2 data cache sizes, in bytes. On x86, we should ideally
// query this at runtime. On ARM, the instruction to query this is privileged
// and Android kernels do not expose it to userspace. Fortunately, the
// majority of ARM devices have roughly comparable values:
//   Nexus 5:     L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, they would be too low for typical x86 CPUs where we would want
// to set the L2 value to (L3 cache size / number of cores) at least.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
Benoit Jacob75c4ec02015-06-25 15:50:59 -040079
// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
// Note: with less-than-8-bit depth, requantization makes packing more
// expensive. We lowered this value from 0.9 to 0.75 with the introduction
// of expensive requantization; this results in much higher performance
// for 1000x1000 matrices; the exact reason for that is not understood.
// Anyway, clearly we will eventually need better heuristics than just
// those constant parameters here.
const float kDefaultL2RhsFactor = 0.75f;
91
// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;
101
// The threshold on the depth dimension at which we switch to
// probabilistic rounding instead of rounding-to-nearest when
// requantizing input data. Indeed, both statistical theory and
// empirical measurements show that for given input data and bit depth,
// probabilistic rounding gives more accurate results for large enough
// depth, while rounding-to-nearest does for smaller depth. This threshold
// is naively determined from some experiments with Inception at 7bit/5bit
// on a set of 10,000 images:
//
//   7 bit weights, 5 bit activations, switch at 64:   59.82% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 128:  59.58% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 192:  63.37% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 256:  63.47% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 320:  63.71% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 384:  63.71% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 448:  63.58% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 512:  64.10% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 640:  62.49% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 768:  62.49% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 1024: 58.96% top-1 accuracy
//
// So here, 384 looks comfortably in the middle of a plateau of good values,
// and it's a roundish number (3/2 * 256) so let's stick with that for now.
// It would be nice to work out the theory of this, and understand how this
// should depend on the distribution of inputs and the bit depth.
const int kProbabilisticRoundingThreshold = 384;
Benoit Jacob75c4ec02015-06-25 15:50:59 -0400128
// Hints the CPU to prefetch the cache line containing ptr.
// On compilers that do not define __GNUC__ this does nothing; ptr is only
// cast to void to avoid an unused-parameter warning.
inline void Prefetch(const void* ptr) {
#ifdef __GNUC__  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;
#endif
}
137
// Returns the runtime argument rounded down (toward negative infinity) to
// the nearest multiple of the fixed Modulus. Modulus is expected to be
// positive. For non-negative i this is simply i - (i % Modulus).
template <int Modulus>
int RoundDown(int i) {
  // In C++ the result of % takes the sign of the dividend, so for a
  // negative i the remainder lies in (-Modulus, 0]. Shift it into
  // [0, Modulus) so that subtracting it always moves downward rather than
  // toward zero.
  int remainder = i % Modulus;
  if (remainder < 0) {
    remainder += Modulus;
  }
  return i - remainder;
}
144
// Returns the runtime argument rounded up (toward positive infinity) to
// the nearest multiple of the fixed Modulus. Modulus is expected to be
// positive.
template <int Modulus>
int RoundUp(int i) {
  // Work from the remainder instead of computing i + Modulus - 1: that sum
  // overflows for i close to INT_MAX even when the rounded result itself is
  // representable.
  const int remainder = i % Modulus;
  if (remainder > 0) {
    // Positive, non-multiple i: advance to the next multiple.
    return i + (Modulus - remainder);
  }
  // remainder <= 0: either i is already a multiple, or i is negative and
  // dropping the (negative) remainder moves it up to the next multiple.
  return i - remainder;
}
151
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
// b must be nonzero. Integer is expected to be an integral type (the
// implementation uses the % operator).
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  // Avoid forming a + b - 1, which wraps around (unsigned) or overflows
  // (signed) for large operands and is wrong for negative ones.
  const Integer quotient = a / b;
  const Integer remainder = a % b;
  // Integer division truncates toward zero. That undershoots the ceiling
  // only when the division is inexact and the exact quotient is positive,
  // i.e. when the remainder (which carries the sign of a) and b have the
  // same sign.
  if (remainder != 0 && ((remainder > 0) == (b > 0))) {
    return quotient + 1;
  }
  return quotient;
}
157
// Returns the argument rounded up to the nearest power of two.
// Works for integral types of any width (the number of shift steps is
// derived from sizeof(Integer)). n is expected to be positive and no larger
// than the largest power of two representable in Integer; a power of two is
// returned unchanged.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  // Start from n - 1 so that exact powers of two map to themselves.
  Integer i = n - 1;
  // Smear the highest set bit into every lower position by OR-ing in
  // shifts of 1, 2, 4, ... bits. Stopping below the type width covers
  // 64-bit types too (a fixed sequence ending at 16 only covers 32 bits).
  const int kBits = static_cast<int>(sizeof(Integer) * CHAR_BIT);
  for (int shift = 1; shift < kBits; shift *= 2) {
    i |= i >> shift;
  }
  // i is now of the form 2^k - 1; adding one yields the power of two.
  return i + 1;
}
169
// Compile-time predicate: IsPowerOfTwo<N>::value is true if and only if N is
// a power of two (1, 2, 4, ...). Zero and negative N yield false.
template <int N>
struct IsPowerOfTwo {
  // N & (N - 1) clears the lowest set bit of N, so it is zero exactly when
  // N has at most one bit set. The N > 0 guard excludes zero, which would
  // otherwise pass that test.
  static const bool value = (N > 0) && !(N & (N - 1));
};
174
175} // namespace gemmlowp
176
177#endif // GEMMLOWP_INTERNAL_COMMON_H_