Benoit Jacob | 321f694 | 2015-07-06 18:11:19 -0400 | [diff] [blame] | 1 | // Copyright 2015 Google Inc. All Rights Reserved. |
Benoit Jacob | 75c4ec0 | 2015-06-25 15:50:59 -0400 | [diff] [blame] | 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | // common.h: contains stuff that's used throughout gemmlowp |
| 16 | // and should always be available. |
| 17 | |
| 18 | #ifndef GEMMLOWP_INTERNAL_COMMON_H_ |
| 19 | #define GEMMLOWP_INTERNAL_COMMON_H_ |
| 20 | |
| 21 | #include <pthread.h> |
Miao Wang | 93754b5 | 2015-07-09 14:02:08 -0700 | [diff] [blame] | 22 | |
Benoit Jacob | 75c4ec0 | 2015-06-25 15:50:59 -0400 | [diff] [blame] | 23 | #include <cassert> |
| 24 | #include <cmath> |
| 25 | #include <cstdlib> |
| 26 | #include <algorithm> |
| 27 | |
Miao Wang | 544690c | 2015-07-16 15:27:57 -0700 | [diff] [blame] | 28 | #include "../profiling/instrumentation.h" |
Benoit Jacob | 75c4ec0 | 2015-06-25 15:50:59 -0400 | [diff] [blame] | 29 | |
| 30 | #ifdef GEMMLOWP_PROFILING |
| 31 | #include <set> |
| 32 | #include <cstdio> |
| 33 | #include <cstring> |
| 34 | #endif |
| 35 | |
// Detect NEON. It's important to check for both tokens.
// Defines GEMMLOWP_NEON when either token is present, and additionally
// GEMMLOWP_NEON32 when __arm__ is defined, or GEMMLOWP_NEON64 when
// __aarch64__ is defined.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#ifdef __arm__
#define GEMMLOWP_NEON32
#endif
#ifdef __aarch64__
#define GEMMLOWP_NEON64
#endif
#endif
Benoit Jacob | 75c4ec0 | 2015-06-25 15:50:59 -0400 | [diff] [blame] | 46 | |
// Detect SSE.
// Defines GEMMLOWP_SSE when __SSE4_2__ is defined, and additionally
// GEMMLOWP_SSE32 on 32-bit x86 targets or GEMMLOWP_SSE64 on 64-bit x86
// targets, as identified by the architecture macros tested below.
#if defined __SSE4_2__  // at the moment, our SSE code assumes SSE 4.something
#define GEMMLOWP_SSE
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_SSE32
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_SSE64
#endif
#endif
| 57 | |
Benoit Jacob | 75c4ec0 | 2015-06-25 15:50:59 -0400 | [diff] [blame] | 58 | namespace gemmlowp { |
| 59 | |
// Standard cache line size, in bytes. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime, however
// 64 byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;
| 66 | |
// Default L1 and L2 data cache sizes, in bytes. On x86, we should ideally
// query this at runtime. On ARM, the instruction to query this is privileged
// and Android kernels do not expose it to userspace. Fortunately, the
// majority of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, they would be too low for typical x86 CPUs where we would want
// to set the L2 value to (L3 cache size / number of cores) at least.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
Benoit Jacob | 75c4ec0 | 2015-06-25 15:50:59 -0400 | [diff] [blame] | 79 | |
// The proportion of the L2 cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
// Note: with less-than-8-bit depth, requantization makes packing more
// expensive. We lowered this value from 0.9 to 0.75 with the introduction
// of expensive requantization; this results in much higher performance
// for 1000x1000 matrices; the exact reason for that is not understood.
// Anyway, clearly we will eventually need better heuristics than just
// those constant parameters here.
const float kDefaultL2RhsFactor = 0.75f;
| 91 | |
// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;
| 101 | |
// The threshold on the depth dimension at which we switch to
// probabilistic rounding instead of rounding-to-nearest when
// requantizing input data. Indeed, both statistical theory and
// empirical measurements show that for given input data and bit depth,
// probabilistic rounding gives more accurate results for large enough
// depth, while rounding-to-nearest does for smaller depth. This threshold
// is naively determined from some experiments with Inception at 7bit/5bit
// on a set of 10,000 images:
//
//   7 bit weights, 5 bit activations, switch at 64:   59.82% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 128:  59.58% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 192:  63.37% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 256:  63.47% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 320:  63.71% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 384:  63.71% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 448:  63.58% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 512:  64.10% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 640:  62.49% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 768:  62.49% top-1 accuracy
//   7 bit weights, 5 bit activations, switch at 1024: 58.96% top-1 accuracy
//
// So here, 384 looks comfortably in the middle of a plateau of good values,
// and it's a roundish number (3/2 * 256) so let's stick with that for now.
// It would be nice to work out the theory of this, and understand how this
// should depend on the distribution of inputs and the bit depth.
const int kProbabilisticRoundingThreshold = 384;
Benoit Jacob | 75c4ec0 | 2015-06-25 15:50:59 -0400 | [diff] [blame] | 128 | |
// Hints the CPU to prefetch the cache line containing ptr.
// On compilers that do not define __GNUC__ this function does nothing.
inline void Prefetch(const void* ptr) {
#ifdef __GNUC__  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;  // Avoids an unused-parameter warning in the no-op case.
#endif
}
| 137 | |
// Returns the runtime argument rounded down (toward negative infinity) to
// the nearest multiple of the fixed Modulus. Modulus must be positive.
// Examples: RoundDown<4>(7) == 4, RoundDown<4>(8) == 8, RoundDown<4>(-1) == -4.
template <int Modulus>
int RoundDown(int i) {
  // The '%' operator truncates toward zero, so the remainder is negative
  // when i is negative and not a multiple of Modulus. Shift it into
  // [0, Modulus) so that negative inputs are rounded down rather than
  // toward zero. Non-negative inputs are unaffected.
  int remainder = i % Modulus;
  if (remainder < 0) {
    remainder += Modulus;
  }
  return i - remainder;
}
| 144 | |
// Returns the runtime argument rounded up (toward positive infinity) to
// the nearest multiple of the fixed Modulus. Modulus must be positive.
// Examples: RoundUp<4>(5) == 8, RoundUp<4>(8) == 8, RoundUp<4>(-5) == -4.
template <int Modulus>
int RoundUp(int i) {
  const int remainder = i % Modulus;
  if (remainder > 0) {
    // Positive non-multiple: advance to the next multiple.
    return i + (Modulus - remainder);
  }
  // Here the remainder is zero (i is already a multiple) or negative
  // (i is a negative non-multiple, because '%' truncates toward zero).
  // Subtracting the non-positive remainder moves i up to the multiple.
  return i - remainder;
}
| 151 | |
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
// The divisor b must be positive.
// Examples: CeilQuotient(7, 2) == 4, CeilQuotient(8, 2) == 4,
// CeilQuotient(-5, 4) == -1.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  // Computing the quotient first, instead of (a + b - 1) / b, avoids
  // overflowing the intermediate sum when a is close to the maximum value
  // of Integer.
  const Integer quotient = a / b;
  // Integer division truncates toward zero. That is already the ceiling when
  // the remainder is zero or negative; only a positive remainder means the
  // exact quotient lies strictly above the truncated one.
  return (a % b > 0) ? quotient + 1 : quotient;
}
| 157 | |
// Returns the argument rounded up to the nearest power of two.
// A value that is already a power of two is returned unchanged.
// Intended for n >= 1 whose rounded-up result is representable in Integer.
// Examples: RoundUpToPowerOfTwo(5) == 8, RoundUpToPowerOfTwo(8) == 8.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  // Subtract one so that exact powers of two map to themselves.
  Integer i = n - 1;
  // Smear the highest set bit into every lower bit position. The shift
  // amounts double (1, 2, 4, ...) up to the width of Integer, so this works
  // for 64-bit types as well as for 32-bit and narrower ones.
  // Assumes 8-bit bytes.
  const unsigned int bit_width = static_cast<unsigned int>(sizeof(Integer)) * 8;
  for (unsigned int shift = 1; shift < bit_width; shift *= 2) {
    i |= i >> shift;
  }
  // All bits below the highest set bit are now ones; adding one yields the
  // next power of two.
  return i + 1;
}
| 169 | |
// Compile-time predicate: IsPowerOfTwo<N>::value is true if and only if N is
// a positive power of two (1, 2, 4, 8, ...).
template <int N>
struct IsPowerOfTwo {
  // A power of two has exactly one bit set, so clearing its lowest set bit
  // with N & (N - 1) leaves zero. The N > 0 guard rejects zero, which would
  // otherwise pass the bit test, and keeps N - 1 from being evaluated for
  // non-positive N.
  static const bool value = (N > 0) && !(N & (N - 1));
};
| 174 | |
| 175 | } // namespace gemmlowp |
| 176 | |
| 177 | #endif // GEMMLOWP_INTERNAL_COMMON_H_ |