/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkChecksum_opts_DEFINED
#define SkChecksum_opts_DEFINED

#include "SkChecksum.h"
#include "SkTypes.h"

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_CRC32)
    #include <arm_acle.h>
#endif

namespace SK_OPTS_NS {

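// Read a T from possibly-unaligned memory.  Using memcpy() avoids the undefined
// behavior of dereferencing a misaligned pointer, and compilers typically lower
// it to a single unaligned load.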
template <typename T, typename P>
static inline T unaligned_load(const P* p) {
    T v;
    memcpy(&v, p, sizeof(v));
    return v;
}

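// Note: like the other SK_OPTS_NS routines, hash_fn() below is compiled once per
// instruction set and is normally reached through Skia's runtime CPU dispatch
// (see SkOpts.h), e.g. roughly  uint32_t h = SkOpts::hash(data, bytes, seed);
// rather than being called directly.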
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
    // This is not a CRC32.  It's Just A Hash that uses those instructions because they're fast.
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
        auto data = (const uint8_t*)vdata;

        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
        uint64_t hash = seed;
        if (bytes >= 24) {
            // We'll create 3 independent hashes, each using _mm_crc32_u64()
            // to hash 8 bytes per step.  Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
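            // (The hardware crc32 instruction has a few cycles of latency but
            // roughly one-per-cycle throughput on common x86 cores, so three
            // independent chains keep it busy instead of serializing on one.)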
            uint64_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
                b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
                c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = a^b^c;
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
            data += 8;
        }

        // The remaining _mm_crc32_u*() calls operate on a 32-bit register.
        // We don't lose anything here: only the bottom 32 bits were populated.
        auto hash32 = (uint32_t)hash;

        if (bytes & 4) {
            hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
        }
        return hash32;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        if (bytes >= 12) {
            // We'll create 3 independent hashes, each using _mm_crc32_u32()
            // to hash 4 bytes per step.  Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/12;
            while (steps --> 0) {
                a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
                b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
                c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
                data += 12;
            }
            bytes %= 12;
            hash = a^b^c;
        }

        SkASSERT(bytes < 12);
        if (bytes >= 8) {
            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
            bytes -= 4;
            data  += 4;
        }

        SkASSERT(bytes < 8);
        if (bytes & 4) {
            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#elif defined(SK_ARM_HAS_CRC32)
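    // ARM CRC32 version of above, using the CRC32 intrinsics from <arm_acle.h>
    // (__crc32d consumes 8 bytes per step), with the same 3-way interleaving.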
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;
        if (bytes >= 24) {
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));
                b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
                c = __crc32d(c, unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = a^b^c;
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
            data += 8;
        }
        if (bytes & 4) {
            hash = __crc32w(hash, unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = __crc32h(hash, unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = __crc32b(hash, unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#else
    // This is Murmur3.
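    // (Specifically the 32-bit MurmurHash3 variant; the constants below are its
    // standard mixing constants.)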
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        size_t original_bytes = bytes;

        // Handle 4 bytes at a time while possible.
        while (bytes >= 4) {
            uint32_t k = unaligned_load<uint32_t>(data);
            k *= 0xcc9e2d51;
            k = (k << 15) | (k >> 17);
            k *= 0x1b873593;

            hash ^= k;
            hash = (hash << 13) | (hash >> 19);
            hash *= 5;
            hash += 0xe6546b64;

            bytes -= 4;
            data  += 4;
        }

        // Handle last 0-3 bytes.
        uint32_t k = 0;
        switch (bytes & 3) {
            case 3: k ^= data[2] << 16;  // fall through
            case 2: k ^= data[1] <<  8;  // fall through
            case 1: k ^= data[0] <<  0;
                    k *= 0xcc9e2d51;
                    k = (k << 15) | (k >> 17);
                    k *= 0x1b873593;
                    hash ^= k;
        }

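        // Finish up: fold in the original length, then run SkChecksum::Mix()
        // as the final avalanche step.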
        hash ^= original_bytes;
        return SkChecksum::Mix(hash);
    }
#endif

}  // namespace SK_OPTS_NS

#endif//SkChecksum_opts_DEFINED