| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #ifndef SkChecksum_opts_DEFINED |
| #define SkChecksum_opts_DEFINED |
| |
| #include "SkChecksum.h" |
| #include "SkTypes.h" |
| |
| #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 |
| #include <immintrin.h> |
| #elif defined(SK_CPU_ARM64) && defined(__ARM_FEATURE_CRC32) |
| #include <arm_acle.h> |
| #endif |
| |
| namespace SK_OPTS_NS { |
| |
| template <typename T> |
| static inline T unaligned_load(const uint8_t* src) { |
| T val; |
| memcpy(&val, src, sizeof(val)); |
| return val; |
| } |
| |
| #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64)) |
| // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast. |
| static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) { |
| auto data = (const uint8_t*)vdata; |
| |
| // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while. |
| uint64_t hash = seed; |
| if (bytes >= 24) { |
| // We'll create 3 independent hashes, each using _mm_crc32_u64() |
| // to hash 8 bytes per step. Both 3 and independent are important: |
| // we can execute 3 of these instructions in parallel on a single core. |
| uint64_t a = hash, |
| b = hash, |
| c = hash; |
| size_t steps = bytes/24; |
| while (steps --> 0) { |
| a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0)); |
| b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8)); |
| c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16)); |
| data += 24; |
| } |
| bytes %= 24; |
| hash = a^b^c; |
| } |
| |
| SkASSERT(bytes < 24); |
| if (bytes >= 16) { |
| hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data)); |
| bytes -= 8; |
| data += 8; |
| } |
| |
| SkASSERT(bytes < 16); |
| if (bytes & 8) { |
| hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data)); |
| data += 8; |
| } |
| |
| // The remainder of these _mm_crc32_u*() operate on a 32-bit register. |
| // We don't lose anything here: only the bottom 32-bits were populated. |
| auto hash32 = (uint32_t)hash; |
| |
| if (bytes & 4) { |
| hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data)); |
| data += 4; |
| } |
| if (bytes & 2) { |
| hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data)); |
| data += 2; |
| } |
| if (bytes & 1) { |
| hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data)); |
| } |
| return hash32; |
| } |
| |
| #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 |
| // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64(). |
| static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
| auto data = (const uint8_t*)vdata; |
| |
| if (bytes >= 12) { |
| // We'll create 3 independent hashes, each using _mm_crc32_u32() |
| // to hash 4 bytes per step. Both 3 and independent are important: |
| // we can execute 3 of these instructions in parallel on a single core. |
| uint32_t a = hash, |
| b = hash, |
| c = hash; |
| size_t steps = bytes/12; |
| while (steps --> 0) { |
| a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0)); |
| b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4)); |
| c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8)); |
| data += 12; |
| } |
| bytes %= 12; |
| hash = a^b^c; |
| } |
| |
| SkASSERT(bytes < 12); |
| if (bytes >= 8) { |
| hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data)); |
| bytes -= 4; |
| data += 4; |
| } |
| |
| SkASSERT(bytes < 8); |
| if (bytes & 4) { |
| hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data)); |
| data += 4; |
| } |
| if (bytes & 2) { |
| hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data)); |
| data += 2; |
| } |
| if (bytes & 1) { |
| hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data)); |
| } |
| return hash; |
| } |
| |
| #elif defined(SK_CPU_ARM64) && defined(__ARM_FEATURE_CRC32) |
| static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
| auto data = (const uint8_t*)vdata; |
| if (bytes >= 24) { |
| uint32_t a = hash, |
| b = hash, |
| c = hash; |
| size_t steps = bytes/24; |
| while (steps --> 0) { |
| a = __crc32d(a, unaligned_load<uint64_t>(data+ 0)); |
| b = __crc32d(b, unaligned_load<uint64_t>(data+ 8)); |
| c = __crc32d(c, unaligned_load<uint64_t>(data+16)); |
| data += 24; |
| } |
| bytes %= 24; |
| hash = a^b^c; |
| } |
| |
| SkASSERT(bytes < 24); |
| if (bytes >= 16) { |
| hash = __crc32d(hash, unaligned_load<uint64_t>(data)); |
| bytes -= 8; |
| data += 8; |
| } |
| |
| SkASSERT(bytes < 16); |
| if (bytes & 8) { |
| hash = __crc32d(hash, unaligned_load<uint64_t>(data)); |
| data += 8; |
| } |
| if (bytes & 4) { |
| hash = __crc32w(hash, unaligned_load<uint32_t>(data)); |
| data += 4; |
| } |
| if (bytes & 2) { |
| hash = __crc32h(hash, unaligned_load<uint16_t>(data)); |
| data += 2; |
| } |
| if (bytes & 1) { |
| hash = __crc32b(hash, unaligned_load<uint8_t>(data)); |
| } |
| return hash; |
| } |
| |
| #else |
| // This is Murmur3. |
| static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
| auto data = (const uint8_t*)vdata; |
| |
| size_t original_bytes = bytes; |
| |
| // Handle 4 bytes at a time while possible. |
| while (bytes >= 4) { |
| uint32_t k = unaligned_load<uint32_t>(data); |
| k *= 0xcc9e2d51; |
| k = (k << 15) | (k >> 17); |
| k *= 0x1b873593; |
| |
| hash ^= k; |
| hash = (hash << 13) | (hash >> 19); |
| hash *= 5; |
| hash += 0xe6546b64; |
| |
| bytes -= 4; |
| data += 4; |
| } |
| |
| // Handle last 0-3 bytes. |
| uint32_t k = 0; |
| switch (bytes & 3) { |
| case 3: k ^= data[2] << 16; |
| case 2: k ^= data[1] << 8; |
| case 1: k ^= data[0] << 0; |
| k *= 0xcc9e2d51; |
| k = (k << 15) | (k >> 17); |
| k *= 0x1b873593; |
| hash ^= k; |
| } |
| |
| hash ^= original_bytes; |
| return SkChecksum::Mix(hash); |
| } |
| #endif |
| |
| } // namespace SK_OPTS_NS |
| |
| #endif//SkChecksum_opts_DEFINED |