32-bit fast hash, tidy up murmur3 a bit

Nothing too surprising in the new 32-bit x86 hash.  It's about half the speed of the 64-bit variant, just as you'd expect.
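
For reference, the 64-bit path (elided from the hunk below) runs three independent _mm_crc32_u64() lanes and consumes 24 bytes per step, while the new 32-bit path runs three _mm_crc32_u32() lanes at 12 bytes per step.  A rough sketch of that 64-bit inner loop, assuming the same lane structure as the 32-bit code below; illustrative only, not the literal patch:

    // Three independent CRC32-C lanes, 24 bytes per step.
    uint64_t a = hash, b = hash, c = hash;
    size_t steps = bytes / 24;
    while (steps --> 0) {
        a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data +  0));
        b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data +  8));
        c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data + 16));
        data += 24;
    }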

Using unaligned_load() like the other implementations makes the may_alias parts of Murmur3 moot.  I've also updated some of the terms in the Murmur3 hash so it reads consistently with the others.
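
The memcpy() inside unaligned_load() never forms a misaligned or wrongly-typed pointer into the buffer, so there's no strict-aliasing violation to paper over with may_alias, and compilers typically lower the fixed-size copy to a single unaligned load.  A minimal standalone sketch (read_u32() is a hypothetical caller, not part of this patch):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    template <typename T>
    static inline T unaligned_load(const uint8_t* src) {
        T val;
        memcpy(&val, src, sizeof(val));  // no T* ever points into src
        return val;
    }

    // Hypothetical usage: read 4 bytes at an arbitrary, possibly unaligned offset.
    static uint32_t read_u32(const uint8_t* buf, size_t offset) {
        return unaligned_load<uint32_t>(buf + offset);
    }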

The existing hashes run at the same speed and produce the same values as before.  In case it's not obvious, all three hash functions produce different hashes from one another.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2251773002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2251773002
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
index 346b16b..07fdfaa 100644
--- a/src/opts/SkChecksum_opts.h
+++ b/src/opts/SkChecksum_opts.h
@@ -16,18 +16,18 @@
 #endif
 
 // TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
-// TODO: 32-bit x86 version: same sort of idea using only _mm_crc32_u32() and smaller
 
 namespace SK_OPTS_NS {
 
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
-    template <typename T>
-    static inline T unaligned_load(const uint8_t* src) {
-        T val;
-        memcpy(&val, src, sizeof(val));
-        return val;
-    }
+template <typename T>
+static inline T unaligned_load(const uint8_t* src) {
+    T val;
+    memcpy(&val, src, sizeof(val));
+    return val;
+}
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
+    // This is not a CRC32.  It's Just A Hash that uses those instructions because they're fast.
     static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
         auto data = (const uint8_t*)vdata;
 
@@ -82,21 +82,61 @@
         return hash32;
     }
 
-#else
-    static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) {
-        // This is Murmur3.
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+    // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+        auto data = (const uint8_t*)vdata;
 
-        // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
-        // and so not to apply strict-aliasing-based optimizations.
-        typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
-        typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
+        if (bytes >= 12) {
+            // We'll create 3 independent hashes, each using _mm_crc32_u32()
+            // to hash 4 bytes per step.  Both 3 and independent are important:
+            // we can execute 3 of these instructions in parallel on a single core.
+            uint32_t a = hash,
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/12;
+            while (steps --> 0) {
+                a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
+                b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
+                c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
+                data += 12;
+            }
+            bytes %= 12;
+            hash = a^b^c;
+        }
+
+        SkASSERT(bytes < 12);
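+        // 8-11 bytes remain: hash 4 of them here; the (bytes & 4/2/1) steps below handle the rest.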
+        if (bytes >= 8) {
+            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            bytes -= 4;
+            data  += 4;
+        }
+
+        SkASSERT(bytes < 8);
+        if (bytes & 4) {
+            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
+        }
+        return hash;
+    }
+
+#else
+    // This is Murmur3.
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+        auto data = (const uint8_t*)vdata;
+
+        size_t original_bytes = bytes;
 
         // Handle 4 bytes at a time while possible.
-        const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
-        const size_t words = bytes/4;
-        uint32_t hash = seed;
-        for (size_t i = 0; i < words; i++) {
-            uint32_t k = safe_data[i];
+        while (bytes >= 4) {
+            uint32_t k = unaligned_load<uint32_t>(data);
             k *= 0xcc9e2d51;
             k = (k << 15) | (k >> 17);
             k *= 0x1b873593;
@@ -105,22 +145,24 @@
             hash = (hash << 13) | (hash >> 19);
             hash *= 5;
             hash += 0xe6546b64;
+
+            bytes -= 4;
+            data  += 4;
         }
 
         // Handle last 0-3 bytes.
-        const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
         uint32_t k = 0;
         switch (bytes & 3) {
-            case 3: k ^= safe_tail[2] << 16;
-            case 2: k ^= safe_tail[1] <<  8;
-            case 1: k ^= safe_tail[0] <<  0;
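+            // Intentional fall-through: each case mixes in one more tail byte.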
+            case 3: k ^= data[2] << 16;
+            case 2: k ^= data[1] <<  8;
+            case 1: k ^= data[0] <<  0;
                     k *= 0xcc9e2d51;
                     k = (k << 15) | (k >> 17);
                     k *= 0x1b873593;
                     hash ^= k;
         }
 
-        hash ^= bytes;
+        hash ^= original_bytes;
         return SkChecksum::Mix(hash);
     }
 #endif