Use sse4.2 CRC32 instructions to hash when available.

About 9x faster than Murmur3 for long inputs.

Most of this is a mechanical change from SkChecksum::Murmur3(...) to SkOpts::hash(...).

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2208903002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;master.client.skia.compile:Build-Ubuntu-GCC-x86_64-Release-CMake-Trybot,Build-Mac-Clang-x86_64-Release-CMake-Trybot

Review-Url: https://codereview.chromium.org/2208903002
diff --git a/src/core/SkChecksum.cpp b/src/core/SkChecksum.cpp
deleted file mode 100644
index 4457eb4..0000000
--- a/src/core/SkChecksum.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkChecksum.h"
-
-uint32_t SkChecksum::Murmur3(const void* data, size_t bytes, uint32_t seed) {
-    // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
-    // and so not to apply strict-aliasing-based optimizations.
-    typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
-    typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
-
-    // Handle 4 bytes at a time while possible.
-    const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
-    const size_t words = bytes/4;
-    uint32_t hash = seed;
-    for (size_t i = 0; i < words; i++) {
-        uint32_t k = safe_data[i];
-        k *= 0xcc9e2d51;
-        k = (k << 15) | (k >> 17);
-        k *= 0x1b873593;
-
-        hash ^= k;
-        hash = (hash << 13) | (hash >> 19);
-        hash *= 5;
-        hash += 0xe6546b64;
-    }
-
-    // Handle last 0-3 bytes.
-    const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
-    uint32_t k = 0;
-    switch (bytes & 3) {
-        case 3: k ^= safe_tail[2] << 16;
-        case 2: k ^= safe_tail[1] <<  8;
-        case 1: k ^= safe_tail[0] <<  0;
-                k *= 0xcc9e2d51;
-                k = (k << 15) | (k >> 17);
-                k *= 0x1b873593;
-                hash ^= k;
-    }
-
-    hash ^= bytes;
-    return SkChecksum::Mix(hash);
-}
diff --git a/src/core/SkDescriptor.h b/src/core/SkDescriptor.h
index 71f7133..efa0278 100644
--- a/src/core/SkDescriptor.h
+++ b/src/core/SkDescriptor.h
@@ -9,7 +9,7 @@
 #ifndef SkDescriptor_DEFINED
 #define SkDescriptor_DEFINED
 
-#include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkTypes.h"
 
 class SkDescriptor : SkNoncopyable {
@@ -123,7 +123,7 @@
     static uint32_t ComputeChecksum(const SkDescriptor* desc) {
         const uint32_t* ptr = (const uint32_t*)desc + 1; // skip the checksum field
         size_t len = desc->fLength - sizeof(uint32_t);
-        return SkChecksum::Murmur3(ptr, len);
+        return SkOpts::hash(ptr, len);
     }
 
     // private so no one can create one except our factories
diff --git a/src/core/SkImageFilterCache.cpp b/src/core/SkImageFilterCache.cpp
index ba8a32c..c7104de 100644
--- a/src/core/SkImageFilterCache.cpp
+++ b/src/core/SkImageFilterCache.cpp
@@ -7,9 +7,9 @@
 
 #include "SkImageFilterCache.h"
 
-#include "SkChecksum.h"
 #include "SkMutex.h"
 #include "SkOnce.h"
+#include "SkOpts.h"
 #include "SkRefCnt.h"
 #include "SkSpecialImage.h"
 #include "SkTDynamicHash.h"
@@ -47,7 +47,7 @@
             return v.fKey;
         }
         static uint32_t Hash(const Key& key) {
-            return SkChecksum::Murmur3(reinterpret_cast<const uint32_t*>(&key), sizeof(Key));
+            return SkOpts::hash(reinterpret_cast<const uint32_t*>(&key), sizeof(Key));
         }
         SK_DECLARE_INTERNAL_LLIST_INTERFACE(Value);
     };
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 5263fe4..a4da111 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -26,6 +26,7 @@
 #include "SkBlitMask_opts.h"
 #include "SkBlitRow_opts.h"
 #include "SkBlurImageFilter_opts.h"
+#include "SkChecksum_opts.h"
 #include "SkColorCubeFilter_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkSwizzler_opts.h"
@@ -70,12 +71,14 @@
     DEFINE_DEFAULT(inverted_CMYK_to_BGR1);
 
     DEFINE_DEFAULT(srcover_srgb_srgb);
+
+    DEFINE_DEFAULT(hash_fn);
 #undef DEFINE_DEFAULT
 
     // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
     void Init_ssse3();
     void Init_sse41();
-    void Init_sse42() {}
+    void Init_sse42();
     void Init_avx();
     void Init_avx2() {}
 
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 7c6cfb0..44e337d 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -65,6 +65,12 @@
     // Blend ndst src pixels over dst, where both src and dst point to sRGB pixels (RGBA or BGRA).
     // If nsrc < ndst, we loop over src to create a pattern.
     extern void (*srcover_srgb_srgb)(uint32_t* dst, const uint32_t* src, int ndst, int nsrc);
+
+    // The fastest high quality 32-bit hash we can provide on this platform.
+    extern uint32_t (*hash_fn)(const void*, size_t, uint32_t seed);
+    static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
+        return hash_fn(data, bytes, seed);
+    }
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/core/SkPaint.cpp b/src/core/SkPaint.cpp
index 07c10d5..44b2928 100644
--- a/src/core/SkPaint.cpp
+++ b/src/core/SkPaint.cpp
@@ -7,7 +7,6 @@
 
 #include "SkPaint.h"
 #include "SkAutoKern.h"
-#include "SkChecksum.h"
 #include "SkColorFilter.h"
 #include "SkData.h"
 #include "SkDraw.h"
@@ -19,6 +18,7 @@
 #include "SkMutex.h"
 #include "SkReadBuffer.h"
 #include "SkWriteBuffer.h"
+#include "SkOpts.h"
 #include "SkPaintDefaults.h"
 #include "SkPathEffect.h"
 #include "SkRasterizer.h"
@@ -2388,6 +2388,6 @@
     // so fBitfields should be 10 pointers and 6 32-bit values from the start.
     static_assert(offsetof(SkPaint, fBitfields) == 9 * sizeof(void*) + 6 * sizeof(uint32_t),
                   "SkPaint_notPackedTightly");
-    return SkChecksum::Murmur3(reinterpret_cast<const uint32_t*>(this),
-                               offsetof(SkPaint, fBitfields) + sizeof(fBitfields));
+    return SkOpts::hash(reinterpret_cast<const uint32_t*>(this),
+                        offsetof(SkPaint, fBitfields) + sizeof(fBitfields));
 }
diff --git a/src/core/SkResourceCache.cpp b/src/core/SkResourceCache.cpp
index e465132..4bdc8dd 100644
--- a/src/core/SkResourceCache.cpp
+++ b/src/core/SkResourceCache.cpp
@@ -5,10 +5,10 @@
  * found in the LICENSE file.
  */
 
-#include "SkChecksum.h"
 #include "SkMessageBus.h"
 #include "SkMipMap.h"
 #include "SkMutex.h"
+#include "SkOpts.h"
 #include "SkPixelRef.h"
 #include "SkResourceCache.h"
 #include "SkTraceMemoryDump.h"
@@ -46,9 +46,9 @@
     fSharedID_lo = (uint32_t)sharedID;
     fSharedID_hi = (uint32_t)(sharedID >> 32);
     fNamespace = nameSpace;
-    // skip unhashed fields when computing the murmur
-    fHash = SkChecksum::Murmur3(this->as32() + kUnhashedLocal32s,
-                                (fCount32 - kUnhashedLocal32s) << 2);
+    // skip unhashed fields when computing the hash
+    fHash = SkOpts::hash(this->as32() + kUnhashedLocal32s,
+                         (fCount32 - kUnhashedLocal32s) << 2);
 }
 
 #include "SkTDynamicHash.h"
diff --git a/src/gpu/GrProgramDesc.h b/src/gpu/GrProgramDesc.h
index ec6447d..b17d146 100644
--- a/src/gpu/GrProgramDesc.h
+++ b/src/gpu/GrProgramDesc.h
@@ -10,7 +10,8 @@
 
 #include "GrColor.h"
 #include "GrTypesPriv.h"
-#include "SkChecksum.h"
+#include "SkOpts.h"
+#include "SkTArray.h"
 
 /** This class describes a program to generate. It also serves as a program cache key. Very little
     of this is GL-specific. The GL-specific parts could be factored out into a subclass. */
@@ -112,7 +113,7 @@
 
         uint32_t* checksum = this->atOffset<uint32_t, GrProgramDesc::kChecksumOffset>();
         *checksum = 0;  // We'll hash through these bytes, so make sure they're initialized.
-        *checksum = SkChecksum::Murmur3(fKey.begin(), keyLength);
+        *checksum = SkOpts::hash(fKey.begin(), keyLength);
     }
 
     // The key, stored in fKey, is composed of four parts:
diff --git a/src/gpu/GrResourceCache.cpp b/src/gpu/GrResourceCache.cpp
index 71e40f1..62360ed 100644
--- a/src/gpu/GrResourceCache.cpp
+++ b/src/gpu/GrResourceCache.cpp
@@ -11,9 +11,9 @@
 #include "GrCaps.h"
 #include "GrGpuResourceCacheAccess.h"
 #include "GrTracing.h"
-#include "SkChecksum.h"
 #include "SkGr.h"
 #include "SkMessageBus.h"
+#include "SkOpts.h"
 #include "SkTSort.h"
 
 DECLARE_SKMESSAGEBUS_MESSAGE(GrUniqueKeyInvalidatedMessage);
@@ -43,7 +43,7 @@
 }
 
 uint32_t GrResourceKeyHash(const uint32_t* data, size_t size) {
-    return SkChecksum::Murmur3(data, size);
+    return SkOpts::hash(data, size);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -687,7 +687,7 @@
                 SkASSERT(SkBudgeted::kNo == resource->resourcePriv().isBudgeted() ||
                          uniqueKey.isValid());
                 if (!uniqueKey.isValid()) {
-                    ++fCouldBeScratch;                
+                    ++fCouldBeScratch;
                     SkASSERT(fScratchMap->countForKey(scratchKey));
                 }
                 SkASSERT(!resource->resourcePriv().refsWrappedObjects());
diff --git a/src/gpu/batches/GrAADistanceFieldPathRenderer.h b/src/gpu/batches/GrAADistanceFieldPathRenderer.h
index db17b07..985b2f1 100755
--- a/src/gpu/batches/GrAADistanceFieldPathRenderer.h
+++ b/src/gpu/batches/GrAADistanceFieldPathRenderer.h
@@ -13,7 +13,7 @@
 #include "GrRect.h"
 #include "GrShape.h"
 
-#include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkTDynamicHash.h"
 
 class GrContext;
@@ -81,7 +81,7 @@
         }
 
         static inline uint32_t Hash(Key key) {
-            return SkChecksum::Murmur3(key.data(), sizeof(uint32_t) * key.count32());
+            return SkOpts::hash(key.data(), sizeof(uint32_t) * key.count32());
         }
     };
 
diff --git a/src/gpu/effects/GrTextureStripAtlas.h b/src/gpu/effects/GrTextureStripAtlas.h
index 91ce61c..5b90a34 100644
--- a/src/gpu/effects/GrTextureStripAtlas.h
+++ b/src/gpu/effects/GrTextureStripAtlas.h
@@ -9,7 +9,7 @@
 #define GrTextureStripAtlas_DEFINED
 
 #include "SkBitmap.h"
-#include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkGr.h"
 #include "SkTDArray.h"
 #include "SkTDynamicHash.h"
@@ -142,7 +142,7 @@
     public:
         // for SkTDynamicHash
         static const Desc& GetKey(const AtlasEntry& entry) { return entry.fDesc; }
-        static uint32_t Hash(const Desc& desc) { return SkChecksum::Murmur3(&desc, sizeof(Desc)); }
+        static uint32_t Hash(const Desc& desc) { return SkOpts::hash(&desc, sizeof(Desc)); }
 
         // AtlasEntry proper
         AtlasEntry() : fAtlas(nullptr) {}
diff --git a/src/gpu/text/GrAtlasTextBlob.h b/src/gpu/text/GrAtlasTextBlob.h
index f76d026..afc11a9 100644
--- a/src/gpu/text/GrAtlasTextBlob.h
+++ b/src/gpu/text/GrAtlasTextBlob.h
@@ -14,6 +14,7 @@
 #include "GrMemoryPool.h"
 #include "SkDescriptor.h"
 #include "SkMaskFilter.h"
+#include "SkOpts.h"
 #include "SkPathEffect.h"
 #include "SkRasterizer.h"
 #include "SkSurfaceProps.h"
@@ -89,7 +90,7 @@
     }
 
     static uint32_t Hash(const Key& key) {
-        return SkChecksum::Murmur3(&key, sizeof(Key));
+        return SkOpts::hash(&key, sizeof(Key));
     }
 
     void operator delete(void* p) {
diff --git a/src/gpu/text/GrStencilAndCoverTextContext.h b/src/gpu/text/GrStencilAndCoverTextContext.h
index 9b29719..d4f1abd 100644
--- a/src/gpu/text/GrStencilAndCoverTextContext.h
+++ b/src/gpu/text/GrStencilAndCoverTextContext.h
@@ -11,6 +11,7 @@
 #include "GrDrawContext.h"
 #include "GrStyle.h"
 #include "SkDrawFilter.h"
+#include "SkOpts.h"
 #include "SkTextBlob.h"
 #include "SkTHash.h"
 #include "SkTInternalLList.h"
@@ -120,7 +121,7 @@
 
         static uint32_t Hash(const Key& key) {
             SkASSERT(key.count() > 1); // 1-length keys should be using the blob-id hash map.
-            return SkChecksum::Murmur3(key.begin(), sizeof(uint32_t) * key.count());
+            return SkOpts::hash(key.begin(), sizeof(uint32_t) * key.count());
         }
 
         TextBlob(uint32_t blobId, const SkTextBlob* skBlob, const SkPaint& skPaint)
diff --git a/src/gpu/vk/GrVkPipelineStateCache.cpp b/src/gpu/vk/GrVkPipelineStateCache.cpp
index 5e4013d..d404a8d 100644
--- a/src/gpu/vk/GrVkPipelineStateCache.cpp
+++ b/src/gpu/vk/GrVkPipelineStateCache.cpp
@@ -11,6 +11,7 @@
 #include "GrProcessor.h"
 #include "GrVkPipelineState.h"
 #include "GrVkPipelineStateBuilder.h"
+#include "SkOpts.h"
 #include "glsl/GrGLSLFragmentProcessor.h"
 #include "glsl/GrGLSLProgramDataManager.h"
 
@@ -112,8 +113,8 @@
     int keyLength = desc.fStateKey.count();
     SkASSERT(0 == (keyLength % 4));
     // Seed the checksum with the checksum of the programDesc then add the vulkan key to it.
-    desc.fChecksum = SkChecksum::Murmur3(desc.fStateKey.begin(), keyLength,
-                                         desc.fProgramDesc.getChecksum());
+    desc.fChecksum = SkOpts::hash(desc.fStateKey.begin(), keyLength,
+                                  desc.fProgramDesc.getChecksum());
 
     Entry* entry = nullptr;
     if (Entry** entryptr = fHashTable.find(desc)) {
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
new file mode 100644
index 0000000..346b16b
--- /dev/null
+++ b/src/opts/SkChecksum_opts.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkChecksum_opts_DEFINED
+#define SkChecksum_opts_DEFINED
+
+#include "SkChecksum.h"
+#include "SkTypes.h"
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+    #include <immintrin.h>
+#endif
+
+// TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
+// TODO: 32-bit x86 version: same sort of idea using only _mm_crc32_u32() and smaller
+
+namespace SK_OPTS_NS {
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
+    template <typename T>
+    static inline T unaligned_load(const uint8_t* src) {
+        T val;
+        memcpy(&val, src, sizeof(val));
+        return val;
+    }
+
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
+        auto data = (const uint8_t*)vdata;
+
+        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
+        uint64_t hash = seed;
+        if (bytes >= 24) {
+            // We'll create 3 independent hashes, each using _mm_crc32_u64()
+            // to hash 8 bytes per step.  Both 3 and independent are important:
+            // we can execute 3 of these instructions in parallel on a single core.
+            uint64_t a = hash,
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/24;
+            while (steps --> 0) {
+                a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
+                b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
+                c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
+                data += 24;
+            }
+            bytes %= 24;
+            hash = a^b^c;
+        }
+
+        SkASSERT(bytes < 24);
+        if (bytes >= 16) {
+            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            bytes -= 8;
+            data  += 8;
+        }
+
+        SkASSERT(bytes < 16);
+        if (bytes & 8) {
+            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            data  += 8;
+        }
+
+        // The remainder of these _mm_crc32_u*() operate on a 32-bit register.
+        // We don't lose anything here: only the bottom 32-bits were populated.
+        auto hash32 = (uint32_t)hash;
+
+        if (bytes & 4) {
+            hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
+        }
+        return hash32;
+    }
+
+#else
+    static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) {
+        // This is Murmur3.
+
+        // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
+        // and so not to apply strict-aliasing-based optimizations.
+        typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
+        typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
+
+        // Handle 4 bytes at a time while possible.
+        const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
+        const size_t words = bytes/4;
+        uint32_t hash = seed;
+        for (size_t i = 0; i < words; i++) {
+            uint32_t k = safe_data[i];
+            k *= 0xcc9e2d51;
+            k = (k << 15) | (k >> 17);
+            k *= 0x1b873593;
+
+            hash ^= k;
+            hash = (hash << 13) | (hash >> 19);
+            hash *= 5;
+            hash += 0xe6546b64;
+        }
+
+        // Handle last 0-3 bytes.
+        const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
+        uint32_t k = 0;
+        switch (bytes & 3) {
+            case 3: k ^= safe_tail[2] << 16;
+            case 2: k ^= safe_tail[1] <<  8;
+            case 1: k ^= safe_tail[0] <<  0;
+                    k *= 0xcc9e2d51;
+                    k = (k << 15) | (k >> 17);
+                    k *= 0x1b873593;
+                    hash ^= k;
+        }
+
+        hash ^= bytes;
+        return SkChecksum::Mix(hash);
+    }
+#endif
+
+}  // namespace SK_OPTS_NS
+
+#endif//SkChecksum_opts_DEFINED
diff --git a/src/opts/SkOpts_sse42.cpp b/src/opts/SkOpts_sse42.cpp
new file mode 100644
index 0000000..1883182
--- /dev/null
+++ b/src/opts/SkOpts_sse42.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkOpts.h"
+
+#define SK_OPTS_NS sse42
+#include "SkChecksum_opts.h"
+
+namespace SkOpts {
+    void Init_sse42() {
+        hash_fn = sse42::hash_fn;
+    }
+}
+
diff --git a/src/pdf/SkPDFGraphicState.h b/src/pdf/SkPDFGraphicState.h
index 84491ba..49723bd 100644
--- a/src/pdf/SkPDFGraphicState.h
+++ b/src/pdf/SkPDFGraphicState.h
@@ -10,7 +10,7 @@
 #define SkPDFGraphicState_DEFINED
 
 #include "SkPDFTypes.h"
-#include "SkChecksum.h"
+#include "SkOpts.h"
 
 class SkPaint;
 class SkPDFCanon;
@@ -63,7 +63,7 @@
     bool operator==(const SkPDFGraphicState& rhs) const {
         return 0 == memcmp(&fStrokeWidth, &rhs.fStrokeWidth, 12);
     }
-    uint32_t hash() const { return SkChecksum::Murmur3(&fStrokeWidth, 12); }
+    uint32_t hash() const { return SkOpts::hash(&fStrokeWidth, 12); }
 
 private:
     const SkScalar fStrokeWidth;
diff --git a/src/utils/SkWhitelistTypefaces.cpp b/src/utils/SkWhitelistTypefaces.cpp
index ac82f7c..139e697 100644
--- a/src/utils/SkWhitelistTypefaces.cpp
+++ b/src/utils/SkWhitelistTypefaces.cpp
@@ -5,8 +5,8 @@
  * found in the LICENSE file.
  */
 
-#include "SkChecksum.h"
 #include "SkFontDescriptor.h"
+#include "SkOpts.h"
 #include "SkStream.h"
 #include "SkString.h"
 #include "SkTypeface.h"
@@ -80,7 +80,7 @@
     if (!fontStream->peek(data.begin(), length)) {
         return 0;
     }
-    return SkChecksum::Murmur3(data.begin(), length);
+    return SkOpts::hash(data.begin(), length);
 }
 
 static void serialize_sub(const char* fontName, SkFontStyle style, SkWStream* wstream) {