Use sse4.2 CRC32 instructions to hash when available.

About 9x faster than Murmur3 for long inputs.

Most of this is a mechanical change from SkChecksum::Murmur3(...) to SkOpts::hash(...).

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2208903002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;master.client.skia.compile:Build-Ubuntu-GCC-x86_64-Release-CMake-Trybot,Build-Mac-Clang-x86_64-Release-CMake-Trybot

Review-Url: https://codereview.chromium.org/2208903002
diff --git a/BUILD.gn b/BUILD.gn
index 76b04d5..ff3c492 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -158,6 +158,13 @@
   cflags = [ "-msse4.1" ]
 }
 
+source_set("opts_sse42") {
+  configs += skia_library_configs
+
+  sources = opts_gypi.sse42_sources
+  cflags = [ "-msse4.2" ]
+}
+
 source_set("opts_avx") {
   configs += skia_library_configs
 
@@ -172,6 +179,7 @@
   deps = [
     ":opts_avx",
     ":opts_sse41",
+    ":opts_sse42",
     ":opts_ssse3",
     "//third_party/expat",
     "//third_party/giflib",
diff --git a/bench/ChecksumBench.cpp b/bench/ChecksumBench.cpp
index 4c2ac1a..f84cd3d 100644
--- a/bench/ChecksumBench.cpp
+++ b/bench/ChecksumBench.cpp
@@ -7,13 +7,14 @@
 #include "Benchmark.h"
 #include "SkCanvas.h"
 #include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkMD5.h"
 #include "SkRandom.h"
 #include "SkTemplates.h"
 
 enum ChecksumType {
     kMD5_ChecksumType,
-    kMurmur3_ChecksumType,
+    kHash_ChecksumType,
 };
 
 class ComputeChecksumBench : public Benchmark {
@@ -40,7 +41,7 @@
     const char* onGetName() override {
         switch (fType) {
             case kMD5_ChecksumType: return "compute_md5";
-            case kMurmur3_ChecksumType: return "compute_murmur3";
+            case kHash_ChecksumType: return "compute_hash";
 
             default: SK_ABORT("Invalid Type"); return "";
         }
@@ -56,9 +57,9 @@
                     md5.finish(digest);
                 }
             } break;
-            case kMurmur3_ChecksumType: {
+            case kHash_ChecksumType: {
                 for (int i = 0; i < loops; i++) {
-                    volatile uint32_t result = SkChecksum::Murmur3(fData, sizeof(fData));
+                    volatile uint32_t result = SkOpts::hash(fData, sizeof(fData));
                     sk_ignore_unused_variable(result);
                 }
             }break;
@@ -73,4 +74,4 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 DEF_BENCH( return new ComputeChecksumBench(kMD5_ChecksumType); )
-DEF_BENCH( return new ComputeChecksumBench(kMurmur3_ChecksumType); )
+DEF_BENCH( return new ComputeChecksumBench(kHash_ChecksumType); )
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index b18510e..fd1b98c 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -130,12 +130,14 @@
 
 # Certain files must be compiled with support for SSSE3, SSE4.1, AVX, or AVX2 intrinsics.
 file (GLOB_RECURSE ssse3_srcs ../src/*ssse3*.cpp ../src/*SSSE3*.cpp)
-file (GLOB_RECURSE sse41_srcs ../src/*sse4*.cpp ../src/*SSE4*.cpp)
+file (GLOB_RECURSE sse41_srcs ../src/*sse41*.cpp ../src/*SSE41*.cpp)
+file (GLOB_RECURSE sse42_srcs ../src/*sse42*.cpp ../src/*SSE42*.cpp)
 file (GLOB_RECURSE avx_srcs   ../src/*_avx.cpp)
 file (GLOB_RECURSE avx2_srcs  ../src/*_avx2.cpp)
 if (NOT WIN32)
     set_source_files_properties(${ssse3_srcs} PROPERTIES COMPILE_FLAGS -mssse3)
     set_source_files_properties(${sse41_srcs} PROPERTIES COMPILE_FLAGS -msse4.1)
+    set_source_files_properties(${sse42_srcs} PROPERTIES COMPILE_FLAGS -msse4.2)
     set_source_files_properties(${avx_srcs}   PROPERTIES COMPILE_FLAGS -mavx)
     set_source_files_properties(${avx2_srcs}  PROPERTIES COMPILE_FLAGS -mavx2)
 endif()
diff --git a/gyp/core.gypi b/gyp/core.gypi
index 306c90f..76ed86b 100644
--- a/gyp/core.gypi
+++ b/gyp/core.gypi
@@ -65,7 +65,6 @@
         '<(skia_src_path)/core/SkCachedData.cpp',
         '<(skia_src_path)/core/SkCanvas.cpp',
         '<(skia_src_path)/core/SkCanvasPriv.h',
-        '<(skia_src_path)/core/SkChecksum.cpp',
         '<(skia_src_path)/core/SkChunkAlloc.cpp',
         '<(skia_src_path)/core/SkClipStack.cpp',
         '<(skia_src_path)/core/SkColor.cpp',
diff --git a/gyp/opts.gypi b/gyp/opts.gypi
index 6db9d0c..e89334d 100644
--- a/gyp/opts.gypi
+++ b/gyp/opts.gypi
@@ -49,13 +49,13 @@
         'sse41_sources': [
             '<(skia_src_path)/opts/SkOpts_sse41.cpp',
         ],
+        'sse42_sources': [
+            '<(skia_src_path)/opts/SkOpts_sse42.cpp',
+        ],
         'avx_sources': [
             '<(skia_src_path)/opts/SkOpts_avx.cpp',
         ],
         # These targets are empty, but XCode doesn't like that, so add an empty file to each.
-        'sse42_sources': [
-            '<(skia_src_path)/core/SkForceCPlusPlusLinking.cpp',
-        ],
         'avx2_sources': [
             '<(skia_src_path)/core/SkForceCPlusPlusLinking.cpp',
         ],
diff --git a/include/private/SkChecksum.h b/include/private/SkChecksum.h
index 6289a44..8a04c89 100644
--- a/include/private/SkChecksum.h
+++ b/include/private/SkChecksum.h
@@ -12,6 +12,12 @@
 #include "SkTLogic.h"
 #include "SkTypes.h"
 
+// #include "SkOpts.h"
+// It's sort of pesky to be able to include SkOpts.h here, so we'll just re-declare what we need.
+namespace SkOpts {
+    extern uint32_t (*hash_fn)(const void*, size_t, uint32_t);
+}
+
 class SkChecksum : SkNoncopyable {
 public:
     /**
@@ -41,17 +47,6 @@
         hash ^= hash >> 16;
         return hash;
     }
-
-    /**
-     * Calculate 32-bit Murmur hash (murmur3).
-     * See en.wikipedia.org/wiki/MurmurHash.
-     *
-     *  @param data Memory address of the data block to be processed.
-     *  @param size Size of the data block in bytes.
-     *  @param seed Initial hash seed. (optional)
-     *  @return hash result
-     */
-    static uint32_t Murmur3(const void* data, size_t bytes, uint32_t seed=0);
 };
 
 // SkGoodHash should usually be your first choice in hashing data.
@@ -64,11 +59,11 @@
 
     template <typename K>
     SK_WHEN(sizeof(K) != 4, uint32_t) operator()(const K& k) const {
-        return SkChecksum::Murmur3(&k, sizeof(K));
+        return SkOpts::hash_fn(&k, sizeof(K), 0);
     }
 
     uint32_t operator()(const SkString& k) const {
-        return SkChecksum::Murmur3(k.c_str(), k.size());
+        return SkOpts::hash_fn(k.c_str(), k.size(), 0);
     }
 };
 
diff --git a/src/core/SkChecksum.cpp b/src/core/SkChecksum.cpp
deleted file mode 100644
index 4457eb4..0000000
--- a/src/core/SkChecksum.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkChecksum.h"
-
-uint32_t SkChecksum::Murmur3(const void* data, size_t bytes, uint32_t seed) {
-    // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
-    // and so not to apply strict-aliasing-based optimizations.
-    typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
-    typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
-
-    // Handle 4 bytes at a time while possible.
-    const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
-    const size_t words = bytes/4;
-    uint32_t hash = seed;
-    for (size_t i = 0; i < words; i++) {
-        uint32_t k = safe_data[i];
-        k *= 0xcc9e2d51;
-        k = (k << 15) | (k >> 17);
-        k *= 0x1b873593;
-
-        hash ^= k;
-        hash = (hash << 13) | (hash >> 19);
-        hash *= 5;
-        hash += 0xe6546b64;
-    }
-
-    // Handle last 0-3 bytes.
-    const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
-    uint32_t k = 0;
-    switch (bytes & 3) {
-        case 3: k ^= safe_tail[2] << 16;
-        case 2: k ^= safe_tail[1] <<  8;
-        case 1: k ^= safe_tail[0] <<  0;
-                k *= 0xcc9e2d51;
-                k = (k << 15) | (k >> 17);
-                k *= 0x1b873593;
-                hash ^= k;
-    }
-
-    hash ^= bytes;
-    return SkChecksum::Mix(hash);
-}
diff --git a/src/core/SkDescriptor.h b/src/core/SkDescriptor.h
index 71f7133..efa0278 100644
--- a/src/core/SkDescriptor.h
+++ b/src/core/SkDescriptor.h
@@ -9,7 +9,7 @@
 #ifndef SkDescriptor_DEFINED
 #define SkDescriptor_DEFINED
 
-#include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkTypes.h"
 
 class SkDescriptor : SkNoncopyable {
@@ -123,7 +123,7 @@
     static uint32_t ComputeChecksum(const SkDescriptor* desc) {
         const uint32_t* ptr = (const uint32_t*)desc + 1; // skip the checksum field
         size_t len = desc->fLength - sizeof(uint32_t);
-        return SkChecksum::Murmur3(ptr, len);
+        return SkOpts::hash(ptr, len);
     }
 
     // private so no one can create one except our factories
diff --git a/src/core/SkImageFilterCache.cpp b/src/core/SkImageFilterCache.cpp
index ba8a32c..c7104de 100644
--- a/src/core/SkImageFilterCache.cpp
+++ b/src/core/SkImageFilterCache.cpp
@@ -7,9 +7,9 @@
 
 #include "SkImageFilterCache.h"
 
-#include "SkChecksum.h"
 #include "SkMutex.h"
 #include "SkOnce.h"
+#include "SkOpts.h"
 #include "SkRefCnt.h"
 #include "SkSpecialImage.h"
 #include "SkTDynamicHash.h"
@@ -47,7 +47,7 @@
             return v.fKey;
         }
         static uint32_t Hash(const Key& key) {
-            return SkChecksum::Murmur3(reinterpret_cast<const uint32_t*>(&key), sizeof(Key));
+            return SkOpts::hash(reinterpret_cast<const uint32_t*>(&key), sizeof(Key));
         }
         SK_DECLARE_INTERNAL_LLIST_INTERFACE(Value);
     };
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 5263fe4..a4da111 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -26,6 +26,7 @@
 #include "SkBlitMask_opts.h"
 #include "SkBlitRow_opts.h"
 #include "SkBlurImageFilter_opts.h"
+#include "SkChecksum_opts.h"
 #include "SkColorCubeFilter_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkSwizzler_opts.h"
@@ -70,12 +71,14 @@
     DEFINE_DEFAULT(inverted_CMYK_to_BGR1);
 
     DEFINE_DEFAULT(srcover_srgb_srgb);
+
+    DEFINE_DEFAULT(hash_fn);
 #undef DEFINE_DEFAULT
 
     // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
     void Init_ssse3();
     void Init_sse41();
-    void Init_sse42() {}
+    void Init_sse42();
     void Init_avx();
     void Init_avx2() {}
 
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 7c6cfb0..44e337d 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -65,6 +65,12 @@
     // Blend ndst src pixels over dst, where both src and dst point to sRGB pixels (RGBA or BGRA).
     // If nsrc < ndst, we loop over src to create a pattern.
     extern void (*srcover_srgb_srgb)(uint32_t* dst, const uint32_t* src, int ndst, int nsrc);
+
+    // The fastest high quality 32-bit hash we can provide on this platform.
+    extern uint32_t (*hash_fn)(const void*, size_t, uint32_t seed);
+    static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
+        return hash_fn(data, bytes, seed);
+    }
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/core/SkPaint.cpp b/src/core/SkPaint.cpp
index 07c10d5..44b2928 100644
--- a/src/core/SkPaint.cpp
+++ b/src/core/SkPaint.cpp
@@ -7,7 +7,6 @@
 
 #include "SkPaint.h"
 #include "SkAutoKern.h"
-#include "SkChecksum.h"
 #include "SkColorFilter.h"
 #include "SkData.h"
 #include "SkDraw.h"
@@ -19,6 +18,7 @@
 #include "SkMutex.h"
 #include "SkReadBuffer.h"
 #include "SkWriteBuffer.h"
+#include "SkOpts.h"
 #include "SkPaintDefaults.h"
 #include "SkPathEffect.h"
 #include "SkRasterizer.h"
@@ -2388,6 +2388,6 @@
     // so fBitfields should be 10 pointers and 6 32-bit values from the start.
     static_assert(offsetof(SkPaint, fBitfields) == 9 * sizeof(void*) + 6 * sizeof(uint32_t),
                   "SkPaint_notPackedTightly");
-    return SkChecksum::Murmur3(reinterpret_cast<const uint32_t*>(this),
-                               offsetof(SkPaint, fBitfields) + sizeof(fBitfields));
+    return SkOpts::hash(reinterpret_cast<const uint32_t*>(this),
+                        offsetof(SkPaint, fBitfields) + sizeof(fBitfields));
 }
diff --git a/src/core/SkResourceCache.cpp b/src/core/SkResourceCache.cpp
index e465132..4bdc8dd 100644
--- a/src/core/SkResourceCache.cpp
+++ b/src/core/SkResourceCache.cpp
@@ -5,10 +5,10 @@
  * found in the LICENSE file.
  */
 
-#include "SkChecksum.h"
 #include "SkMessageBus.h"
 #include "SkMipMap.h"
 #include "SkMutex.h"
+#include "SkOpts.h"
 #include "SkPixelRef.h"
 #include "SkResourceCache.h"
 #include "SkTraceMemoryDump.h"
@@ -46,9 +46,9 @@
     fSharedID_lo = (uint32_t)sharedID;
     fSharedID_hi = (uint32_t)(sharedID >> 32);
     fNamespace = nameSpace;
-    // skip unhashed fields when computing the murmur
-    fHash = SkChecksum::Murmur3(this->as32() + kUnhashedLocal32s,
-                                (fCount32 - kUnhashedLocal32s) << 2);
+    // skip unhashed fields when computing the hash
+    fHash = SkOpts::hash(this->as32() + kUnhashedLocal32s,
+                         (fCount32 - kUnhashedLocal32s) << 2);
 }
 
 #include "SkTDynamicHash.h"
diff --git a/src/gpu/GrProgramDesc.h b/src/gpu/GrProgramDesc.h
index ec6447d..b17d146 100644
--- a/src/gpu/GrProgramDesc.h
+++ b/src/gpu/GrProgramDesc.h
@@ -10,7 +10,8 @@
 
 #include "GrColor.h"
 #include "GrTypesPriv.h"
-#include "SkChecksum.h"
+#include "SkOpts.h"
+#include "SkTArray.h"
 
 /** This class describes a program to generate. It also serves as a program cache key. Very little
     of this is GL-specific. The GL-specific parts could be factored out into a subclass. */
@@ -112,7 +113,7 @@
 
         uint32_t* checksum = this->atOffset<uint32_t, GrProgramDesc::kChecksumOffset>();
         *checksum = 0;  // We'll hash through these bytes, so make sure they're initialized.
-        *checksum = SkChecksum::Murmur3(fKey.begin(), keyLength);
+        *checksum = SkOpts::hash(fKey.begin(), keyLength);
     }
 
     // The key, stored in fKey, is composed of four parts:
diff --git a/src/gpu/GrResourceCache.cpp b/src/gpu/GrResourceCache.cpp
index 71e40f1..62360ed 100644
--- a/src/gpu/GrResourceCache.cpp
+++ b/src/gpu/GrResourceCache.cpp
@@ -11,9 +11,9 @@
 #include "GrCaps.h"
 #include "GrGpuResourceCacheAccess.h"
 #include "GrTracing.h"
-#include "SkChecksum.h"
 #include "SkGr.h"
 #include "SkMessageBus.h"
+#include "SkOpts.h"
 #include "SkTSort.h"
 
 DECLARE_SKMESSAGEBUS_MESSAGE(GrUniqueKeyInvalidatedMessage);
@@ -43,7 +43,7 @@
 }
 
 uint32_t GrResourceKeyHash(const uint32_t* data, size_t size) {
-    return SkChecksum::Murmur3(data, size);
+    return SkOpts::hash(data, size);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -687,7 +687,7 @@
                 SkASSERT(SkBudgeted::kNo == resource->resourcePriv().isBudgeted() ||
                          uniqueKey.isValid());
                 if (!uniqueKey.isValid()) {
-                    ++fCouldBeScratch;                
+                    ++fCouldBeScratch;
                     SkASSERT(fScratchMap->countForKey(scratchKey));
                 }
                 SkASSERT(!resource->resourcePriv().refsWrappedObjects());
diff --git a/src/gpu/batches/GrAADistanceFieldPathRenderer.h b/src/gpu/batches/GrAADistanceFieldPathRenderer.h
index db17b07..985b2f1 100755
--- a/src/gpu/batches/GrAADistanceFieldPathRenderer.h
+++ b/src/gpu/batches/GrAADistanceFieldPathRenderer.h
@@ -13,7 +13,7 @@
 #include "GrRect.h"
 #include "GrShape.h"
 
-#include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkTDynamicHash.h"
 
 class GrContext;
@@ -81,7 +81,7 @@
         }
 
         static inline uint32_t Hash(Key key) {
-            return SkChecksum::Murmur3(key.data(), sizeof(uint32_t) * key.count32());
+            return SkOpts::hash(key.data(), sizeof(uint32_t) * key.count32());
         }
     };
 
diff --git a/src/gpu/effects/GrTextureStripAtlas.h b/src/gpu/effects/GrTextureStripAtlas.h
index 91ce61c..5b90a34 100644
--- a/src/gpu/effects/GrTextureStripAtlas.h
+++ b/src/gpu/effects/GrTextureStripAtlas.h
@@ -9,7 +9,7 @@
 #define GrTextureStripAtlas_DEFINED
 
 #include "SkBitmap.h"
-#include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkGr.h"
 #include "SkTDArray.h"
 #include "SkTDynamicHash.h"
@@ -142,7 +142,7 @@
     public:
         // for SkTDynamicHash
         static const Desc& GetKey(const AtlasEntry& entry) { return entry.fDesc; }
-        static uint32_t Hash(const Desc& desc) { return SkChecksum::Murmur3(&desc, sizeof(Desc)); }
+        static uint32_t Hash(const Desc& desc) { return SkOpts::hash(&desc, sizeof(Desc)); }
 
         // AtlasEntry proper
         AtlasEntry() : fAtlas(nullptr) {}
diff --git a/src/gpu/text/GrAtlasTextBlob.h b/src/gpu/text/GrAtlasTextBlob.h
index f76d026..afc11a9 100644
--- a/src/gpu/text/GrAtlasTextBlob.h
+++ b/src/gpu/text/GrAtlasTextBlob.h
@@ -14,6 +14,7 @@
 #include "GrMemoryPool.h"
 #include "SkDescriptor.h"
 #include "SkMaskFilter.h"
+#include "SkOpts.h"
 #include "SkPathEffect.h"
 #include "SkRasterizer.h"
 #include "SkSurfaceProps.h"
@@ -89,7 +90,7 @@
     }
 
     static uint32_t Hash(const Key& key) {
-        return SkChecksum::Murmur3(&key, sizeof(Key));
+        return SkOpts::hash(&key, sizeof(Key));
     }
 
     void operator delete(void* p) {
diff --git a/src/gpu/text/GrStencilAndCoverTextContext.h b/src/gpu/text/GrStencilAndCoverTextContext.h
index 9b29719..d4f1abd 100644
--- a/src/gpu/text/GrStencilAndCoverTextContext.h
+++ b/src/gpu/text/GrStencilAndCoverTextContext.h
@@ -11,6 +11,7 @@
 #include "GrDrawContext.h"
 #include "GrStyle.h"
 #include "SkDrawFilter.h"
+#include "SkOpts.h"
 #include "SkTextBlob.h"
 #include "SkTHash.h"
 #include "SkTInternalLList.h"
@@ -120,7 +121,7 @@
 
         static uint32_t Hash(const Key& key) {
             SkASSERT(key.count() > 1); // 1-length keys should be using the blob-id hash map.
-            return SkChecksum::Murmur3(key.begin(), sizeof(uint32_t) * key.count());
+            return SkOpts::hash(key.begin(), sizeof(uint32_t) * key.count());
         }
 
         TextBlob(uint32_t blobId, const SkTextBlob* skBlob, const SkPaint& skPaint)
diff --git a/src/gpu/vk/GrVkPipelineStateCache.cpp b/src/gpu/vk/GrVkPipelineStateCache.cpp
index 5e4013d..d404a8d 100644
--- a/src/gpu/vk/GrVkPipelineStateCache.cpp
+++ b/src/gpu/vk/GrVkPipelineStateCache.cpp
@@ -11,6 +11,7 @@
 #include "GrProcessor.h"
 #include "GrVkPipelineState.h"
 #include "GrVkPipelineStateBuilder.h"
+#include "SkOpts.h"
 #include "glsl/GrGLSLFragmentProcessor.h"
 #include "glsl/GrGLSLProgramDataManager.h"
 
@@ -112,8 +113,8 @@
     int keyLength = desc.fStateKey.count();
     SkASSERT(0 == (keyLength % 4));
     // Seed the checksum with the checksum of the programDesc then add the vulkan key to it.
-    desc.fChecksum = SkChecksum::Murmur3(desc.fStateKey.begin(), keyLength,
-                                         desc.fProgramDesc.getChecksum());
+    desc.fChecksum = SkOpts::hash(desc.fStateKey.begin(), keyLength,
+                                  desc.fProgramDesc.getChecksum());
 
     Entry* entry = nullptr;
     if (Entry** entryptr = fHashTable.find(desc)) {
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
new file mode 100644
index 0000000..346b16b
--- /dev/null
+++ b/src/opts/SkChecksum_opts.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkChecksum_opts_DEFINED
+#define SkChecksum_opts_DEFINED
+
+#include "SkChecksum.h"
+#include "SkTypes.h"
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+    #include <immintrin.h>
+#endif
+
+// TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
+// TODO: 32-bit x86 version: same sort of idea using only _mm_crc32_u32() and smaller
+
+namespace SK_OPTS_NS {
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
+    template <typename T>
+    static inline T unaligned_load(const uint8_t* src) {
+        T val;
+        memcpy(&val, src, sizeof(val));
+        return val;
+    }
+
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
+        auto data = (const uint8_t*)vdata;
+
+        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
+        uint64_t hash = seed;
+        if (bytes >= 24) {
+            // We'll create 3 independent hashes, each using _mm_crc32_u64()
+            // to hash 8 bytes per step.  Both 3 and independent are important:
+            // we can execute 3 of these instructions in parallel on a single core.
+            uint64_t a = hash,
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/24;
+            while (steps --> 0) {
+                a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
+                b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
+                c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
+                data += 24;
+            }
+            bytes %= 24;
+            hash = a^b^c;
+        }
+
+        SkASSERT(bytes < 24);
+        if (bytes >= 16) {
+            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            bytes -= 8;
+            data  += 8;
+        }
+
+        SkASSERT(bytes < 16);
+        if (bytes & 8) {
+            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            data  += 8;
+        }
+
+        // The remainder of these _mm_crc32_u*() operate on a 32-bit register.
+        // We don't lose anything here: only the bottom 32-bits were populated.
+        auto hash32 = (uint32_t)hash;
+
+        if (bytes & 4) {
+            hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
+        }
+        return hash32;
+    }
+
+#else
+    static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) {
+        // This is Murmur3.
+
+        // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
+        // and so not to apply strict-aliasing-based optimizations.
+        typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
+        typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
+
+        // Handle 4 bytes at a time while possible.
+        const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
+        const size_t words = bytes/4;
+        uint32_t hash = seed;
+        for (size_t i = 0; i < words; i++) {
+            uint32_t k = safe_data[i];
+            k *= 0xcc9e2d51;
+            k = (k << 15) | (k >> 17);
+            k *= 0x1b873593;
+
+            hash ^= k;
+            hash = (hash << 13) | (hash >> 19);
+            hash *= 5;
+            hash += 0xe6546b64;
+        }
+
+        // Handle last 0-3 bytes.
+        const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
+        uint32_t k = 0;
+        switch (bytes & 3) {
+            case 3: k ^= safe_tail[2] << 16;
+            case 2: k ^= safe_tail[1] <<  8;
+            case 1: k ^= safe_tail[0] <<  0;
+                    k *= 0xcc9e2d51;
+                    k = (k << 15) | (k >> 17);
+                    k *= 0x1b873593;
+                    hash ^= k;
+        }
+
+        hash ^= bytes;
+        return SkChecksum::Mix(hash);
+    }
+#endif
+
+}  // namespace SK_OPTS_NS
+
+#endif//SkChecksum_opts_DEFINED
diff --git a/src/opts/SkOpts_sse42.cpp b/src/opts/SkOpts_sse42.cpp
new file mode 100644
index 0000000..1883182
--- /dev/null
+++ b/src/opts/SkOpts_sse42.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkOpts.h"
+
+#define SK_OPTS_NS sse42
+#include "SkChecksum_opts.h"
+
+namespace SkOpts {
+    void Init_sse42() {
+        hash_fn = sse42::hash_fn;
+    }
+}
+
diff --git a/src/pdf/SkPDFGraphicState.h b/src/pdf/SkPDFGraphicState.h
index 84491ba..49723bd 100644
--- a/src/pdf/SkPDFGraphicState.h
+++ b/src/pdf/SkPDFGraphicState.h
@@ -10,7 +10,7 @@
 #define SkPDFGraphicState_DEFINED
 
 #include "SkPDFTypes.h"
-#include "SkChecksum.h"
+#include "SkOpts.h"
 
 class SkPaint;
 class SkPDFCanon;
@@ -63,7 +63,7 @@
     bool operator==(const SkPDFGraphicState& rhs) const {
         return 0 == memcmp(&fStrokeWidth, &rhs.fStrokeWidth, 12);
     }
-    uint32_t hash() const { return SkChecksum::Murmur3(&fStrokeWidth, 12); }
+    uint32_t hash() const { return SkOpts::hash(&fStrokeWidth, 12); }
 
 private:
     const SkScalar fStrokeWidth;
diff --git a/src/utils/SkWhitelistTypefaces.cpp b/src/utils/SkWhitelistTypefaces.cpp
index ac82f7c..139e697 100644
--- a/src/utils/SkWhitelistTypefaces.cpp
+++ b/src/utils/SkWhitelistTypefaces.cpp
@@ -5,8 +5,8 @@
  * found in the LICENSE file.
  */
 
-#include "SkChecksum.h"
 #include "SkFontDescriptor.h"
+#include "SkOpts.h"
 #include "SkStream.h"
 #include "SkString.h"
 #include "SkTypeface.h"
@@ -80,7 +80,7 @@
     if (!fontStream->peek(data.begin(), length)) {
         return 0;
     }
-    return SkChecksum::Murmur3(data.begin(), length);
+    return SkOpts::hash(data.begin(), length);
 }
 
 static void serialize_sub(const char* fontName, SkFontStyle style, SkWStream* wstream) {
diff --git a/tests/ChecksumTest.cpp b/tests/ChecksumTest.cpp
index cf9d65c..2aaf268 100644
--- a/tests/ChecksumTest.cpp
+++ b/tests/ChecksumTest.cpp
@@ -5,21 +5,11 @@
  * found in the LICENSE file.
  */
 
-#include "SkChecksum.h"
+#include "SkOpts.h"
 #include "SkRandom.h"
 #include "Test.h"
 
-
-// Murmur3 has an optional third seed argument, so we wrap it to fit a uniform type.
-static uint32_t murmur_noseed(const uint32_t* d, size_t l) { return SkChecksum::Murmur3(d, l); }
-
-#define ASSERT(x) REPORTER_ASSERT(r, x)
-
 DEF_TEST(Checksum, r) {
-    // Algorithms to test.  They're currently all uint32_t(const uint32_t*, size_t).
-    typedef uint32_t(*algorithmProc)(const uint32_t*, size_t);
-    const algorithmProc kAlgorithms[] = { &murmur_noseed };
-
     // Put 128 random bytes into two identical buffers.  Any multiple of 4 will do.
     const size_t kBytes = SkAlign4(128);
     SkRandom rand;
@@ -28,38 +18,26 @@
         data[i] = tweaked[i] = rand.nextU();
     }
 
-    // Test each algorithm.
-    for (size_t i = 0; i < SK_ARRAY_COUNT(kAlgorithms); ++i) {
-        const algorithmProc algorithm = kAlgorithms[i];
+    // Hash of nullptr is always 0.
+    REPORTER_ASSERT(r, SkOpts::hash(nullptr, 0) == 0);
 
-        // Hash of nullptr is always 0.
-        ASSERT(algorithm(nullptr, 0) == 0);
+    const uint32_t hash = SkOpts::hash(data, kBytes);
+    // Should be deterministic.
+    REPORTER_ASSERT(r, hash == SkOpts::hash(data, kBytes));
 
-        const uint32_t hash = algorithm(data, kBytes);
-        // Should be deterministic.
-        ASSERT(hash == algorithm(data, kBytes));
-
-        // Changing any single element should change the hash.
-        for (size_t j = 0; j < SK_ARRAY_COUNT(tweaked); ++j) {
-            const uint32_t saved = tweaked[j];
-            tweaked[j] = rand.nextU();
-            const uint32_t tweakedHash = algorithm(tweaked, kBytes);
-            ASSERT(tweakedHash != hash);
-            ASSERT(tweakedHash == algorithm(tweaked, kBytes));
-            tweaked[j] = saved;
-        }
+    // Changing any single element should change the hash.
+    for (size_t j = 0; j < SK_ARRAY_COUNT(tweaked); ++j) {
+        const uint32_t saved = tweaked[j];
+        tweaked[j] = rand.nextU();
+        const uint32_t tweakedHash = SkOpts::hash(tweaked, kBytes);
+        REPORTER_ASSERT(r, tweakedHash != hash);
+        REPORTER_ASSERT(r, tweakedHash == SkOpts::hash(tweaked, kBytes));
+        tweaked[j] = saved;
     }
 }
 
 DEF_TEST(GoodHash, r) {
-    ASSERT(SkGoodHash()(( int32_t)4) ==  614249093);  // 4 bytes.  Hits SkChecksum::Mix fast path.
-    ASSERT(SkGoodHash()((uint32_t)4) ==  614249093);  // (Ditto)
-
-    // None of these are 4 byte sized, so they use SkChecksum::Murmur3, not SkChecksum::Mix.
-    ASSERT(SkGoodHash()((uint64_t)4) == 3491892518);
-    ASSERT(SkGoodHash()((uint16_t)4) ==  899251846);
-    ASSERT(SkGoodHash()( (uint8_t)4) ==  962700458);
-
-    // Tests SkString is correctly specialized.
-    ASSERT(SkGoodHash()(SkString("Hi")) == 55667557);
+    // 4 bytes --> hits SkChecksum::Mix fast path.
+    REPORTER_ASSERT(r, SkGoodHash()(( int32_t)4) ==  614249093);
+    REPORTER_ASSERT(r, SkGoodHash()((uint32_t)4) ==  614249093);
 }
diff --git a/tools/UrlDataManager.h b/tools/UrlDataManager.h
index 49646ca..bae1dc7 100644
--- a/tools/UrlDataManager.h
+++ b/tools/UrlDataManager.h
@@ -8,8 +8,8 @@
 #ifndef SkUrlDataManager_DEFINED
 #define SkUrlDataManager_DEFINED
 
-#include "SkChecksum.h"
 #include "SkData.h"
+#include "SkOpts.h"
 #include "SkString.h"
 #include "SkTDynamicHash.h"
 
@@ -52,7 +52,7 @@
         }
 
         static uint32_t Hash(const SkData& key) {
-            return SkChecksum::Murmur3(key.bytes(), key.size());
+            return SkOpts::hash(key.bytes(), key.size());
         }
     };
 
@@ -62,7 +62,7 @@
         }
 
         static uint32_t Hash(const SkString& key) {
-            return SkChecksum::Murmur3(key.c_str(), strlen(key.c_str()));
+            return SkOpts::hash(key.c_str(), strlen(key.c_str()));
         }
     };