Improve zlib inflate speed by using SSE4.2 crc32

Using an SSE4.2-based crc32 improves the decoding rate of the PNG
140 corpus by 4% on average, giving a total 40% performance increase
when combined with the adler32 SIMD code and inflate chunk copy code,
see https://crbug.com/796178#c2 for details.

Raw crc32 speed is 5x - 25x faster than the zlib default "BYFOUR"
crc32, and gzip- and zlib-wrapped inflate performance improves by
69% and 50% respectively for the snappy corpus (see
https://crbug.com/796178#c3 and #c4 for details).

Add crc32 SIMD implementation and update the call-site in crc32.c
to use the new crc32 code, using run-time detection of the SSE4.2
and PCLMUL support required by the crc32 SIMD code.

Update BUILD.gn to compile the crc32 SIMD code for Intel devices,
also update names.h with the new symbol defined by the crc32 SIMD
code path.

Bug: 796178
Change-Id: I1bb94b47c9a4934eed01ba3d4feda51d67c4bf85
Reviewed-on: https://chromium-review.googlesource.com/833820
Commit-Queue: Noel Gordon <noel@chromium.org>
Reviewed-by: Chris Blume <cblume@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#526935}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: 65e2abcb74b1c07fa14f46abaa1fb1717892eec3
diff --git a/BUILD.gn b/BUILD.gn
index c3cc8b5..e87d129 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -107,12 +107,41 @@
   public_configs = [ ":zlib_inflate_chunk_simd_config" ]
 }
 
+config("zlib_crc32_simd_config") {
+  if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    defines = [ "CRC32_SIMD_SSE42_PCLMUL" ]
+  }
+}
+
+source_set("zlib_crc32_simd") {
+  visibility = [ ":*" ]
+
+  if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    sources = [
+      "crc32_simd.c",
+      "crc32_simd.h",
+    ]
+
+    if (!is_win || is_clang) {
+      cflags = [
+        "-msse4.2",
+        "-mpclmul",
+      ]
+    }
+  }
+
+  public_configs = [ ":zlib_crc32_simd_config" ]
+}
+
 static_library("zlib_x86_simd") {
+  visibility = [ ":*" ]
+
   if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
     sources = [
       "crc_folding.c",
       "fill_window_sse.c",
     ]
+
     if (!is_win || is_clang) {
       cflags = [
         "-msse4.2",
@@ -176,6 +205,8 @@
   deps = []
 
   if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    deps += [ ":zlib_crc32_simd" ]
+
     deps += [ ":zlib_adler32_simd" ]
     sources += [ "x86.c" ]
 
diff --git a/crc32.c b/crc32.c
index 9162429..b4ad1e1 100644
--- a/crc32.c
+++ b/crc32.c
@@ -30,6 +30,7 @@
 
 #include "deflate.h"
 #include "x86.h"
+#include "crc32_simd.h"
 #include "zutil.h"      /* for STDC and FAR definitions */
 
 /* Definitions for doing the crc four data bytes at a time. */
@@ -241,6 +242,32 @@
     const unsigned char FAR *buf;
     uInt len;
 {
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
+    /*
+     * Use x86 sse4.2+pclmul SIMD to compute the crc32. Since this
+     * routine can be freely used, check the CPU features here, to
+     * stop TSAN complaining about thread data races accessing the
+     * x86_cpu_enable_simd feature variable below.
+     */
+    if (buf == Z_NULL) {
+        if (!len) /* Assume user is calling crc32(0, NULL, 0); */
+            x86_check_features();
+        return 0UL;
+    }
+
+    if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
+        /* crc32 16-byte chunks */
+        uInt chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
+        crc = ~crc32_sse42_simd_(buf, chunk_size, ~(uint32_t)crc);
+        /* check remaining data */
+        len -= chunk_size;
+        if (!len)
+            return crc;
+        /* Fall into the default crc32 for the remaining data. */
+        buf += chunk_size;
+    }
+#endif /* CRC32_SIMD_SSE42_PCLMUL */
+
     return crc32_z(crc, buf, len);
 }
 
diff --git a/crc32_simd.c b/crc32_simd.c
new file mode 100644
index 0000000..c2d4255
--- /dev/null
+++ b/crc32_simd.c
@@ -0,0 +1,157 @@
+/* crc32_simd.c
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#include "crc32_simd.h"
+
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
+
+/*
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc)
+{
+    /*
+     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+     * the CRC32+Barrett polynomials given at the end of the paper.
+     */
+    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+    /*
+     * There's at least one block of 64.
+     */
+    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+
+    x0 = _mm_load_si128((__m128i *)k1k2);
+
+    buf += 64;
+    len -= 64;
+
+    /*
+     * Parallel fold blocks of 64, if any.
+     */
+    while (len >= 64)
+    {
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+
+        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+        x1 = _mm_xor_si128(x1, x5);
+        x2 = _mm_xor_si128(x2, x6);
+        x3 = _mm_xor_si128(x3, x7);
+        x4 = _mm_xor_si128(x4, x8);
+
+        x1 = _mm_xor_si128(x1, y5);
+        x2 = _mm_xor_si128(x2, y6);
+        x3 = _mm_xor_si128(x3, y7);
+        x4 = _mm_xor_si128(x4, y8);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold into 128-bits.
+     */
+    x0 = _mm_load_si128((__m128i *)k3k4);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x2);
+    x1 = _mm_xor_si128(x1, x5);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x3);
+    x1 = _mm_xor_si128(x1, x5);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x4);
+    x1 = _mm_xor_si128(x1, x5);
+
+    /*
+     * Single fold blocks of 16, if any.
+     */
+    while (len >= 16)
+    {
+        x2 = _mm_loadu_si128((__m128i *)buf);
+
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x1 = _mm_xor_si128(x1, x2);
+        x1 = _mm_xor_si128(x1, x5);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+    x3 = _mm_set_epi32(0, ~0, 0, ~0);
+    x1 = _mm_srli_si128(x1, 8);
+    x1 = _mm_xor_si128(x1, x2);
+
+    x0 = _mm_loadl_epi64((__m128i*)k5k0);
+
+    x2 = _mm_srli_si128(x1, 4);
+    x1 = _mm_and_si128(x1, x3);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+
+    /*
+     * Barret reduce to 32-bits.
+     */
+    x0 = _mm_load_si128((__m128i*)poly);
+
+    x2 = _mm_and_si128(x1, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+    x2 = _mm_and_si128(x2, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+
+    /*
+     * Return the crc32.
+     */
+    return _mm_extract_epi32(x1, 1);
+}
+
+#endif  /* CRC32_SIMD_SSE42_PCLMUL */
diff --git a/crc32_simd.h b/crc32_simd.h
new file mode 100644
index 0000000..4e6f326
--- /dev/null
+++ b/crc32_simd.h
@@ -0,0 +1,27 @@
+/* crc32_simd.h
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#include <stdint.h>
+
+#include "zconf.h"
+#include "zutil.h"
+
+/*
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16.
+ */
+uint32_t ZLIB_INTERNAL crc32_sse42_simd_(
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc);
+
+/*
+ * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
+ * for computing the crc32 of an arbitrary length buffer.
+ */
+#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
+#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
diff --git a/names.h b/names.h
index c18b90f..55a8a3f 100644
--- a/names.h
+++ b/names.h
@@ -176,4 +176,9 @@
 #define inflate_fast_chunk_ Cr_z_inflate_fast_chunk_
 #endif
 
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
+/* Symbols added by crc32_simd.c */
+#define crc32_sse42_simd_ Cr_z_crc32_sse42_simd_
+#endif
+
 #endif  /* THIRD_PARTY_ZLIB_NAMES_H_ */