Improve zlib inflate speed by using SSE4.2 crc32

Using an SSE4.2-based crc32 improves the decoding rate of the PNG
140 corpus by 4% on average, giving a total 40% performance increase
when combined with the adler32 SIMD code and inflate chunk copy code,
see https://crbug.com/796178#c2 for details.

Raw crc32 speed is 5x - 25x faster than the zlib default "BYFOUR"
crc32, and gzip- and zlib-wrapped inflate performance improves by
69% and 50% respectively for the snappy corpus (see
https://crbug.com/796178#c3 and #c4 for details).

Add crc32 SIMD implementation and update the call-site in crc32.c
to use the new crc32 code, using run-time detection of the SSE4.2
and PCLMUL support required by the crc32 SIMD code.

Update BUILD.gn to compile the crc32 SIMD code for Intel devices,
also update names.h with the new symbol defined by the crc32 SIMD
code path.

Bug: 796178
Change-Id: I1bb94b47c9a4934eed01ba3d4feda51d67c4bf85
Reviewed-on: https://chromium-review.googlesource.com/833820
Commit-Queue: Noel Gordon <noel@chromium.org>
Reviewed-by: Chris Blume <cblume@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#526935}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: 65e2abcb74b1c07fa14f46abaa1fb1717892eec3
diff --git a/BUILD.gn b/BUILD.gn
index c3cc8b5..e87d129 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -107,12 +107,41 @@
   public_configs = [ ":zlib_inflate_chunk_simd_config" ]
 }
 
+config("zlib_crc32_simd_config") {
+  if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    defines = [ "CRC32_SIMD_SSE42_PCLMUL" ]
+  }
+}
+
+source_set("zlib_crc32_simd") {
+  visibility = [ ":*" ]
+
+  if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    sources = [
+      "crc32_simd.c",
+      "crc32_simd.h",
+    ]
+
+    if (!is_win || is_clang) {
+      cflags = [
+        "-msse4.2",
+        "-mpclmul",
+      ]
+    }
+  }
+
+  public_configs = [ ":zlib_crc32_simd_config" ]
+}
+
 static_library("zlib_x86_simd") {
+  visibility = [ ":*" ]
+
   if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
     sources = [
       "crc_folding.c",
       "fill_window_sse.c",
     ]
+
     if (!is_win || is_clang) {
       cflags = [
         "-msse4.2",
@@ -176,6 +205,8 @@
   deps = []
 
   if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    deps += [ ":zlib_crc32_simd" ]
+
     deps += [ ":zlib_adler32_simd" ]
     sources += [ "x86.c" ]
 
diff --git a/crc32.c b/crc32.c
index 9162429..b4ad1e1 100644
--- a/crc32.c
+++ b/crc32.c
@@ -30,6 +30,7 @@
 
 #include "deflate.h"
 #include "x86.h"
+#include "crc32_simd.h"
 #include "zutil.h"      /* for STDC and FAR definitions */
 
 /* Definitions for doing the crc four data bytes at a time. */
@@ -241,6 +242,32 @@
     const unsigned char FAR *buf;
     uInt len;
 {
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
+    /*
+     * Use x86 sse4.2+pclmul SIMD to compute the crc32. Since this
+     * routine can be freely used, check the CPU features here, to
+     * stop TSAN complaining about thread data races accessing the
+     * x86_cpu_enable_simd feature variable below.
+     */
+    if (buf == Z_NULL) {
+        if (!len) /* Assume user is calling crc32(0, NULL, 0); */
+            x86_check_features();
+        return 0UL;
+    }
+
+    if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
+        /* crc32 16-byte chunks */
+        uInt chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
+        crc = ~crc32_sse42_simd_(buf, chunk_size, ~(uint32_t)crc);
+        /* check remaining data */
+        len -= chunk_size;
+        if (!len)
+            return crc;
+        /* Fall into the default crc32 for the remaining data. */
+        buf += chunk_size;
+    }
+#endif /* CRC32_SIMD_SSE42_PCLMUL */
+
     return crc32_z(crc, buf, len);
 }
 
diff --git a/crc32_simd.c b/crc32_simd.c
new file mode 100644
index 0000000..c2d4255
--- /dev/null
+++ b/crc32_simd.c
@@ -0,0 +1,157 @@
+/* crc32_simd.c
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#include "crc32_simd.h"
+
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
+
+/*
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc)
+{
+    /*
+     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+     * the CRC32+Barrett polynomials given at the end of the paper.
+     */
+    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+    /*
+     * There's at least one block of 64.
+     */
+    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+
+    x0 = _mm_load_si128((__m128i *)k1k2);
+
+    buf += 64;
+    len -= 64;
+
+    /*
+     * Parallel fold blocks of 64, if any.
+     */
+    while (len >= 64)
+    {
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+
+        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+        x1 = _mm_xor_si128(x1, x5);
+        x2 = _mm_xor_si128(x2, x6);
+        x3 = _mm_xor_si128(x3, x7);
+        x4 = _mm_xor_si128(x4, x8);
+
+        x1 = _mm_xor_si128(x1, y5);
+        x2 = _mm_xor_si128(x2, y6);
+        x3 = _mm_xor_si128(x3, y7);
+        x4 = _mm_xor_si128(x4, y8);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold into 128-bits.
+     */
+    x0 = _mm_load_si128((__m128i *)k3k4);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x2);
+    x1 = _mm_xor_si128(x1, x5);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x3);
+    x1 = _mm_xor_si128(x1, x5);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x4);
+    x1 = _mm_xor_si128(x1, x5);
+
+    /*
+     * Single fold blocks of 16, if any.
+     */
+    while (len >= 16)
+    {
+        x2 = _mm_loadu_si128((__m128i *)buf);
+
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x1 = _mm_xor_si128(x1, x2);
+        x1 = _mm_xor_si128(x1, x5);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+    x3 = _mm_set_epi32(0, ~0, 0, ~0);
+    x1 = _mm_srli_si128(x1, 8);
+    x1 = _mm_xor_si128(x1, x2);
+
+    x0 = _mm_loadl_epi64((__m128i*)k5k0);
+
+    x2 = _mm_srli_si128(x1, 4);
+    x1 = _mm_and_si128(x1, x3);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+
+    /*
+     * Barret reduce to 32-bits.
+     */
+    x0 = _mm_load_si128((__m128i*)poly);
+
+    x2 = _mm_and_si128(x1, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+    x2 = _mm_and_si128(x2, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+
+    /*
+     * Return the crc32.
+     */
+    return _mm_extract_epi32(x1, 1);
+}
+
+#endif  /* CRC32_SIMD_SSE42_PCLMUL */
diff --git a/crc32_simd.h b/crc32_simd.h
new file mode 100644
index 0000000..4e6f326
--- /dev/null
+++ b/crc32_simd.h
@@ -0,0 +1,27 @@
+/* crc32_simd.h
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#include <stdint.h>
+
+#include "zconf.h"
+#include "zutil.h"
+
+/*
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16.
+ */
+uint32_t ZLIB_INTERNAL crc32_sse42_simd_(
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc);
+
+/*
+ * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
+ * for computing the crc32 of an arbitrary length buffer.
+ */
+#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
+#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
diff --git a/names.h b/names.h
index c18b90f..55a8a3f 100644
--- a/names.h
+++ b/names.h
@@ -176,4 +176,9 @@
 #define inflate_fast_chunk_ Cr_z_inflate_fast_chunk_
 #endif
 
+#if defined(CRC32_SIMD_SSE42_PCLMUL)
+/* Symbols added by crc32_simd.c */
+#define crc32_sse42_simd_ Cr_z_crc32_sse42_simd_
+#endif
+
 #endif  /* THIRD_PARTY_ZLIB_NAMES_H_ */