Improve zlib inflate speed by using SSE2 chunk copy

Using SSE2 chunk copies improves the decoding rate of the PNG 140
corpus by an average of 17%, for a total 37% performance increase
when combined with the SIMD adler32 code (see
https://crbug.com/772870#c3 for details).

Move the ARM-specific code back into the main chunk copy code and
generalize the SIMD parts of chunkset_core() with inline function
calls for ARM NEON and Intel SSE2 devices. This resolves the TODO
in arm/chunkcopy_arm.h, so that file can be deleted.
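
To illustrate the shape this takes, here is a condensed sketch of
the SSE2 flavour; the full version, with all periods and the NEON
helpers, is in the chunkcopy.h diff below, and chunkset_period1 is
a hypothetical name for just the period == 1 case:

  #include <emmintrin.h>

  typedef __m128i z_vec128i_t;

  /* 8-bit int splat: duplicate *src into every byte of the result. */
  static inline z_vec128i_t v_load8_dup(const void* src) {
    return _mm_set1_epi8(*(const char*)src);
  }

  /* Unaligned 128-bit store. */
  static inline void v_store_128(void* out, const z_vec128i_t vec) {
    _mm_storeu_si128((__m128i*)out, vec);
  }

  /* Hypothetical: the period == 1 case of chunkset_core(), written
   * once in terms of the helpers. May write up to 15 bytes past
   * out + len, which the caller permits. */
  static inline unsigned char* chunkset_period1(unsigned char* out,
                                                unsigned len) {
    z_vec128i_t v = v_load8_dup(out - 1);
    int bump = ((len - 1) % sizeof(v)) + 1;
    v_store_128(out, v);
    out += bump;
    len -= bump;
    while (len > 0) {
      v_store_128(out, v);
      out += sizeof(v);
      len -= sizeof(v);
    }
    return out;
  }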

Add SSE2 vector load / store helpers for chunkset_core(). The
existing NEON load code had alignment issues, as noted in review;
fix that by using unaligned loads in the ARM helper code.
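
The fix follows one pattern, sketched here in condensed form from
the chunkcopy.h diff below (plain memcpy stands in for the
Z_BUILTIN_MEMCPY shim): read the scalar with a small fixed-size
memcpy, which the compiler folds into a single unaligned load, and
splat it, rather than pointing a wide dup-load at a possibly
misaligned address:

  #include <arm_neon.h>
  #include <stdint.h>
  #include <string.h>

  typedef uint8x16_t z_vec128i_t;

  /* 32-bit int splat: *src need not be 4-byte aligned. */
  static inline z_vec128i_t v_load32_dup(const void* src) {
    int32_t i32;
    memcpy(&i32, src, sizeof(i32));
    return vreinterpretq_u8_s32(vdupq_n_s32(i32));
  }

  /* 64-bit int splat: vld1_u8() takes no alignment hint, unlike the
   * vld1q_dup_u64() that 32-bit GCC emits with one. */
  static inline z_vec128i_t v_load64_dup(const void* src) {
    return vcombine_u8(vld1_u8(src), vld1_u8(src));
  }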

Change chunkcopy.h to use __builtin_memcpy where it's available,
and zmemcpy otherwise (e.g., on MSVC). Also call
x86_check_features() in inflateInit2_() to keep the adler32 SIMD
code path enabled.
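
For reference, the shim and its main use, condensed (FAR qualifiers
dropped) from the chunkcopy.h diff below:

  #if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
  #define Z_BUILTIN_MEMCPY __builtin_memcpy
  #else
  #define Z_BUILTIN_MEMCPY zmemcpy  /* MSVC has no __builtin_memcpy */
  #endif

  /* Asks the compiler for one wide, unaligned 128-bit load. */
  static inline z_vec128i_t loadchunk(const unsigned char* s) {
    z_vec128i_t v;
    Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
    return v;
  }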

Update BUILD.gn to conditionally compile the SIMD chunk copy code
on Intel SSE2 and ARM NEON devices. Update names.h to add the new
symbol defined by the inflate chunk copy code path.

The code had various comment styles; pick one and use it
consistently everywhere. Add a TODO(cblume) to inffast_chunk.h.

Bug: 772870
Change-Id: I47004c68ee675acf418825fb0e1f8fa8018d4342
Reviewed-on: https://chromium-review.googlesource.com/708834
Commit-Queue: Noel Gordon <noel@chromium.org>
Reviewed-by: Chris Blume <cblume@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#522764}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: c293a3255eb27dee8879f85f2c45dedff58e2452
diff --git a/BUILD.gn b/BUILD.gn
index 2f19a8f..65d4dda 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -10,11 +10,6 @@
 
 config("zlib_config") {
   include_dirs = [ "." ]
-  if (current_cpu == "arm" || current_cpu == "arm64") {
-    if (arm_use_neon) {
-      include_dirs += [ "contrib/optimizations/arm" ]
-    }
-  }
 }
 
 config("zlib_adler32_simd_config") {
@@ -64,6 +59,57 @@
   public_configs = [ ":zlib_adler32_simd_config" ]
 }
 
+config("zlib_inflate_chunk_simd_config") {
+  if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    defines = [ "INFLATE_CHUNK_SIMD_SSE2" ]
+  }
+
+  if (current_cpu == "arm" || current_cpu == "arm64") {
+    if (arm_use_neon) {
+      defines = [ "INFLATE_CHUNK_SIMD_NEON" ]
+    }
+  }
+}
+
+source_set("zlib_inflate_chunk_simd") {
+  visibility = [ ":*" ]
+
+  if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    include_dirs = [ "." ]
+
+    sources = [
+      "contrib/optimizations/chunkcopy.h",
+      "contrib/optimizations/inffast_chunk.c",
+      "contrib/optimizations/inffast_chunk.h",
+      "contrib/optimizations/inflate.c",
+    ]
+  }
+
+  if (current_cpu == "arm" || current_cpu == "arm64") {
+    if (arm_use_neon) {
+      include_dirs = [ "." ]
+
+      sources = [
+        "contrib/optimizations/chunkcopy.h",
+        "contrib/optimizations/inffast_chunk.c",
+        "contrib/optimizations/inffast_chunk.h",
+        "contrib/optimizations/inflate.c",
+      ]
+
+      if (!is_debug) {
+        # Use optimize_speed (-O3) to output the _smallest_ code.
+        configs -= [ "//build/config/compiler:default_optimization" ]
+        configs += [ "//build/config/compiler:optimize_speed" ]
+      }
+    }
+  }
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+
+  public_configs = [ ":zlib_inflate_chunk_simd_config" ]
+}
+
 static_library("zlib_x86_simd") {
   if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
     sources = [
@@ -129,31 +175,23 @@
     "zutil.h",
   ]
 
-  if (current_cpu == "arm" || current_cpu == "arm64") {
-    if (arm_use_neon) {
-      sources -= [ "inflate.c" ]
-      sources += [
-        "contrib/optimizations/arm/chunkcopy_arm.h",
-        "contrib/optimizations/chunkcopy.h",
-        "contrib/optimizations/inffast_chunky.c",
-        "contrib/optimizations/inffast_chunky.h",
-        "contrib/optimizations/inflate.c",
-      ]
-    }
-  }
-
   defines = []
   deps = []
 
   if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
+    deps += [ ":zlib_adler32_simd" ]
     sources += [ "x86.c" ]
 
-    deps += [ ":zlib_adler32_simd" ]
+    deps += [ ":zlib_inflate_chunk_simd" ]
+    sources -= [ "inflate.c" ]
   }
 
   if (current_cpu == "arm" || current_cpu == "arm64") {
     if (arm_use_neon) {
       deps += [ ":zlib_adler32_simd" ]
+
+      deps += [ ":zlib_inflate_chunk_simd" ]
+      sources -= [ "inflate.c" ]
     }
   }
 
diff --git a/contrib/optimizations/arm/chunkcopy_arm.h b/contrib/optimizations/arm/chunkcopy_arm.h
deleted file mode 100644
index 41474c8..0000000
--- a/contrib/optimizations/arm/chunkcopy_arm.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* chunkcopy_arm.h -- fast copies and sets
- * Copyright (C) 2017 ARM, Inc.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef CHUNKCOPY_ARM_H
-#define CHUNKCOPY_ARM_H
-
-#include <arm_neon.h>
-#include "zutil.h"
-
-#if __STDC_VERSION__ >= 199901L
-#define Z_RESTRICT restrict
-#else
-#define Z_RESTRICT
-#endif
-
-/* A port to a new arch only requires to implement 2 functions
-  (vld_dup and chunkset_core) and the chunk type.
-*/
-
-typedef uint8x16_t chunkcopy_chunk_t;
-#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t)
-
-/* Forward declarations. */
-static inline unsigned char FAR* chunkunroll_relaxed(unsigned char FAR* out,
-                                                     unsigned FAR* dist,
-                                                     unsigned FAR* len);
-
-static inline unsigned char FAR* chunkcopy_core(unsigned char FAR* out,
-                                                const unsigned char FAR* from,
-                                                unsigned len);
-
-/* Architecture specific code starts here. */
-static inline uint8x16_t chunkset_vld1q_dup_u8x8(
-    const unsigned char FAR* Z_RESTRICT from) {
-#if defined(__clang__) || defined(__aarch64__)
-  return vreinterpretq_u8_u64(vld1q_dup_u64((void*)from));
-#else
-  /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a
-   * void pointer, so here's an alternate implementation.
-   */
-  uint8x8_t h = vld1_u8(from);
-  return vcombine_u8(h, h);
-#endif
-}
-
-/*
-   Perform an overlapping copy which behaves as a memset() operation, but
-   supporting periods other than one, and assume that length is non-zero and
-   that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
-   even if the length is shorter than this.
-   TODO(cavalcantii): maybe rename vreinterpretq and chunkset_vld to make it
-                      generic and move this code to chunkcopy.h (plus we
-                      won't need the forward declarations).
- */
-static inline unsigned char FAR* chunkset_core(unsigned char FAR* out,
-                                               unsigned period,
-                                               unsigned len) {
-  uint8x16_t f;
-  int bump = ((len - 1) % sizeof(f)) + 1;
-
-  switch (period) {
-    case 1:
-      f = vld1q_dup_u8(out - 1);
-      vst1q_u8(out, f);
-      out += bump;
-      len -= bump;
-      while (len > 0) {
-        vst1q_u8(out, f);
-        out += sizeof(f);
-        len -= sizeof(f);
-      }
-      return out;
-    case 2:
-      f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2)));
-      vst1q_u8(out, f);
-      out += bump;
-      len -= bump;
-      if (len > 0) {
-        f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2)));
-        do {
-          vst1q_u8(out, f);
-          out += sizeof(f);
-          len -= sizeof(f);
-        } while (len > 0);
-      }
-      return out;
-    case 4:
-      f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4)));
-      vst1q_u8(out, f);
-      out += bump;
-      len -= bump;
-      if (len > 0) {
-        f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4)));
-        do {
-          vst1q_u8(out, f);
-          out += sizeof(f);
-          len -= sizeof(f);
-        } while (len > 0);
-      }
-      return out;
-    case 8:
-      f = chunkset_vld1q_dup_u8x8(out - 8);
-      vst1q_u8(out, f);
-      out += bump;
-      len -= bump;
-      if (len > 0) {
-        f = chunkset_vld1q_dup_u8x8(out - 8);
-        do {
-          vst1q_u8(out, f);
-          out += sizeof(f);
-          len -= sizeof(f);
-        } while (len > 0);
-      }
-      return out;
-  }
-  out = chunkunroll_relaxed(out, &period, &len);
-  return chunkcopy_core(out, out - period, len);
-}
-
-#endif /* CHUNKCOPY_ARM_H */
diff --git a/contrib/optimizations/chunkcopy.h b/contrib/optimizations/chunkcopy.h
index 2080643..2988fb0 100644
--- a/contrib/optimizations/chunkcopy.h
+++ b/contrib/optimizations/chunkcopy.h
@@ -1,50 +1,90 @@
-/* chunkcopy.h -- fast copies and sets
+/* chunkcopy.h -- fast chunk copy and set operations
  * Copyright (C) 2017 ARM, Inc.
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
 #ifndef CHUNKCOPY_H
 #define CHUNKCOPY_H
 
-// TODO(cavalcantii): add the Intel code next.
-#include "chunkcopy_arm.h"
+#include <stdint.h>
+#include "zutil.h"
+
+#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]
+
+#if __STDC_VERSION__ >= 199901L
+#define Z_RESTRICT restrict
+#else
+#define Z_RESTRICT
+#endif
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
+#define Z_BUILTIN_MEMCPY __builtin_memcpy
+#else
+#define Z_BUILTIN_MEMCPY zmemcpy
+#endif
+
+#if defined(INFLATE_CHUNK_SIMD_NEON)
+#include <arm_neon.h>
+typedef uint8x16_t z_vec128i_t;
+#elif defined(INFLATE_CHUNK_SIMD_SSE2)
+#include <emmintrin.h>
+typedef __m128i z_vec128i_t;
+#else
+#error chunkcopy.h inflate chunk SIMD is not defined for your build target
+#endif
 
 /*
-   Ask the compiler to perform a wide, unaligned load with an machine
-   instruction appropriate for the chunkcopy_chunk_t type.
+ * chunk copy type: the z_vec128i_t type size should be exactly 128 bits
+ * and equal to CHUNKCOPY_CHUNK_SIZE.
  */
-static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR* s) {
-  chunkcopy_chunk_t c;
-  __builtin_memcpy(&c, s, sizeof(c));
-  return c;
+#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)
+
+Z_STATIC_ASSERT(vector_128_bits_wide,
+                CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);
+
+/*
+ * Ask the compiler to perform a wide, unaligned load with a machine
+ * instruction appropriate for the z_vec128i_t type.
+ */
+static inline z_vec128i_t loadchunk(
+    const unsigned char FAR* s) {
+  z_vec128i_t v;
+  Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
+  return v;
 }
 
 /*
-   Ask the compiler to perform a wide, unaligned store with an machine
-   instruction appropriate for the chunkcopy_chunk_t type.
+ * Ask the compiler to perform a wide, unaligned store with a machine
+ * instruction appropriate for the z_vec128i_t type.
  */
-static inline void storechunk(unsigned char FAR* d, chunkcopy_chunk_t c) {
-  __builtin_memcpy(d, &c, sizeof(c));
+static inline void storechunk(
+    unsigned char FAR* d,
+    const z_vec128i_t v) {
+  Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
 }
 
 /*
-   Perform a memcpy-like operation, but assume that length is non-zero and that
-   it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
-   the length is shorter than this.
-
-   It also guarantees that it will properly unroll the data if the distance
-   between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
-   in chunkcopy_relaxed().
-
-   Aside from better memory bus utilisation, this means that short copies
-   (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
-   without iteration, which will hopefully make the branch prediction more
-   reliable.
+ * Perform a memcpy-like operation, assuming that length is non-zero and that
+ * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
+ * the length is shorter than this.
+ *
+ * It also guarantees that it will properly unroll the data if the distance
+ * between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
+ * in chunkcopy_relaxed().
+ *
+ * Aside from better memory bus utilisation, this means that short copies
+ * (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
+ * without iteration, which will hopefully make the branch prediction more
+ * reliable.
  */
-static inline unsigned char FAR* chunkcopy_core(unsigned char FAR* out,
-                                                const unsigned char FAR* from,
-                                                unsigned len) {
-  int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
+static inline unsigned char FAR* chunkcopy_core(
+    unsigned char FAR* out,
+    const unsigned char FAR* from,
+    unsigned len) {
+  const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
   storechunk(out, loadchunk(from));
   out += bump;
   from += bump;
@@ -58,12 +98,12 @@
 }
 
 /*
-   Like chunkcopy_core, but avoid writing beyond of legal output.
-
-   Accepts an additional pointer to the end of safe output.  A generic safe
-   copy would use (out + len), but it's normally the case that the end of the
-   output buffer is beyond the end of the current copy, and this can still be
-   exploited.
+ * Like chunkcopy_core(), but avoid writing beyond the end of legal output.
+ *
+ * Accepts an additional pointer to the end of safe output.  A generic safe
+ * copy would use (out + len), but it's normally the case that the end of the
+ * output buffer is beyond the end of the current copy, and this can still be
+ * exploited.
  */
 static inline unsigned char FAR* chunkcopy_core_safe(
     unsigned char FAR* out,
@@ -74,17 +114,17 @@
   if (limit - out < CHUNKCOPY_CHUNK_SIZE) {
     const unsigned char FAR* Z_RESTRICT rfrom = from;
     if (len & 8) {
-      __builtin_memcpy(out, rfrom, 8);
+      Z_BUILTIN_MEMCPY(out, rfrom, 8);
       out += 8;
       rfrom += 8;
     }
     if (len & 4) {
-      __builtin_memcpy(out, rfrom, 4);
+      Z_BUILTIN_MEMCPY(out, rfrom, 4);
       out += 4;
       rfrom += 4;
     }
     if (len & 2) {
-      __builtin_memcpy(out, rfrom, 2);
+      Z_BUILTIN_MEMCPY(out, rfrom, 2);
       out += 2;
       rfrom += 2;
     }
@@ -97,18 +137,19 @@
 }
 
 /*
-   Perform short copies until distance can be rewritten as being at least
-   CHUNKCOPY_CHUNK_SIZE.
-
-   This assumes that it's OK to overwrite at least the first
-   2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than
-   this.  This assumption holds within inflate_fast() which starts every
-   iteration with at least 258 bytes of output space available (258 being the
-   maximum length output from a single token; see inffast.c).
+ * Perform short copies until distance can be rewritten as being at least
+ * CHUNKCOPY_CHUNK_SIZE.
+ *
+ * Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
+ * bytes of output even if the copy is shorter than this.  This assumption
+ * holds within zlib inflate_fast(), which starts every iteration with at
+ * least 258 bytes of output space available (258 being the maximum length
+ * output from a single token; see inffast.c).
  */
-static inline unsigned char FAR* chunkunroll_relaxed(unsigned char FAR* out,
-                                                     unsigned FAR* dist,
-                                                     unsigned FAR* len) {
+static inline unsigned char FAR* chunkunroll_relaxed(
+    unsigned char FAR* out,
+    unsigned FAR* dist,
+    unsigned FAR* len) {
   const unsigned char FAR* from = out - *dist;
   while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
     storechunk(out, loadchunk(from));
@@ -119,15 +160,180 @@
   return out;
 }
 
+#if defined(INFLATE_CHUNK_SIMD_NEON)
 /*
-   Perform a memcpy-like operation, but assume that length is non-zero and that
-   it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
-   the length is shorter than this.
+ * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
+ * every 64-bit component of the 128-bit result (64-bit int splat).
+ */
+static inline z_vec128i_t v_load64_dup(const void* src) {
+  return vcombine_u8(vld1_u8(src), vld1_u8(src));
+}
 
-   Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
-   of overlapping buffers, regardless of the distance between the pointers.
-   This is reflected in the `restrict`-qualified pointers, allowing the
-   compiler to reorder loads and stores.
+/*
+ * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
+ * every 32-bit component of the 128-bit result (32-bit int splat).
+ */
+static inline z_vec128i_t v_load32_dup(const void* src) {
+  int32_t i32;
+  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
+  return vreinterpretq_u8_s32(vdupq_n_s32(i32));
+}
+
+/*
+ * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
+ * every 16-bit component of the 128-bit result (16-bit int splat).
+ */
+static inline z_vec128i_t v_load16_dup(const void* src) {
+  int16_t i16;
+  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
+  return vreinterpretq_u8_s16(vdupq_n_s16(i16));
+}
+
+/*
+ * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
+ * component of the 128-bit result (8-bit int splat).
+ */
+static inline z_vec128i_t v_load8_dup(const void* src) {
+  return vld1q_dup_u8((const uint8_t*)src);
+}
+
+/*
+ * v_store_128(): store the 128-bit vec to the memory destination
+ * void* out, which might not be 16-byte aligned.
+ */
+static inline void v_store_128(void* out, const z_vec128i_t vec) {
+  vst1q_u8(out, vec);
+}
+
+#elif defined(INFLATE_CHUNK_SIMD_SSE2)
+/*
+ * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
+ * every 64-bit component of the 128-bit result (64-bit int splat).
+ */
+static inline z_vec128i_t v_load64_dup(const void* src) {
+  int64_t i64;
+  Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
+  return _mm_set1_epi64x(i64);
+}
+
+/*
+ * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
+ * every 32-bit component of the 128-bit result (32-bit int splat).
+ */
+static inline z_vec128i_t v_load32_dup(const void* src) {
+  int32_t i32;
+  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
+  return _mm_set1_epi32(i32);
+}
+
+/*
+ * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
+ * every 16-bit component of the 128-bit result (16-bit int splat).
+ */
+static inline z_vec128i_t v_load16_dup(const void* src) {
+  int16_t i16;
+  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
+  return _mm_set1_epi16(i16);
+}
+
+/*
+ * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
+ * component of the 128-bit result (8-bit int splat).
+ */
+static inline z_vec128i_t v_load8_dup(const void* src) {
+  return _mm_set1_epi8(*(const char*)src);
+}
+
+/*
+ * v_store_128(): store the 128-bit vec to the memory destination
+ * void* out, which might not be 16-byte aligned.
+ */
+static inline void v_store_128(void* out, const z_vec128i_t vec) {
+  _mm_storeu_si128((__m128i*)out, vec);
+}
+#endif
+
+/*
+ * Perform an overlapping copy which behaves as a memset() operation, but
+ * supporting periods other than one, and assume that length is non-zero and
+ * that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
+ * even if the length is shorter than this.
+ */
+static inline unsigned char FAR* chunkset_core(
+    unsigned char FAR* out,
+    unsigned period,
+    unsigned len) {
+  z_vec128i_t v;
+  const int bump = ((len - 1) % sizeof(v)) + 1;
+
+  switch (period) {
+    case 1:
+      v = v_load8_dup(out - 1);
+      v_store_128(out, v);
+      out += bump;
+      len -= bump;
+      while (len > 0) {
+        v_store_128(out, v);
+        out += sizeof(v);
+        len -= sizeof(v);
+      }
+      return out;
+    case 2:
+      v = v_load16_dup(out - 2);
+      v_store_128(out, v);
+      out += bump;
+      len -= bump;
+      if (len > 0) {
+        v = v_load16_dup(out - 2);
+        do {
+          v_store_128(out, v);
+          out += sizeof(v);
+          len -= sizeof(v);
+        } while (len > 0);
+      }
+      return out;
+    case 4:
+      v = v_load32_dup(out - 4);
+      v_store_128(out, v);
+      out += bump;
+      len -= bump;
+      if (len > 0) {
+        v = v_load32_dup(out - 4);
+        do {
+          v_store_128(out, v);
+          out += sizeof(v);
+          len -= sizeof(v);
+        } while (len > 0);
+      }
+      return out;
+    case 8:
+      v = v_load64_dup(out - 8);
+      v_store_128(out, v);
+      out += bump;
+      len -= bump;
+      if (len > 0) {
+        v = v_load64_dup(out - 8);
+        do {
+          v_store_128(out, v);
+          out += sizeof(v);
+          len -= sizeof(v);
+        } while (len > 0);
+      }
+      return out;
+  }
+  out = chunkunroll_relaxed(out, &period, &len);
+  return chunkcopy_core(out, out - period, len);
+}
+
+/*
+ * Perform a memcpy-like operation, but assume that length is non-zero and that
+ * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
+ * the length is shorter than this.
+ *
+ * Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
+ * of overlapping buffers, regardless of the distance between the pointers.
+ * This is reflected in the `restrict`-qualified pointers, allowing the
+ * compiler to re-order loads and stores.
  */
 static inline unsigned char FAR* chunkcopy_relaxed(
     unsigned char FAR* Z_RESTRICT out,
@@ -137,17 +343,17 @@
 }
 
 /*
-   Like chunkcopy_relaxed, but avoid writing beyond of legal output.
-
-   Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
-   behaviour of overlapping buffers, regardless of the distance between the
-   pointers.  This is reflected in the `restrict`-qualified pointers, allowing
-   the compiler to reorder loads and stores.
-
-   Accepts an additional pointer to the end of safe output.  A generic safe
-   copy would use (out + len), but it's normally the case that the end of the
-   output buffer is beyond the end of the current copy, and this can still be
-   exploited.
+ * Like chunkcopy_relaxed(), but avoid writing beyond the end of legal output.
+ *
+ * Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
+ * behaviour of overlapping buffers, regardless of the distance between the
+ * pointers.  This is reflected in the `restrict`-qualified pointers, allowing
+ * the compiler to re-order loads and stores.
+ *
+ * Accepts an additional pointer to the end of safe output.  A generic safe
+ * copy would use (out + len), but it's normally the case that the end of the
+ * output buffer is beyond the end of the current copy, and this can still be
+ * exploited.
  */
 static inline unsigned char FAR* chunkcopy_safe(
     unsigned char FAR* out,
@@ -159,14 +365,16 @@
 }
 
 /*
-   Perform chunky copy within the same buffer, where the source and destination
-   may potentially overlap.
-
-   Assumes that len > 0 on entry, and that it's safe to write at least
-   CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
+ * Perform chunky copy within the same buffer, where the source and destination
+ * may potentially overlap.
+ *
+ * Assumes that len > 0 on entry, and that it's safe to write at least
+ * CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
  */
-static inline unsigned char FAR*
-chunkcopy_lapped_relaxed(unsigned char FAR* out, unsigned dist, unsigned len) {
+static inline unsigned char FAR* chunkcopy_lapped_relaxed(
+    unsigned char FAR* out,
+    unsigned dist,
+    unsigned len) {
   if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
     return chunkset_core(out, dist, len);
   }
@@ -174,13 +382,13 @@
 }
 
 /*
-   Behave like chunkcopy_lapped_relaxed, but avoid writing beyond of legal
-   output.
-
-   Accepts an additional pointer to the end of safe output.  A generic safe
-   copy would use (out + len), but it's normally the case that the end of the
-   output buffer is beyond the end of the current copy, and this can still be
-   exploited.
+ * Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond the
+ * end of legal output.
+ *
+ * Accepts an additional pointer to the end of safe output.  A generic safe
+ * copy would use (out + len), but it's normally the case that the end of the
+ * output buffer is beyond the end of the current copy, and this can still be
+ * exploited.
  */
 static inline unsigned char FAR* chunkcopy_lapped_safe(
     unsigned char FAR* out,
@@ -199,6 +407,8 @@
   return chunkcopy_lapped_relaxed(out, dist, len);
 }
 
+#undef Z_STATIC_ASSERT
 #undef Z_RESTRICT
+#undef Z_BUILTIN_MEMCPY
 
 #endif /* CHUNKCOPY_H */
diff --git a/contrib/optimizations/inffast_chunky.c b/contrib/optimizations/inffast_chunk.c
similarity index 98%
rename from contrib/optimizations/inffast_chunky.c
rename to contrib/optimizations/inffast_chunk.c
index e2bc735..4829d0d 100644
--- a/contrib/optimizations/inffast_chunky.c
+++ b/contrib/optimizations/inffast_chunk.c
@@ -6,7 +6,7 @@
 #include "zutil.h"
 #include "inftrees.h"
 #include "inflate.h"
-#include "contrib/optimizations/inffast_chunky.h"
+#include "contrib/optimizations/inffast_chunk.h"
 #include "contrib/optimizations/chunkcopy.h"
 
 #ifdef ASMINF
@@ -52,7 +52,7 @@
       requires strm->avail_out >= 258 for each loop to avoid checking for
       output space.
  */
-void ZLIB_INTERNAL inflate_fast_chunky(strm, start)
+void ZLIB_INTERNAL inflate_fast_chunk_(strm, start)
 z_streamp strm;
 unsigned start;         /* inflate()'s starting value for strm->avail_out */
 {
diff --git a/contrib/optimizations/inffast_chunky.h b/contrib/optimizations/inffast_chunk.h
similarity index 65%
rename from contrib/optimizations/inffast_chunky.h
rename to contrib/optimizations/inffast_chunk.h
index 7f033f2..80636e7 100644
--- a/contrib/optimizations/inffast_chunky.h
+++ b/contrib/optimizations/inffast_chunk.h
@@ -9,4 +9,7 @@
    subject to change. Applications should only use zlib.h.
  */
 
-void ZLIB_INTERNAL inflate_fast_chunky OF((z_streamp strm, unsigned start));
+// TODO(cblume): incorporate the patch done on crbug.com/764431 here and
+// in related files to define and use INFLATE_FAST_MIN_HAVE/_LEFT etc.
+
+void ZLIB_INTERNAL inflate_fast_chunk_ OF((z_streamp strm, unsigned start));
diff --git a/contrib/optimizations/inflate.c b/contrib/optimizations/inflate.c
index 152f174..d6c5614 100644
--- a/contrib/optimizations/inflate.c
+++ b/contrib/optimizations/inflate.c
@@ -83,8 +83,9 @@
 #include "zutil.h"
 #include "inftrees.h"
 #include "inflate.h"
-#include "contrib/optimizations/inffast_chunky.h"
+#include "contrib/optimizations/inffast_chunk.h"
 #include "contrib/optimizations/chunkcopy.h"
+#include "x86.h"
 
 #ifdef MAKEFIXED
 #  ifndef BUILDFIXED
@@ -202,6 +203,8 @@
     int ret;
     struct inflate_state FAR *state;
 
+    x86_check_features();
+
     if (version == Z_NULL || version[0] != ZLIB_VERSION[0] ||
         stream_size != (int)(sizeof(z_stream)))
         return Z_VERSION_ERROR;
@@ -419,7 +422,7 @@
            and is subsequently either overwritten or left deliberately
            undefined at the end of decode; so there's really no point.
          */
-        memset(state->window + wsize, 0, CHUNKCOPY_CHUNK_SIZE);
+        zmemzero(state->window + wsize, CHUNKCOPY_CHUNK_SIZE);
 #endif
     }
 
@@ -1056,7 +1059,7 @@
         case LEN:
             if (have >= 6 && left >= 258) {
                 RESTORE();
-                inflate_fast_chunky(strm, out);
+                inflate_fast_chunk_(strm, out);
                 LOAD();
                 if (state->mode == TYPE)
                     state->back = -1;
diff --git a/names.h b/names.h
index cd98ec9..c18b90f 100644
--- a/names.h
+++ b/names.h
@@ -171,4 +171,9 @@
 #define adler32_simd_ Cr_z_adler32_simd_
 #endif
 
+#if defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_SIMD_NEON)
+/* Symbols added by contrib/optimizations/inffast_chunk */
+#define inflate_fast_chunk_ Cr_z_inflate_fast_chunk_
+#endif
+
 #endif  /* THIRD_PARTY_ZLIB_NAMES_H_ */