Neon-Optimized hash chain

This should help with compression of data, using NEON instructions
(therefore useful for ARMv7/ARMv8).

Average gains were around 4% to 5% in data compression, depending on the
data entropy.

Re-write of a patch contributed to Fedora Core, for reference:
https://src.fedoraproject.org/rpms/zlib/c/25e9802713484882c27c1f979a6610a42414ee13?branch=master

Bug: 863257
Change-Id: I76573d75843d6a63de75d8a9536da98515314543
Reviewed-on: https://chromium-review.googlesource.com/1136940
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Chris Blume <cblume@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#581241}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: ddbbeb05cb3f0a4c27e7a5d5d0305462db373677
diff --git a/BUILD.gn b/BUILD.gn
index 902e287..69b49ba 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -283,6 +283,7 @@
 
       deps += [ ":zlib_inflate_chunk_simd" ]
       sources -= [ "inflate.c" ]
+      sources += [ "contrib/optimizations/slide_hash_neon.h" ]
     }
   }
 
diff --git a/contrib/optimizations/slide_hash_neon.h b/contrib/optimizations/slide_hash_neon.h
new file mode 100644
index 0000000..26995d7
--- /dev/null
+++ b/contrib/optimizations/slide_hash_neon.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+#ifndef __SLIDE_HASH__NEON__
+#define __SLIDE_HASH__NEON__
+
+#include "deflate.h"
+#include <arm_neon.h>
+
+inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
+                                                        const uInt hash_size,
+                                                        const ush w_size)
+{
+   /* NEON 'Q' registers allow to store 128 bits, so we can load 8x16-bits
+     * values. For further details, check:
+     * ARM DHT 0002A, section 1.3.2 NEON Registers.
+     */
+    const size_t chunk = sizeof(uint16x8_t) / sizeof(uint16_t);
+    /* Unrolling the operation yielded a compression performance boost in both
+     * ARMv7 (from 11.7% to 13.4%) and ARMv8 (from 3.7% to 7.5%) for HTML4
+     * content. For full benchmarking data, check: http://crbug.com/863257.
+     */
+    const size_t stride = 2*chunk;
+    const uint16x8_t v = vdupq_n_u16(w_size);
+
+    for (Posf *end = hash + hash_size; hash != end; hash += stride) {
+        uint16x8_t m_low = vld1q_u16(hash);
+        uint16x8_t m_high = vld1q_u16(hash + chunk);
+
+        /* The first 'q' in vqsubq_u16 makes these subtracts saturate to zero,
+         * replacing the ternary operator expression in the original code:
+         * (m >= wsize ? m - wsize : NIL).
+         */
+        m_low = vqsubq_u16(m_low, v);
+        m_high = vqsubq_u16(m_high, v);
+
+        vst1q_u16(hash, m_low);
+        vst1q_u16(hash + chunk, m_high);
+    }
+}
+
+
+inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
+                                                 const unsigned short w_size,
+                                                 const uInt hash_size)
+{
+    /*
+     * SIMD implementation for hash table rebase assumes:
+     * 1. hash chain offset (Pos) is 2 bytes.
+     * 2. hash table size is multiple of 32 bytes.
+     * #1 should be true as Pos is defined as "ush"
+     * #2 should be true as hash_bits are greater than 7
+     */
+    const size_t size = hash_size * sizeof(head[0]);
+    Assert(sizeof(Pos) == 2, "Wrong Pos size.");
+    Assert((size % sizeof(uint16x8_t) * 2) == 0, "Hash table size error.");
+
+    neon_slide_hash_update(head, hash_size, w_size);
+#ifndef FASTEST
+    neon_slide_hash_update(prev, w_size, w_size);
+#endif
+}
+
+#endif
diff --git a/deflate.c b/deflate.c
index 6fe9c7e..68d75b2 100644
--- a/deflate.c
+++ b/deflate.c
@@ -51,6 +51,9 @@
 #include <assert.h>
 #include "deflate.h"
 #include "x86.h"
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include "contrib/optimizations/slide_hash_neon.h"
+#endif
 
 const char deflate_copyright[] =
    " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
@@ -226,6 +229,10 @@
 local void slide_hash(s)
     deflate_state *s;
 {
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+    /* NEON based hash table rebase. */
+    return neon_slide_hash(s->head, s->prev, s->w_size, s->hash_size);
+#endif
     unsigned n, m;
     Posf *p;
     uInt wsize = s->w_size;