Neon-Optimized hash chain
This should help with compression of data, using NEON instructions
(therefore useful for ARMv7/ARMv8).
Average gains were around 4% to 5% in data compression, depending on the
data entropy.
Re-write of a patch contributed to Fedora Core, for reference:
https://src.fedoraproject.org/rpms/zlib/c/25e9802713484882c27c1f979a6610a42414ee13?branch=master
Bug: 863257
Change-Id: I76573d75843d6a63de75d8a9536da98515314543
Reviewed-on: https://chromium-review.googlesource.com/1136940
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Chris Blume <cblume@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#581241}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: ddbbeb05cb3f0a4c27e7a5d5d0305462db373677
diff --git a/BUILD.gn b/BUILD.gn
index 902e287..69b49ba 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -283,6 +283,7 @@
deps += [ ":zlib_inflate_chunk_simd" ]
sources -= [ "inflate.c" ]
+ sources += [ "contrib/optimizations/slide_hash_neon.h" ]
}
}
diff --git a/contrib/optimizations/slide_hash_neon.h b/contrib/optimizations/slide_hash_neon.h
new file mode 100644
index 0000000..26995d7
--- /dev/null
+++ b/contrib/optimizations/slide_hash_neon.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+#ifndef __SLIDE_HASH__NEON__
+#define __SLIDE_HASH__NEON__
+
+#include "deflate.h"
+#include <arm_neon.h>
+
+inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
+ const uInt hash_size,
+ const ush w_size)
+{
+ /* NEON 'Q' registers allow to store 128 bits, so we can load 8x16-bits
+ * values. For further details, check:
+ * ARM DHT 0002A, section 1.3.2 NEON Registers.
+ */
+ const size_t chunk = sizeof(uint16x8_t) / sizeof(uint16_t);
+ /* Unrolling the operation yielded a compression performance boost in both
+ * ARMv7 (from 11.7% to 13.4%) and ARMv8 (from 3.7% to 7.5%) for HTML4
+ * content. For full benchmarking data, check: http://crbug.com/863257.
+ */
+ const size_t stride = 2*chunk;
+ const uint16x8_t v = vdupq_n_u16(w_size);
+
+ for (Posf *end = hash + hash_size; hash != end; hash += stride) {
+ uint16x8_t m_low = vld1q_u16(hash);
+ uint16x8_t m_high = vld1q_u16(hash + chunk);
+
+ /* The first 'q' in vqsubq_u16 makes these subtracts saturate to zero,
+ * replacing the ternary operator expression in the original code:
+ * (m >= wsize ? m - wsize : NIL).
+ */
+ m_low = vqsubq_u16(m_low, v);
+ m_high = vqsubq_u16(m_high, v);
+
+ vst1q_u16(hash, m_low);
+ vst1q_u16(hash + chunk, m_high);
+ }
+}
+
+
+inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
+ const unsigned short w_size,
+ const uInt hash_size)
+{
+ /*
+ * SIMD implementation for hash table rebase assumes:
+ * 1. hash chain offset (Pos) is 2 bytes.
+ * 2. hash table size is multiple of 32 bytes.
+ * #1 should be true as Pos is defined as "ush"
+ * #2 should be true as hash_bits are greater than 7
+ */
+ const size_t size = hash_size * sizeof(head[0]);
+ Assert(sizeof(Pos) == 2, "Wrong Pos size.");
+ Assert((size % sizeof(uint16x8_t) * 2) == 0, "Hash table size error.");
+
+ neon_slide_hash_update(head, hash_size, w_size);
+#ifndef FASTEST
+ neon_slide_hash_update(prev, w_size, w_size);
+#endif
+}
+
+#endif
diff --git a/deflate.c b/deflate.c
index 6fe9c7e..68d75b2 100644
--- a/deflate.c
+++ b/deflate.c
@@ -51,6 +51,9 @@
#include <assert.h>
#include "deflate.h"
#include "x86.h"
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include "contrib/optimizations/slide_hash_neon.h"
+#endif
const char deflate_copyright[] =
" deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
@@ -226,6 +229,10 @@
local void slide_hash(s)
deflate_state *s;
{
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+ /* NEON based hash table rebase. */
+ return neon_slide_hash(s->head, s->prev, s->w_size, s->hash_size);
+#endif
unsigned n, m;
Posf *p;
uInt wsize = s->w_size;