Compute crc32 using ARMv8 specific instruction

CRC32 affects performance both for image decoding (PNG) and for general
browsing of websites that serve compressed content
(i.e. Content-Encoding: gzip).

This patch implements an optimized CRC32 function using the
dedicated instruction available in ARMv8a. We only support
ARM Little-Endian (LE).

This instruction is available in new Android devices featuring an
ARMv8 SoC, like the Nexus 5X and Google Pixel. It should be between
3x (A72) and 7x (A53) faster than the C implementation currently used
by zlib for 8KB vectors.

This is performance-critical code that can be called with either large (8KB)
or small vectors, so we must avoid extraneous function calls and
branching (otherwise the performance benefits are negated). Hence the use
of 'public' variables to read the CPU feature status flags
(i.e. arm_cpu_enable_crc32 and arm_cpu_enable_pmull).

Finally, it also introduces code to perform run-time ARM CPU feature
detection on the supported platforms: Android and Linux/CrOS. We build
and link the CRC32 instruction dependent code, but only decide to use it
at run-time if the ARM CPU supports the CRC32 instruction. Otherwise,
we fall back to using zlib's default C implementation.

This approach allows the instruction to be used in both 32-bit and 64-bit
builds and works fine on either ARMv7 or ARMv8 processors. I tested the
generated Chrome apk on both a Nexus 6 (ARMv7) and a Google Pixel (ARMv8).

The crc32 function benefited from input from Yang Zang and Mike Klein,
while the arm_features benefited from input from Noel Gordon.

Bug: 709716
Change-Id: I315c1216f8b3a8d88607630a28737c41f52a2f5d
Reviewed-on: https://chromium-review.googlesource.com/801108
Reviewed-by: Chris Blume <cblume@chromium.org>
Reviewed-by: Noel Gordon <noel@chromium.org>
Commit-Queue: Noel Gordon <noel@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#537179}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: 28c9623083688b3a354c33bf77746f4c51f58826
diff --git a/BUILD.gn b/BUILD.gn
index d070e3c..789ad62 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -54,6 +54,63 @@
   public_configs = [ ":zlib_adler32_simd_config" ]
 }
 
+config("zlib_arm_crc32_config") {
+  if (current_cpu == "arm" || current_cpu == "arm64") {
+    # Restrictions:
+    #  - Disabled for iPhones, as described in DDI0487C_a_armv8_arm:
+    #     "All implementations of the ARMv8.1 architecture are required to
+    #      implement the CRC32* instructions. These are optional in ARMv8.0."
+    #  - ChromeOS has wrapper scripts that are borking the compiler flags.
+    #  - Fuchsia just added a syscall for feature detection.
+    # TODO(cavalcantii): crbug.com/810125.
+    if (!is_ios && !is_chromeos && !is_fuchsia) {
+      defines = [ "CRC32_ARMV8_CRC32" ]  # Selects the ARMv8 code paths in crc32.c and crc32_simd.c.
+      if (is_android) {
+        defines += [ "ARMV8_OS_ANDROID" ]  # arm_features.c then uses <cpu-features.h>.
+      } else if (is_linux || is_chromeos) {  # NOTE(review): is_chromeos is always false here because of the enclosing !is_chromeos guard.
+        defines += [ "ARMV8_OS_LINUX" ]  # arm_features.c then uses getauxval().
+      }
+    }
+  }
+}
+
+if (current_cpu == "arm" || current_cpu == "arm64") {
+  source_set("zlib_arm_crc32") {
+    visibility = [ ":*" ]  # Restricts dependents to targets declared in this BUILD.gn file.
+
+    if (!is_ios && !is_chromeos && !is_fuchsia) {  # Same platform restrictions as zlib_arm_crc32_config.
+      include_dirs = [ "." ]
+
+      if (is_android) {
+        import("//build/config/android/config.gni")
+        if (defined(android_ndk_root) && android_ndk_root != "") {  # Only depend on cpu_features when an NDK root is configured.
+          deps = [
+            "//third_party/android_tools:cpu_features",
+          ]
+        }
+      }
+
+      sources = [
+        "arm_features.c",
+        "arm_features.h",
+        "crc32_simd.c",
+        "crc32_simd.h",
+      ]
+
+      if (!is_win || is_clang) {
+        cflags_c = [ "-march=armv8-a+crc" ]  # Lets the compiler accept the CRC32 intrinsics used in crc32_simd.c.
+      }
+
+      if (!is_debug) {  # Non-debug builds swap the default optimization config for optimize_speed.
+        configs -= [ "//build/config/compiler:default_optimization" ]
+        configs += [ "//build/config/compiler:optimize_speed" ]
+      }
+    }
+
+    public_configs = [ ":zlib_arm_crc32_config" ]  # Exposes the CRC32_ARMV8_CRC32 / ARMV8_OS_* defines to this target and its dependents.
+  }
+}
+
 config("zlib_inflate_chunk_simd_config") {
   if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
     defines = [ "INFLATE_CHUNK_SIMD_SSE2" ]
@@ -222,6 +279,8 @@
     if (arm_use_neon) {
       deps += [ ":zlib_adler32_simd" ]
 
+      deps += [ ":zlib_arm_crc32" ]
+
       deps += [ ":zlib_inflate_chunk_simd" ]
       sources -= [ "inflate.c" ]
     }
diff --git a/arm_features.c b/arm_features.c
new file mode 100644
index 0000000..60acbac
--- /dev/null
+++ b/arm_features.c
@@ -0,0 +1,60 @@
+/* arm_features.c -- ARM processor features detection.
+ *
+ * Copyright 2018 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+#include "arm_features.h"
+
+#include "zutil.h"
+#include <pthread.h>
+#include <stdint.h>
+
+#if defined(ARMV8_OS_ANDROID)
+#include <cpu-features.h>
+#elif defined(ARMV8_OS_LINUX)
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#else
+#error ### No ARM CPU features detection in your platform/OS
+#endif
+
+int ZLIB_INTERNAL arm_cpu_enable_crc32 = 0;  /* set to 1 by init_arm_features() when the CRC32 feature bit is reported */
+int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;  /* set to 1 by init_arm_features() when the PMULL feature bit is reported */
+
+static pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;  /* once-control used by arm_check_features() */
+
+static void init_arm_features(void)  /* Reads the OS-reported CPU capability bits and sets the arm_cpu_enable_* flags. */
+{
+    uint64_t flag_crc32 = 0, flag_pmull = 0, capabilities = 0;  /* stay zero (nothing enabled) if no branch below assigns them */
+
+#if defined(ARMV8_OS_ANDROID)
+    flag_crc32 = ANDROID_CPU_ARM_FEATURE_CRC32;
+    flag_pmull = ANDROID_CPU_ARM_FEATURE_PMULL;
+    capabilities = android_getCpuFeatures();  /* feature bitmask from <cpu-features.h> */
+#elif defined(ARMV8_OS_LINUX)
+    #if defined(__aarch64__)
+        flag_crc32 = HWCAP_CRC32;
+        flag_pmull = HWCAP_PMULL;
+        capabilities = getauxval(AT_HWCAP);  /* 64-bit builds read the AT_HWCAP vector */
+    #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+        /* The use of HWCAP2 is for getting features of newer ARMv8-A SoCs
+         * while running in 32bits mode (i.e. aarch32).
+         */
+        flag_crc32 = HWCAP2_CRC32;
+        flag_pmull = HWCAP2_PMULL;
+        capabilities = getauxval(AT_HWCAP2);  /* 32-bit builds read the AT_HWCAP2 vector */
+    #endif
+#endif
+
+    if (capabilities & flag_crc32)  /* the flag is only ever raised here, never cleared */
+        arm_cpu_enable_crc32 = 1;
+
+    if (capabilities & flag_pmull)
+        arm_cpu_enable_pmull = 1;
+}
+
+void ZLIB_INTERNAL arm_check_features(void)  /* Entry point for CPU feature detection; may be called repeatedly. */
+{
+    pthread_once(&cpu_check_inited_once, init_arm_features);  /* pthread_once runs init_arm_features() only on the first call */
+}
diff --git a/arm_features.h b/arm_features.h
new file mode 100644
index 0000000..09fec25
--- /dev/null
+++ b/arm_features.h
@@ -0,0 +1,13 @@
+/* arm_features.h -- ARM processor features detection.
+ *
+ * Copyright 2018 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#include "zlib.h"
+
+extern int arm_cpu_enable_crc32;  /* nonzero once arm_check_features() has found CRC32 support; NOTE(review): this header has no include guard */
+extern int arm_cpu_enable_pmull;  /* nonzero once arm_check_features() has found PMULL support */
+
+void arm_check_features(void);  /* one-time detection; NOTE(review): definitions in arm_features.c carry ZLIB_INTERNAL, these declarations do not */
diff --git a/crc32.c b/crc32.c
index 0757859..e95b908 100644
--- a/crc32.c
+++ b/crc32.c
@@ -34,6 +34,9 @@
 
 #if defined(CRC32_SIMD_SSE42_PCLMUL)
 #include "crc32_simd.h"
+#elif defined(CRC32_ARMV8_CRC32)
+#include "arm_features.h"
+#include "crc32_simd.h"
 #endif
 
 /* Definitions for doing the crc four data bytes at a time. */
@@ -277,6 +280,22 @@
     const unsigned char FAR *buf;
     uInt len;
 {
+#if defined(CRC32_ARMV8_CRC32)
+    /* We have to verify ARM CPU features, so exploit the common usage pattern
+     * of calling this function with Z_NULL for an initial valid crc value.
+     * This allows caching the result of the feature check and avoids
+     * extraneous function calls.
+     * TODO: try to move this to crc32_z if we don't lose performance on ARM.
+     */
+    if (buf == Z_NULL) {
+        if (!len) /* Assume user is calling crc32(0, NULL, 0); */
+            arm_check_features();
+        return 0UL;
+    }
+
+    if (arm_cpu_enable_crc32)
+        return armv8_crc32_little(crc, buf, len);
+#endif
     return crc32_z(crc, buf, len);
 }
 
diff --git a/crc32_simd.c b/crc32_simd.c
index 6538652..f5d9dd8 100644
--- a/crc32_simd.c
+++ b/crc32_simd.c
@@ -154,4 +154,52 @@
     return _mm_extract_epi32(x1, 1);
 }
 
-#endif  /* CRC32_SIMD_SSE42_PCLMUL */
+#elif defined(CRC32_ARMV8_CRC32)
+
+/* CRC32 checksums using the ARMv8-a CRC32 instructions.
+ *
+ * TODO: implement a version using the PMULL instruction.
+ */
+#include <arm_acle.h>
+
+uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
+                                          const unsigned char *buf,
+                                          z_size_t len)  /* Updates the running crc over buf[0..len) with the __crc32b/__crc32d intrinsics. */
+{
+    uint32_t c = (uint32_t) ~crc;  /* work on the inverted value; inverted back on return */
+
+    while (len && ((uintptr_t)buf & 7)) {  /* byte steps until buf is 8-byte aligned or the input runs out */
+        c = __crc32b(c, *buf++);
+        --len;
+    }
+
+    const uint64_t *buf8 = (const uint64_t *)buf;  /* at this point buf is 8-byte aligned or len is 0 */
+
+    while (len >= 64) {  /* unrolled: eight 64-bit steps consume 64 bytes per iteration */
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        len -= 64;
+    }
+
+    while (len >= 8) {  /* remaining whole 8-byte words */
+        c = __crc32d(c, *buf8++);
+        len -= 8;
+    }
+
+    buf = (const unsigned char *)buf8;  /* back to byte granularity for the tail */
+
+    while (len--) {  /* at most 7 trailing bytes */
+        c = __crc32b(c, *buf++);
+    }
+
+    return ~c;
+}
+
+#endif
diff --git a/crc32_simd.h b/crc32_simd.h
index 4e6f326..d3d0bce 100644
--- a/crc32_simd.h
+++ b/crc32_simd.h
@@ -25,3 +25,11 @@
  */
 #define Z_CRC32_SSE42_MINIMUM_LENGTH 64
 #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
+
+/*
+ * CRC32 of buf[0..len) continued from crc, using the ARMv8-a CRC32 instructions.
+ */
+uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
+                                          const unsigned char* buf,
+                                          z_size_t len);
+
diff --git a/names.h b/names.h
index c58e6bc..c61f2a7 100644
--- a/names.h
+++ b/names.h
@@ -174,4 +174,10 @@
 /* Symbols added by crc32_simd.c */
 #define crc32_sse42_simd_ Cr_z_crc32_sse42_simd_
 
+/* Symbols added by arm_features.c and crc32_simd.c (ARMv8 CRC32 support) */
+#define arm_cpu_enable_crc32 Cr_z_arm_cpu_enable_crc32
+#define arm_cpu_enable_pmull Cr_z_arm_cpu_enable_pmull
+#define arm_check_features Cr_z_arm_check_features
+#define armv8_crc32_little Cr_z_armv8_crc32_little
+
 #endif  /* THIRD_PARTY_ZLIB_NAMES_H_ */