Noel Gordon | 8e904b3 | 2018-01-04 12:10:08 +1100 | [diff] [blame] | 1 | /* crc32_simd.c |
| 2 | * |
| 3 | * Copyright 2017 The Chromium Authors. All rights reserved. |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the Chromium source repository LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "crc32_simd.h" |
| 9 | |
| 10 | #if defined(CRC32_SIMD_SSE42_PCLMUL) |
| 11 | |
| 12 | /* |
| 13 | * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
| 14 | * length must be at least 64, and a multiple of 16. Based on: |
| 15 | * |
| 16 | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
| 17 | * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
| 18 | */ |
| 19 | |
| 20 | #include <emmintrin.h> |
| 21 | #include <smmintrin.h> |
| 22 | #include <wmmintrin.h> |
| 23 | |
| 24 | uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */ |
| 25 | const unsigned char *buf, |
| 26 | z_size_t len, |
| 27 | uint32_t crc) |
| 28 | { |
| 29 | /* |
| 30 | * Definitions of the bit-reflected domain constants k1,k2,k3, etc and |
| 31 | * the CRC32+Barrett polynomials given at the end of the paper. |
| 32 | */ |
| 33 | static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; |
| 34 | static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; |
| 35 | static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; |
| 36 | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
| 37 | |
| 38 | __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
| 39 | |
| 40 | /* |
| 41 | * There's at least one block of 64. |
| 42 | */ |
| 43 | x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
| 44 | x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
| 45 | x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
| 46 | x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
| 47 | |
| 48 | x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); |
| 49 | |
| 50 | x0 = _mm_load_si128((__m128i *)k1k2); |
| 51 | |
| 52 | buf += 64; |
| 53 | len -= 64; |
| 54 | |
| 55 | /* |
| 56 | * Parallel fold blocks of 64, if any. |
| 57 | */ |
| 58 | while (len >= 64) |
| 59 | { |
| 60 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 61 | x6 = _mm_clmulepi64_si128(x2, x0, 0x00); |
| 62 | x7 = _mm_clmulepi64_si128(x3, x0, 0x00); |
| 63 | x8 = _mm_clmulepi64_si128(x4, x0, 0x00); |
| 64 | |
| 65 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 66 | x2 = _mm_clmulepi64_si128(x2, x0, 0x11); |
| 67 | x3 = _mm_clmulepi64_si128(x3, x0, 0x11); |
| 68 | x4 = _mm_clmulepi64_si128(x4, x0, 0x11); |
| 69 | |
| 70 | y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
| 71 | y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
| 72 | y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
| 73 | y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
| 74 | |
| 75 | x1 = _mm_xor_si128(x1, x5); |
| 76 | x2 = _mm_xor_si128(x2, x6); |
| 77 | x3 = _mm_xor_si128(x3, x7); |
| 78 | x4 = _mm_xor_si128(x4, x8); |
| 79 | |
| 80 | x1 = _mm_xor_si128(x1, y5); |
| 81 | x2 = _mm_xor_si128(x2, y6); |
| 82 | x3 = _mm_xor_si128(x3, y7); |
| 83 | x4 = _mm_xor_si128(x4, y8); |
| 84 | |
| 85 | buf += 64; |
| 86 | len -= 64; |
| 87 | } |
| 88 | |
| 89 | /* |
| 90 | * Fold into 128-bits. |
| 91 | */ |
| 92 | x0 = _mm_load_si128((__m128i *)k3k4); |
| 93 | |
| 94 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 95 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 96 | x1 = _mm_xor_si128(x1, x2); |
| 97 | x1 = _mm_xor_si128(x1, x5); |
| 98 | |
| 99 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 100 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 101 | x1 = _mm_xor_si128(x1, x3); |
| 102 | x1 = _mm_xor_si128(x1, x5); |
| 103 | |
| 104 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 105 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 106 | x1 = _mm_xor_si128(x1, x4); |
| 107 | x1 = _mm_xor_si128(x1, x5); |
| 108 | |
| 109 | /* |
| 110 | * Single fold blocks of 16, if any. |
| 111 | */ |
| 112 | while (len >= 16) |
| 113 | { |
| 114 | x2 = _mm_loadu_si128((__m128i *)buf); |
| 115 | |
| 116 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 117 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 118 | x1 = _mm_xor_si128(x1, x2); |
| 119 | x1 = _mm_xor_si128(x1, x5); |
| 120 | |
| 121 | buf += 16; |
| 122 | len -= 16; |
| 123 | } |
| 124 | |
| 125 | /* |
| 126 | * Fold 128-bits to 64-bits. |
| 127 | */ |
| 128 | x2 = _mm_clmulepi64_si128(x1, x0, 0x10); |
Noel Gordon | bb1fbea | 2018-02-05 20:37:08 +0000 | [diff] [blame] | 129 | x3 = _mm_setr_epi32(~0, 0, ~0, 0); |
Noel Gordon | 8e904b3 | 2018-01-04 12:10:08 +1100 | [diff] [blame] | 130 | x1 = _mm_srli_si128(x1, 8); |
| 131 | x1 = _mm_xor_si128(x1, x2); |
| 132 | |
| 133 | x0 = _mm_loadl_epi64((__m128i*)k5k0); |
| 134 | |
| 135 | x2 = _mm_srli_si128(x1, 4); |
| 136 | x1 = _mm_and_si128(x1, x3); |
| 137 | x1 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 138 | x1 = _mm_xor_si128(x1, x2); |
| 139 | |
| 140 | /* |
| 141 | * Barret reduce to 32-bits. |
| 142 | */ |
| 143 | x0 = _mm_load_si128((__m128i*)poly); |
| 144 | |
| 145 | x2 = _mm_and_si128(x1, x3); |
| 146 | x2 = _mm_clmulepi64_si128(x2, x0, 0x10); |
| 147 | x2 = _mm_and_si128(x2, x3); |
| 148 | x2 = _mm_clmulepi64_si128(x2, x0, 0x00); |
| 149 | x1 = _mm_xor_si128(x1, x2); |
| 150 | |
| 151 | /* |
| 152 | * Return the crc32. |
| 153 | */ |
| 154 | return _mm_extract_epi32(x1, 1); |
| 155 | } |
| 156 | |
Adenilson Cavalcanti | 7235672 | 2018-02-16 03:41:14 +0000 | [diff] [blame] | 157 | #elif defined(CRC32_ARMV8_CRC32) |
| 158 | |
/* CRC32 checksums using the ARMv8-a CRC32 extension instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */
Adenilson Cavalcanti | 7235672 | 2018-02-16 03:41:14 +0000 | [diff] [blame] | 163 | |
Jose Dapena Paz | bbacb13 | 2019-06-10 09:16:22 +0000 | [diff] [blame] | 164 | #if defined(__clang__) |
George Burgess IV | de0fe05 | 2018-08-23 00:40:55 +0000 | [diff] [blame] | 165 | /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an |
| 166 | * armv8 target, which is incompatible with ThinLTO optimizations on Android. |
| 167 | * (Namely, mixing and matching different module-level targets makes ThinLTO |
| 168 | * warn, and Android defaults to armv7-a. This restriction does not apply to |
| 169 | * function-level `target`s, however.) |
| 170 | * |
Adenilson Cavalcanti | 2b4888a | 2019-08-01 17:28:10 +0000 | [diff] [blame] | 171 | * Since we only need four crc intrinsics, and since clang's implementation of |
George Burgess IV | de0fe05 | 2018-08-23 00:40:55 +0000 | [diff] [blame] | 172 | * those are just wrappers around compiler builtins, it's simplest to #define |
| 173 | * those builtins directly. If this #define list grows too much (or we depend on |
| 174 | * an intrinsic that isn't a trivial wrapper), we may have to find a better way |
| 175 | * to go about this. |
| 176 | * |
| 177 | * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized |
| 178 | * feature for this target (ignoring feature)." This appears to be a harmless |
| 179 | * bug in clang. |
| 180 | */ |
| 181 | #define __crc32b __builtin_arm_crc32b |
| 182 | #define __crc32d __builtin_arm_crc32d |
| 183 | #define __crc32w __builtin_arm_crc32w |
Adenilson Cavalcanti | 2b4888a | 2019-08-01 17:28:10 +0000 | [diff] [blame] | 184 | #define __crc32cw __builtin_arm_crc32cw |
George Burgess IV | de0fe05 | 2018-08-23 00:40:55 +0000 | [diff] [blame] | 185 | |
| 186 | #if defined(__aarch64__) |
| 187 | #define TARGET_ARMV8_WITH_CRC __attribute__((target("crc"))) |
Jose Dapena Paz | bbacb13 | 2019-06-10 09:16:22 +0000 | [diff] [blame] | 188 | #else // !defined(__aarch64__) |
George Burgess IV | de0fe05 | 2018-08-23 00:40:55 +0000 | [diff] [blame] | 189 | #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc"))) |
Jose Dapena Paz | bbacb13 | 2019-06-10 09:16:22 +0000 | [diff] [blame] | 190 | #endif // defined(__aarch64__) |
| 191 | |
| 192 | #elif defined(__GNUC__) |
| 193 | /* For GCC, we are setting CRC extensions at module level, so ThinLTO is not |
| 194 | * allowed. We can just include arm_acle.h. |
| 195 | */ |
| 196 | #include <arm_acle.h> |
| 197 | #define TARGET_ARMV8_WITH_CRC |
| 198 | #else // !defined(__GNUC__) && !defined(_aarch64__) |
| 199 | #error ARM CRC32 SIMD extensions only supported for Clang and GCC |
George Burgess IV | de0fe05 | 2018-08-23 00:40:55 +0000 | [diff] [blame] | 200 | #endif |
| 201 | |
| 202 | TARGET_ARMV8_WITH_CRC |
Adenilson Cavalcanti | 7235672 | 2018-02-16 03:41:14 +0000 | [diff] [blame] | 203 | uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc, |
| 204 | const unsigned char *buf, |
| 205 | z_size_t len) |
| 206 | { |
| 207 | uint32_t c = (uint32_t) ~crc; |
| 208 | |
| 209 | while (len && ((uintptr_t)buf & 7)) { |
| 210 | c = __crc32b(c, *buf++); |
| 211 | --len; |
| 212 | } |
| 213 | |
| 214 | const uint64_t *buf8 = (const uint64_t *)buf; |
| 215 | |
| 216 | while (len >= 64) { |
| 217 | c = __crc32d(c, *buf8++); |
| 218 | c = __crc32d(c, *buf8++); |
| 219 | c = __crc32d(c, *buf8++); |
| 220 | c = __crc32d(c, *buf8++); |
| 221 | |
| 222 | c = __crc32d(c, *buf8++); |
| 223 | c = __crc32d(c, *buf8++); |
| 224 | c = __crc32d(c, *buf8++); |
| 225 | c = __crc32d(c, *buf8++); |
| 226 | len -= 64; |
| 227 | } |
| 228 | |
| 229 | while (len >= 8) { |
| 230 | c = __crc32d(c, *buf8++); |
| 231 | len -= 8; |
| 232 | } |
| 233 | |
| 234 | buf = (const unsigned char *)buf8; |
| 235 | |
| 236 | while (len--) { |
| 237 | c = __crc32b(c, *buf++); |
| 238 | } |
| 239 | |
| 240 | return ~c; |
| 241 | } |
| 242 | |
| 243 | #endif |