blob: 2fef610e21a2e6350c7965b54b90c22c903c2b6b [file] [log] [blame]
Noel Gordon8e904b32018-01-04 12:10:08 +11001/* crc32_simd.c
2 *
3 * Copyright 2017 The Chromium Authors. All rights reserved.
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the Chromium source repository LICENSE file.
6 */
7
#include "crc32_simd.h"

#include <string.h>
9
10#if defined(CRC32_SIMD_SSE42_PCLMUL)
11
12/*
13 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
14 * length must be at least 64, and a multiple of 16. Based on:
15 *
16 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
17 * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
18 */
19
20#include <emmintrin.h>
21#include <smmintrin.h>
22#include <wmmintrin.h>
23
24uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
25 const unsigned char *buf,
26 z_size_t len,
27 uint32_t crc)
28{
29 /*
30 * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
31 * the CRC32+Barrett polynomials given at the end of the paper.
32 */
33 static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
34 static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
35 static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
36 static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
37
38 __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
39
40 /*
41 * There's at least one block of 64.
42 */
43 x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
44 x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
45 x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
46 x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
47
48 x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
49
50 x0 = _mm_load_si128((__m128i *)k1k2);
51
52 buf += 64;
53 len -= 64;
54
55 /*
56 * Parallel fold blocks of 64, if any.
57 */
58 while (len >= 64)
59 {
60 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
61 x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
62 x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
63 x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
64
65 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
66 x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
67 x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
68 x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
69
70 y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
71 y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
72 y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
73 y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
74
75 x1 = _mm_xor_si128(x1, x5);
76 x2 = _mm_xor_si128(x2, x6);
77 x3 = _mm_xor_si128(x3, x7);
78 x4 = _mm_xor_si128(x4, x8);
79
80 x1 = _mm_xor_si128(x1, y5);
81 x2 = _mm_xor_si128(x2, y6);
82 x3 = _mm_xor_si128(x3, y7);
83 x4 = _mm_xor_si128(x4, y8);
84
85 buf += 64;
86 len -= 64;
87 }
88
89 /*
90 * Fold into 128-bits.
91 */
92 x0 = _mm_load_si128((__m128i *)k3k4);
93
94 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
95 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
96 x1 = _mm_xor_si128(x1, x2);
97 x1 = _mm_xor_si128(x1, x5);
98
99 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
100 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
101 x1 = _mm_xor_si128(x1, x3);
102 x1 = _mm_xor_si128(x1, x5);
103
104 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
105 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
106 x1 = _mm_xor_si128(x1, x4);
107 x1 = _mm_xor_si128(x1, x5);
108
109 /*
110 * Single fold blocks of 16, if any.
111 */
112 while (len >= 16)
113 {
114 x2 = _mm_loadu_si128((__m128i *)buf);
115
116 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
117 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
118 x1 = _mm_xor_si128(x1, x2);
119 x1 = _mm_xor_si128(x1, x5);
120
121 buf += 16;
122 len -= 16;
123 }
124
125 /*
126 * Fold 128-bits to 64-bits.
127 */
128 x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
Noel Gordonbb1fbea2018-02-05 20:37:08 +0000129 x3 = _mm_setr_epi32(~0, 0, ~0, 0);
Noel Gordon8e904b32018-01-04 12:10:08 +1100130 x1 = _mm_srli_si128(x1, 8);
131 x1 = _mm_xor_si128(x1, x2);
132
133 x0 = _mm_loadl_epi64((__m128i*)k5k0);
134
135 x2 = _mm_srli_si128(x1, 4);
136 x1 = _mm_and_si128(x1, x3);
137 x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
138 x1 = _mm_xor_si128(x1, x2);
139
140 /*
141 * Barret reduce to 32-bits.
142 */
143 x0 = _mm_load_si128((__m128i*)poly);
144
145 x2 = _mm_and_si128(x1, x3);
146 x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
147 x2 = _mm_and_si128(x2, x3);
148 x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
149 x1 = _mm_xor_si128(x1, x2);
150
151 /*
152 * Return the crc32.
153 */
154 return _mm_extract_epi32(x1, 1);
155}
156
Adenilson Cavalcanti72356722018-02-16 03:41:14 +0000157#elif defined(CRC32_ARMV8_CRC32)
158
/* CRC32 checksums using the ARMv8-a CRC32 extension instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */
Adenilson Cavalcanti72356722018-02-16 03:41:14 +0000163
#if defined(__clang__)
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need three crc intrinsics, and since clang's implementation of
 * those are just wrappers around compiler builtins, it's simplest to #define
 * those builtins directly. If this #define list grows too much (or we depend on
 * an intrinsic that isn't a trivial wrapper), we may have to find a better way
 * to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w

/* Function-level target attribute applied to every routine in this section
 * that uses the CRC32 builtins; the attribute string differs between 64-bit
 * and 32-bit ARM builds.
 */
#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else  /* !defined(__aarch64__) */
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif  /* defined(__aarch64__) */

#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
/* Expands to nothing: no per-function attribute is applied on this path. */
#define TARGET_ARMV8_WITH_CRC
#else  /* !defined(__clang__) && !defined(__GNUC__) */
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif
200
201TARGET_ARMV8_WITH_CRC
Adenilson Cavalcanti72356722018-02-16 03:41:14 +0000202uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
203 const unsigned char *buf,
204 z_size_t len)
205{
206 uint32_t c = (uint32_t) ~crc;
207
208 while (len && ((uintptr_t)buf & 7)) {
209 c = __crc32b(c, *buf++);
210 --len;
211 }
212
213 const uint64_t *buf8 = (const uint64_t *)buf;
214
215 while (len >= 64) {
216 c = __crc32d(c, *buf8++);
217 c = __crc32d(c, *buf8++);
218 c = __crc32d(c, *buf8++);
219 c = __crc32d(c, *buf8++);
220
221 c = __crc32d(c, *buf8++);
222 c = __crc32d(c, *buf8++);
223 c = __crc32d(c, *buf8++);
224 c = __crc32d(c, *buf8++);
225 len -= 64;
226 }
227
228 while (len >= 8) {
229 c = __crc32d(c, *buf8++);
230 len -= 8;
231 }
232
233 buf = (const unsigned char *)buf8;
234
235 while (len--) {
236 c = __crc32b(c, *buf++);
237 }
238
239 return ~c;
240}
241
George Burgess IVde0fe052018-08-23 00:40:55 +0000242TARGET_ARMV8_WITH_CRC
Adenilson Cavalcanti21cc38f2018-08-15 01:06:05 +0000243Pos ZLIB_INTERNAL insert_string_arm(deflate_state *const s, const Pos str)
244{
245 Pos ret;
246 unsigned *ip, val, h = 0;
247
248 ip = (unsigned *)&s->window[str];
249 val = *ip;
250
251 if (s->level >= 6)
252 val &= 0xFFFFFF;
253
254 h = __crc32w(h, val);
255
256 ret = s->head[h & s->hash_mask];
257 s->head[h & s->hash_mask] = str;
258 s->prev[str & s->w_mask] = ret;
259 return ret;
260}
261
Adenilson Cavalcanti72356722018-02-16 03:41:14 +0000262#endif