Noel Gordon | 8e904b3 | 2018-01-04 12:10:08 +1100 | [diff] [blame^] | 1 | /* crc32_simd.c |
| 2 | * |
| 3 | * Copyright 2017 The Chromium Authors. All rights reserved. |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the Chromium source repository LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "crc32_simd.h" |
| 9 | |
| 10 | #if defined(CRC32_SIMD_SSE42_PCLMUL) |
| 11 | |
| 12 | /* |
| 13 | * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
| 14 | * length must be at least 64, and a multiple of 16. Based on: |
| 15 | * |
| 16 | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
| 17 | * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
| 18 | */ |
| 19 | |
| 20 | #include <emmintrin.h> |
| 21 | #include <smmintrin.h> |
| 22 | #include <wmmintrin.h> |
| 23 | |
| 24 | uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */ |
| 25 | const unsigned char *buf, |
| 26 | z_size_t len, |
| 27 | uint32_t crc) |
| 28 | { |
| 29 | /* |
| 30 | * Definitions of the bit-reflected domain constants k1,k2,k3, etc and |
| 31 | * the CRC32+Barrett polynomials given at the end of the paper. |
| 32 | */ |
| 33 | static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; |
| 34 | static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; |
| 35 | static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; |
| 36 | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
| 37 | |
| 38 | __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
| 39 | |
| 40 | /* |
| 41 | * There's at least one block of 64. |
| 42 | */ |
| 43 | x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
| 44 | x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
| 45 | x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
| 46 | x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
| 47 | |
| 48 | x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); |
| 49 | |
| 50 | x0 = _mm_load_si128((__m128i *)k1k2); |
| 51 | |
| 52 | buf += 64; |
| 53 | len -= 64; |
| 54 | |
| 55 | /* |
| 56 | * Parallel fold blocks of 64, if any. |
| 57 | */ |
| 58 | while (len >= 64) |
| 59 | { |
| 60 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 61 | x6 = _mm_clmulepi64_si128(x2, x0, 0x00); |
| 62 | x7 = _mm_clmulepi64_si128(x3, x0, 0x00); |
| 63 | x8 = _mm_clmulepi64_si128(x4, x0, 0x00); |
| 64 | |
| 65 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 66 | x2 = _mm_clmulepi64_si128(x2, x0, 0x11); |
| 67 | x3 = _mm_clmulepi64_si128(x3, x0, 0x11); |
| 68 | x4 = _mm_clmulepi64_si128(x4, x0, 0x11); |
| 69 | |
| 70 | y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
| 71 | y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
| 72 | y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
| 73 | y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
| 74 | |
| 75 | x1 = _mm_xor_si128(x1, x5); |
| 76 | x2 = _mm_xor_si128(x2, x6); |
| 77 | x3 = _mm_xor_si128(x3, x7); |
| 78 | x4 = _mm_xor_si128(x4, x8); |
| 79 | |
| 80 | x1 = _mm_xor_si128(x1, y5); |
| 81 | x2 = _mm_xor_si128(x2, y6); |
| 82 | x3 = _mm_xor_si128(x3, y7); |
| 83 | x4 = _mm_xor_si128(x4, y8); |
| 84 | |
| 85 | buf += 64; |
| 86 | len -= 64; |
| 87 | } |
| 88 | |
| 89 | /* |
| 90 | * Fold into 128-bits. |
| 91 | */ |
| 92 | x0 = _mm_load_si128((__m128i *)k3k4); |
| 93 | |
| 94 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 95 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 96 | x1 = _mm_xor_si128(x1, x2); |
| 97 | x1 = _mm_xor_si128(x1, x5); |
| 98 | |
| 99 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 100 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 101 | x1 = _mm_xor_si128(x1, x3); |
| 102 | x1 = _mm_xor_si128(x1, x5); |
| 103 | |
| 104 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 105 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 106 | x1 = _mm_xor_si128(x1, x4); |
| 107 | x1 = _mm_xor_si128(x1, x5); |
| 108 | |
| 109 | /* |
| 110 | * Single fold blocks of 16, if any. |
| 111 | */ |
| 112 | while (len >= 16) |
| 113 | { |
| 114 | x2 = _mm_loadu_si128((__m128i *)buf); |
| 115 | |
| 116 | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 117 | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 118 | x1 = _mm_xor_si128(x1, x2); |
| 119 | x1 = _mm_xor_si128(x1, x5); |
| 120 | |
| 121 | buf += 16; |
| 122 | len -= 16; |
| 123 | } |
| 124 | |
| 125 | /* |
| 126 | * Fold 128-bits to 64-bits. |
| 127 | */ |
| 128 | x2 = _mm_clmulepi64_si128(x1, x0, 0x10); |
| 129 | x3 = _mm_set_epi32(0, ~0, 0, ~0); |
| 130 | x1 = _mm_srli_si128(x1, 8); |
| 131 | x1 = _mm_xor_si128(x1, x2); |
| 132 | |
| 133 | x0 = _mm_loadl_epi64((__m128i*)k5k0); |
| 134 | |
| 135 | x2 = _mm_srli_si128(x1, 4); |
| 136 | x1 = _mm_and_si128(x1, x3); |
| 137 | x1 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 138 | x1 = _mm_xor_si128(x1, x2); |
| 139 | |
| 140 | /* |
| 141 | * Barret reduce to 32-bits. |
| 142 | */ |
| 143 | x0 = _mm_load_si128((__m128i*)poly); |
| 144 | |
| 145 | x2 = _mm_and_si128(x1, x3); |
| 146 | x2 = _mm_clmulepi64_si128(x2, x0, 0x10); |
| 147 | x2 = _mm_and_si128(x2, x3); |
| 148 | x2 = _mm_clmulepi64_si128(x2, x0, 0x00); |
| 149 | x1 = _mm_xor_si128(x1, x2); |
| 150 | |
| 151 | /* |
| 152 | * Return the crc32. |
| 153 | */ |
| 154 | return _mm_extract_epi32(x1, 1); |
| 155 | } |
| 156 | |
| 157 | #endif /* CRC32_SIMD_SSE42_PCLMUL */ |