Tim Chen | 31d9396 | 2013-05-01 12:52:49 -0700 | [diff] [blame] | 1 | ######################################################################## |
| 2 | # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions |
| 3 | # |
| 4 | # Copyright (c) 2013, Intel Corporation |
| 5 | # |
| 6 | # Authors: |
| 7 | # Erdinc Ozturk <erdinc.ozturk@intel.com> |
| 8 | # Vinodh Gopal <vinodh.gopal@intel.com> |
| 9 | # James Guilford <james.guilford@intel.com> |
| 10 | # Tim Chen <tim.c.chen@linux.intel.com> |
| 11 | # |
| 12 | # This software is available to you under a choice of one of two |
| 13 | # licenses. You may choose to be licensed under the terms of the GNU |
| 14 | # General Public License (GPL) Version 2, available from the file |
| 15 | # COPYING in the main directory of this source tree, or the |
| 16 | # OpenIB.org BSD license below: |
| 17 | # |
| 18 | # Redistribution and use in source and binary forms, with or without |
| 19 | # modification, are permitted provided that the following conditions are |
| 20 | # met: |
| 21 | # |
| 22 | # * Redistributions of source code must retain the above copyright |
| 23 | # notice, this list of conditions and the following disclaimer. |
| 24 | # |
| 25 | # * Redistributions in binary form must reproduce the above copyright |
| 26 | # notice, this list of conditions and the following disclaimer in the |
| 27 | # documentation and/or other materials provided with the |
| 28 | # distribution. |
| 29 | # |
| 30 | # * Neither the name of the Intel Corporation nor the names of its |
| 31 | # contributors may be used to endorse or promote products derived from |
| 32 | # this software without specific prior written permission. |
| 33 | # |
| 34 | # |
| 35 | # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY |
| 36 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 37 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 38 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR |
| 39 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 40 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 41 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 42 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 43 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 44 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 45 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 46 | ######################################################################## |
| 47 | # Function API: |
| 48 | # UINT16 crc_t10dif_pcl( |
| 49 | # UINT16 init_crc, //initial CRC value, 16 bits |
| 50 | # const unsigned char *buf, //buffer pointer to calculate CRC on |
| 51 | # UINT64 len //buffer length in bytes (64-bit data) |
| 52 | # ); |
| 53 | # |
| 54 | # Reference paper titled "Fast CRC Computation for Generic |
| 55 | # Polynomials Using PCLMULQDQ Instruction" |
| 56 | # URL: http://www.intel.com/content/dam/www/public/us/en/documents |
| 57 | # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf |
| 58 | # |
| 59 | # |
| 60 | |
| 61 | #include <linux/linkage.h> |
| 62 | |
| 63 | .text |
| 64 | |
| 65 | #define arg1 %rdi |
| 66 | #define arg2 %rsi |
| 67 | #define arg3 %rdx |
| 68 | |
| 69 | #define arg1_low32 %edi |
| 70 | |
| 71 | ENTRY(crc_t10dif_pcl) |
| 72 | .align 16 |
| 73 | |
| 74 | # arg1 = init_crc, arg2 = buf, arg3 = len (see "Function API" above). Adjust the 16-bit initial_crc value: scale it to 32 bits by moving it into bits 31:16 |
| 75 | shl $16, arg1_low32 |
| 76 | |
| 77 | # Allocate Stack Space: rcx keeps the caller's rsp (restored at _cleanup), then 32 bytes of scratch are reserved |
| 78 | mov %rsp, %rcx |
| 79 | sub $16*2, %rsp |
| 80 | # align stack to 16 byte boundary (the short-buffer paths access the scratch slot with movdqa) |
| 81 | and $~(0x10 - 1), %rsp |
| 82 | |
| 83 | # check if smaller than 256 |
| 84 | cmp $256, arg3 |
| 85 | |
| 86 | # for sizes less than 256, we can't fold 128B at a time; the target label is named _less_than_128 but is taken for len < 256 (jl is a signed comparison) |
| 87 | jl _less_than_128 |
| 88 | |
| 89 | |
| 90 | # load the initial crc value |
| 91 | movd arg1_low32, %xmm10 # initial crc |
| 92 | |
| 93 | # crc value does not need to be byte-reflected, but it needs |
| 94 | # to be moved to the high part of the register. |
| 95 | # because data will be byte-reflected and will align with |
| 96 | # initial crc at correct place. |
| 97 | pslldq $12, %xmm10 |
| 98 | |
| 99 | movdqa SHUF_MASK(%rip), %xmm11 |
| 100 | # receive the initial 128B of data (8 xmm registers), byte-reflect it and xor the initial crc value into the first 16B |
| 101 | movdqu 16*0(arg2), %xmm0 |
| 102 | movdqu 16*1(arg2), %xmm1 |
| 103 | movdqu 16*2(arg2), %xmm2 |
| 104 | movdqu 16*3(arg2), %xmm3 |
| 105 | movdqu 16*4(arg2), %xmm4 |
| 106 | movdqu 16*5(arg2), %xmm5 |
| 107 | movdqu 16*6(arg2), %xmm6 |
| 108 | movdqu 16*7(arg2), %xmm7 |
| 109 | |
| 110 | pshufb %xmm11, %xmm0 |
| 111 | # XOR the initial_crc value |
| 112 | pxor %xmm10, %xmm0 |
| 113 | pshufb %xmm11, %xmm1 |
| 114 | pshufb %xmm11, %xmm2 |
| 115 | pshufb %xmm11, %xmm3 |
| 116 | pshufb %xmm11, %xmm4 |
| 117 | pshufb %xmm11, %xmm5 |
| 118 | pshufb %xmm11, %xmm6 |
| 119 | pshufb %xmm11, %xmm7 |
| 120 | |
| 121 | movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 |
| 122 | #imm value of pclmulqdq instruction |
| 123 | #will determine which constant to use: $0x0 multiplies the low qwords (rk3), $0x11 the high qwords (rk4) |
| 124 | |
| 125 | ################################################################# |
| 126 | # we subtract 256 instead of 128 to save one instruction from the loop |
| 127 | sub $256, arg3 |
| 128 | |
| 129 | # at this section of the code, there is 128*x+y (0<=y<128) bytes of |
| 130 | # buffer. The _fold_64_B_loop will fold 128B at a time |
| 131 | # until we have 128+y Bytes of buffer |
| 132 | |
| 133 | |
| 134 | # fold 128B at a time. This section of the code folds 8 xmm |
| 135 | # registers (xmm0-xmm7) in parallel; the label name still says 64 |
| 136 | _fold_64_B_loop: |
| 137 | |
| 138 | # update the buffer pointer |
| 139 | add $128, arg2 # buf += 128 |
| 140 | |
| 141 | movdqu 16*0(arg2), %xmm9 |
| 142 | movdqu 16*1(arg2), %xmm12 |
| 143 | pshufb %xmm11, %xmm9 |
| 144 | pshufb %xmm11, %xmm12 |
| 145 | movdqa %xmm0, %xmm8 |
| 146 | movdqa %xmm1, %xmm13 |
| 147 | pclmulqdq $0x0 , %xmm10, %xmm0 |
| 148 | pclmulqdq $0x11, %xmm10, %xmm8 |
| 149 | pclmulqdq $0x0 , %xmm10, %xmm1 |
| 150 | pclmulqdq $0x11, %xmm10, %xmm13 |
| 151 | pxor %xmm9 , %xmm0 |
| 152 | xorps %xmm8 , %xmm0 |
| 153 | pxor %xmm12, %xmm1 |
| 154 | xorps %xmm13, %xmm1 |
| 155 | |
| 156 | movdqu 16*2(arg2), %xmm9 |
| 157 | movdqu 16*3(arg2), %xmm12 |
| 158 | pshufb %xmm11, %xmm9 |
| 159 | pshufb %xmm11, %xmm12 |
| 160 | movdqa %xmm2, %xmm8 |
| 161 | movdqa %xmm3, %xmm13 |
| 162 | pclmulqdq $0x0, %xmm10, %xmm2 |
| 163 | pclmulqdq $0x11, %xmm10, %xmm8 |
| 164 | pclmulqdq $0x0, %xmm10, %xmm3 |
| 165 | pclmulqdq $0x11, %xmm10, %xmm13 |
| 166 | pxor %xmm9 , %xmm2 |
| 167 | xorps %xmm8 , %xmm2 |
| 168 | pxor %xmm12, %xmm3 |
| 169 | xorps %xmm13, %xmm3 |
| 170 | |
| 171 | movdqu 16*4(arg2), %xmm9 |
| 172 | movdqu 16*5(arg2), %xmm12 |
| 173 | pshufb %xmm11, %xmm9 |
| 174 | pshufb %xmm11, %xmm12 |
| 175 | movdqa %xmm4, %xmm8 |
| 176 | movdqa %xmm5, %xmm13 |
| 177 | pclmulqdq $0x0, %xmm10, %xmm4 |
| 178 | pclmulqdq $0x11, %xmm10, %xmm8 |
| 179 | pclmulqdq $0x0, %xmm10, %xmm5 |
| 180 | pclmulqdq $0x11, %xmm10, %xmm13 |
| 181 | pxor %xmm9 , %xmm4 |
| 182 | xorps %xmm8 , %xmm4 |
| 183 | pxor %xmm12, %xmm5 |
| 184 | xorps %xmm13, %xmm5 |
| 185 | |
| 186 | movdqu 16*6(arg2), %xmm9 |
| 187 | movdqu 16*7(arg2), %xmm12 |
| 188 | pshufb %xmm11, %xmm9 |
| 189 | pshufb %xmm11, %xmm12 |
| 190 | movdqa %xmm6 , %xmm8 |
| 191 | movdqa %xmm7 , %xmm13 |
| 192 | pclmulqdq $0x0 , %xmm10, %xmm6 |
| 193 | pclmulqdq $0x11, %xmm10, %xmm8 |
| 194 | pclmulqdq $0x0 , %xmm10, %xmm7 |
| 195 | pclmulqdq $0x11, %xmm10, %xmm13 |
| 196 | pxor %xmm9 , %xmm6 |
| 197 | xorps %xmm8 , %xmm6 |
| 198 | pxor %xmm12, %xmm7 |
| 199 | xorps %xmm13, %xmm7 |
| 200 | |
| 201 | sub $128, arg3 |
| 202 | |
| 203 | # check if there is another 128B in the buffer to be able to fold (flags come from the sub above) |
| 204 | jge _fold_64_B_loop |
| 205 | ################################################################## |
| 206 | |
| 207 | |
| 208 | add $128, arg2 |
| 209 | # at this point, the buffer pointer is pointing at the last y Bytes |
| 210 | # of the buffer; the 128B of folded data is in 8 of the xmm |
| 211 | # registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 |
| 212 | |
| 213 | |
| 214 | # fold the 8 xmm registers to 1 xmm register (xmm7) with different constants: xmm0..xmm6 are each multiplied by their own rk pair and xored into xmm7 |
| 215 | |
| 216 | movdqa rk9(%rip), %xmm10 |
| 217 | movdqa %xmm0, %xmm8 |
| 218 | pclmulqdq $0x11, %xmm10, %xmm0 |
| 219 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 220 | pxor %xmm8, %xmm7 |
| 221 | xorps %xmm0, %xmm7 |
| 222 | |
| 223 | movdqa rk11(%rip), %xmm10 |
| 224 | movdqa %xmm1, %xmm8 |
| 225 | pclmulqdq $0x11, %xmm10, %xmm1 |
| 226 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 227 | pxor %xmm8, %xmm7 |
| 228 | xorps %xmm1, %xmm7 |
| 229 | |
| 230 | movdqa rk13(%rip), %xmm10 |
| 231 | movdqa %xmm2, %xmm8 |
| 232 | pclmulqdq $0x11, %xmm10, %xmm2 |
| 233 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 234 | pxor %xmm8, %xmm7 |
| 235 | pxor %xmm2, %xmm7 |
| 236 | |
| 237 | movdqa rk15(%rip), %xmm10 |
| 238 | movdqa %xmm3, %xmm8 |
| 239 | pclmulqdq $0x11, %xmm10, %xmm3 |
| 240 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 241 | pxor %xmm8, %xmm7 |
| 242 | xorps %xmm3, %xmm7 |
| 243 | |
| 244 | movdqa rk17(%rip), %xmm10 |
| 245 | movdqa %xmm4, %xmm8 |
| 246 | pclmulqdq $0x11, %xmm10, %xmm4 |
| 247 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 248 | pxor %xmm8, %xmm7 |
| 249 | pxor %xmm4, %xmm7 |
| 250 | |
| 251 | movdqa rk19(%rip), %xmm10 |
| 252 | movdqa %xmm5, %xmm8 |
| 253 | pclmulqdq $0x11, %xmm10, %xmm5 |
| 254 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 255 | pxor %xmm8, %xmm7 |
| 256 | xorps %xmm5, %xmm7 |
| 257 | |
| 258 | movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 |
| 259 | #imm value of pclmulqdq instruction |
| 260 | #will determine which constant to use; xmm10 is left holding rk1/rk2 for the 16B reduction code that follows |
| 261 | movdqa %xmm6, %xmm8 |
| 262 | pclmulqdq $0x11, %xmm10, %xmm6 |
| 263 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 264 | pxor %xmm8, %xmm7 |
| 265 | pxor %xmm6, %xmm7 |
| 266 | |
| 267 | |
| 268 | # instead of 128, we add 112 (128-16) to the loop counter to save 1 instruction |
| 269 | # from the loop; instead of a cmp instruction, we use the negative |
| 270 | # flag with the jl instruction |
| 271 | add $128-16, arg3 |
| 272 | jl _final_reduction_for_128 |
| 273 | |
| 274 | # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 |
| 275 | # and the rest is in memory. We can fold 16 bytes at a time if y>=16 |
| 276 | # continue folding 16B at a time (this loop is also entered by a jmp from _less_than_128, which sets up xmm7, xmm10 = rk1/rk2 and xmm11 = SHUF_MASK first) |
| 277 | |
| 278 | _16B_reduction_loop: |
| 279 | movdqa %xmm7, %xmm8 |
| 280 | pclmulqdq $0x11, %xmm10, %xmm7 |
| 281 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 282 | pxor %xmm8, %xmm7 |
| 283 | movdqu (arg2), %xmm0 |
| 284 | pshufb %xmm11, %xmm0 |
| 285 | pxor %xmm0 , %xmm7 |
| 286 | add $16, arg2 |
| 287 | sub $16, arg3 |
| 288 | # instead of a cmp instruction, we utilize the flags with the |
| 289 | # jge instruction equivalent of: cmp arg3, 16-16 |
| 290 | # check if there is any more 16B in the buffer to be able to fold |
| 291 | jge _16B_reduction_loop |
| 292 | |
| 293 | #now we have 16+z bytes left to reduce, where 0<= z < 16. |
| 294 | #first, we reduce the data in the xmm7 register |
| 295 | |
| 296 | |
| 297 | _final_reduction_for_128: |
| 298 | # check if any more data to fold. If not, compute the CRC of |
| 299 | # the final 128 bits. The add undoes the extra 16 subtracted from the counter, so arg3 is the number of tail bytes |
| 300 | add $16, arg3 |
| 301 | je _128_done |
| 302 | |
| 303 | # here we are getting data that is less than 16 bytes. |
| 304 | # since we know that there was data before the pointer, we can |
| 305 | # offset the input pointer before the actual point, to receive |
| 306 | # exactly 16 bytes. after that the registers need to be adjusted. (_less_than_32 also jumps here after loading its first 16 bytes) |
| 307 | _get_last_two_xmms: |
| 308 | movdqa %xmm7, %xmm2 |
| 309 | |
| 310 | movdqu -16(arg2, arg3), %xmm1 |
| 311 | pshufb %xmm11, %xmm1 |
| 312 | |
| 313 | # get rid of the extra data that was loaded before |
| 314 | # load the shift constant: a 16-byte pshufb control read from pshufb_shf_table + 16 - arg3 |
| 315 | lea pshufb_shf_table+16(%rip), %rax |
| 316 | sub arg3, %rax |
| 317 | movdqu (%rax), %xmm0 |
| 318 | |
| 319 | # shift xmm2 to the left by arg3 bytes |
| 320 | pshufb %xmm0, %xmm2 |
| 321 | |
| 322 | # shift xmm7 to the right by 16-arg3 bytes (xor with mask1 flips bit 7 of every control byte) |
| 323 | pxor mask1(%rip), %xmm0 |
| 324 | pshufb %xmm0, %xmm7 |
| 325 | pblendvb %xmm2, %xmm1 #xmm0 is implicit (it supplies the per-byte blend mask) |
| 326 | |
| 327 | # fold 16 Bytes |
| 328 | movdqa %xmm1, %xmm2 |
| 329 | movdqa %xmm7, %xmm8 |
| 330 | pclmulqdq $0x11, %xmm10, %xmm7 |
| 331 | pclmulqdq $0x0 , %xmm10, %xmm8 |
| 332 | pxor %xmm8, %xmm7 |
| 333 | pxor %xmm2, %xmm7 |
| 334 | |
| 335 | _128_done: |
| 336 | # compute crc of a 128-bit value held in xmm7 |
| 337 | movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 |
| 338 | movdqa %xmm7, %xmm0 |
| 339 | |
| 340 | #64b fold |
| 341 | pclmulqdq $0x1, %xmm10, %xmm7 |
| 342 | pslldq $8 , %xmm0 |
| 343 | pxor %xmm0, %xmm7 |
| 344 | |
| 345 | #32b fold |
| 346 | movdqa %xmm7, %xmm0 |
| 347 | |
| 348 | pand mask2(%rip), %xmm0 |
| 349 | |
| 350 | psrldq $12, %xmm7 |
| 351 | pclmulqdq $0x10, %xmm10, %xmm7 |
| 352 | pxor %xmm0, %xmm7 |
| 353 | |
| 354 | #barrett reduction (the 1-, 2- and 3-byte paths jump directly to _barrett with their value in xmm7) |
| 355 | _barrett: |
| 356 | movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 |
| 357 | movdqa %xmm7, %xmm0 |
| 358 | pclmulqdq $0x01, %xmm10, %xmm7 |
| 359 | pslldq $4, %xmm7 |
| 360 | pclmulqdq $0x11, %xmm10, %xmm7 |
| 361 | |
| 362 | pslldq $4, %xmm7 |
| 363 | pxor %xmm0, %xmm7 |
| 364 | pextrd $1, %xmm7, %eax |
| 365 | |
| 366 | _cleanup: |
| 367 | # scale the result back to 16 bits, then restore the caller's rsp that was saved in rcx at entry |
| 368 | shr $16, %eax |
| 369 | mov %rcx, %rsp |
| 370 | ret |
| 371 | |
| 372 | ######################################################################## |
| 373 | |
| 374 | .align 16 |
| 375 | _less_than_128: |
| 376 | |
| 377 | # reached from the entry code when len < 256 (the label name says 128). check if there is enough buffer to be able to fold 16B at a time |
| 378 | cmp $32, arg3 |
| 379 | jl _less_than_32 |
| 380 | movdqa SHUF_MASK(%rip), %xmm11 |
| 381 | |
| 382 | # now if there is, load the constants |
| 383 | movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 |
| 384 | |
| 385 | movd arg1_low32, %xmm0 # get the initial crc value |
| 386 | pslldq $12, %xmm0 # align it to its correct place |
| 387 | movdqu (arg2), %xmm7 # load the plaintext |
| 388 | pshufb %xmm11, %xmm7 # byte-reflect the plaintext |
| 389 | pxor %xmm0, %xmm7 |
| 390 | |
| 391 | |
| 392 | # update the buffer pointer |
| 393 | add $16, arg2 |
| 394 | |
| 395 | # update the counter. subtract 32 instead of 16 to save one |
| 396 | # instruction from the loop (16 for the block just loaded plus the 16-byte bias the loop's sub/jge test expects) |
| 397 | sub $32, arg3 |
| 398 | |
| 399 | jmp _16B_reduction_loop |
| 400 | |
| 401 | |
| 402 | .align 16 |
| 403 | _less_than_32: |
| 404 | # mov initial crc to the return value. this is necessary for |
| 405 | # zero-length buffers. (eax holds init_crc << 16 here; _cleanup shifts it back down by 16) |
| 406 | mov arg1_low32, %eax |
| 407 | test arg3, arg3 |
| 408 | je _cleanup |
| 409 | |
| 410 | movdqa SHUF_MASK(%rip), %xmm11 |
| 411 | |
| 412 | movd arg1_low32, %xmm0 # get the initial crc value |
| 413 | pslldq $12, %xmm0 # align it to its correct place |
| 414 | |
| 415 | cmp $16, arg3 |
| 416 | je _exact_16_left |
| 417 | jl _less_than_16_left |
| 418 | |
| 419 | movdqu (arg2), %xmm7 # load the plaintext (16 < len < 32 on this path) |
| 420 | pshufb %xmm11, %xmm7 # byte-reflect the plaintext |
| 421 | pxor %xmm0 , %xmm7 # xor the initial crc value |
| 422 | add $16, arg2 |
| 423 | sub $16, arg3 |
| 424 | movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 |
| 425 | jmp _get_last_two_xmms |
| 426 | |
| 427 | |
| 428 | .align 16 |
| 429 | _less_than_16_left: |
| 430 | # use stack space to load data less than 16 bytes, zero-out |
| 431 | # the 16B in memory first. r11 is the write cursor into that 16-byte stack slot |
| 432 | |
| 433 | pxor %xmm1, %xmm1 |
| 434 | mov %rsp, %r11 |
| 435 | movdqa %xmm1, (%r11) |
| 436 | |
| 437 | cmp $4, arg3 |
| 438 | jl _only_less_than_4 |
| 439 | |
| 440 | # backup the counter value (arg3 is consumed by the copy below; r9 later indexes pshufb_shf_table) |
| 441 | mov arg3, %r9 |
| 442 | cmp $8, arg3 |
| 443 | jl _less_than_8_left |
| 444 | |
| 445 | # load 8 Bytes |
| 446 | mov (arg2), %rax |
| 447 | mov %rax, (%r11) |
| 448 | add $8, %r11 |
| 449 | sub $8, arg3 |
| 450 | add $8, arg2 |
| 451 | _less_than_8_left: |
| 452 | |
| 453 | cmp $4, arg3 |
| 454 | jl _less_than_4_left |
| 455 | |
| 456 | # load 4 Bytes |
| 457 | mov (arg2), %eax |
| 458 | mov %eax, (%r11) |
| 459 | add $4, %r11 |
| 460 | sub $4, arg3 |
| 461 | add $4, arg2 |
| 462 | _less_than_4_left: |
| 463 | |
| 464 | cmp $2, arg3 |
| 465 | jl _less_than_2_left |
| 466 | |
| 467 | # load 2 Bytes |
| 468 | mov (arg2), %ax |
| 469 | mov %ax, (%r11) |
| 470 | add $2, %r11 |
| 471 | sub $2, arg3 |
| 472 | add $2, arg2 |
| 473 | _less_than_2_left: |
| 474 | cmp $1, arg3 |
| 475 | jl _zero_left |
| 476 | |
| 477 | # load 1 Byte |
| 478 | mov (arg2), %al |
| 479 | mov %al, (%r11) |
| 480 | _zero_left: |
| 481 | movdqa (%rsp), %xmm7 |
| 482 | pshufb %xmm11, %xmm7 |
| 483 | pxor %xmm0 , %xmm7 # xor the initial crc value |
| 484 | |
| 485 | # read the 16-byte pshufb control at pshufb_shf_table + 16 - r9 (r9 = saved length), flip bit 7 of each byte with mask1, and shuffle xmm7 with it |
| 486 | lea pshufb_shf_table+16(%rip), %rax |
| 487 | sub %r9, %rax |
| 488 | movdqu (%rax), %xmm0 |
| 489 | pxor mask1(%rip), %xmm0 |
| 490 | |
| 491 | pshufb %xmm0, %xmm7 |
| 492 | jmp _128_done |
| 493 | |
| 494 | .align 16 |
| 495 | _exact_16_left: |
| 496 | movdqu (arg2), %xmm7 |
| 497 | pshufb %xmm11, %xmm7 |
| 498 | pxor %xmm0 , %xmm7 # xor the initial crc value; exactly one 16-byte block, so no folding is needed before _128_done |
| 499 | |
| 500 | jmp _128_done |
| 501 | |
| 502 | _only_less_than_4: |
| 503 | cmp $3, arg3 |
| 504 | jl _only_less_than_3 |
| 505 | |
| 506 | # load 3 Bytes, one at a time, into the zeroed 16-byte stack slot addressed by r11 |
| 507 | mov (arg2), %al |
| 508 | mov %al, (%r11) |
| 509 | |
| 510 | mov 1(arg2), %al |
| 511 | mov %al, 1(%r11) |
| 512 | |
| 513 | mov 2(arg2), %al |
| 514 | mov %al, 2(%r11) |
| 515 | |
| 516 | movdqa (%rsp), %xmm7 |
| 517 | pshufb %xmm11, %xmm7 |
| 518 | pxor %xmm0 , %xmm7 # xor the initial crc value |
| 519 | |
| 520 | psrldq $5, %xmm7 |
| 521 | |
| 522 | jmp _barrett |
| 523 | _only_less_than_3: |
| 524 | cmp $2, arg3 |
| 525 | jl _only_less_than_2 |
| 526 | |
| 527 | # load 2 Bytes (same scheme as the 3-byte case; the byte shift below is 6 instead of 5) |
| 528 | mov (arg2), %al |
| 529 | mov %al, (%r11) |
| 530 | |
| 531 | mov 1(arg2), %al |
| 532 | mov %al, 1(%r11) |
| 533 | |
| 534 | movdqa (%rsp), %xmm7 |
| 535 | pshufb %xmm11, %xmm7 |
| 536 | pxor %xmm0 , %xmm7 # xor the initial crc value |
| 537 | |
| 538 | psrldq $6, %xmm7 |
| 539 | |
| 540 | jmp _barrett |
| 541 | _only_less_than_2: |
| 542 | |
| 543 | # load 1 Byte (len == 1 here: zero length was already handled in _less_than_32; the byte shift below is 7) |
| 544 | mov (arg2), %al |
| 545 | mov %al, (%r11) |
| 546 | |
| 547 | movdqa (%rsp), %xmm7 |
| 548 | pshufb %xmm11, %xmm7 |
| 549 | pxor %xmm0 , %xmm7 # xor the initial crc value |
| 550 | |
| 551 | psrldq $7, %xmm7 |
| 552 | |
| 553 | jmp _barrett |
| 554 | |
| 555 | ENDPROC(crc_t10dif_pcl) |
| 556 | |
| 557 | .section .rodata, "a", @progbits |
| 558 | |
| 559 | # precomputed constants; kept in a read-only section because the code only ever loads them (movdqa / pxor / movdqu) and never stores to them |
| 560 | # these constants are precomputed from the poly: |
| 561 | # 0x8bb70000 (0x8bb7 scaled to 32 bits) |
| 562 | .align 16 |
| 563 | # Q = 0x18BB70000 |
| 564 | # rk1 = 2^(32*3) mod Q << 32 |
| 565 | # rk2 = 2^(32*5) mod Q << 32 |
| 566 | # rk3 = 2^(32*15) mod Q << 32 |
| 567 | # rk4 = 2^(32*17) mod Q << 32 |
| 568 | # rk5 = 2^(32*3) mod Q << 32 |
| 569 | # rk6 = 2^(32*2) mod Q << 32 |
| 570 | # rk7 = floor(2^64/Q) |
| 571 | # rk8 = Q |
| 572 | # each odd/even pair (rk1/rk2, rk3/rk4, ...) is fetched with a single 16-byte movdqa, so every odd-numbered label must stay 16-byte aligned and the pairs must stay adjacent |
| 572 | rk1: |
| 573 | .quad 0x2d56000000000000 |
| 574 | rk2: |
| 575 | .quad 0x06df000000000000 |
| 576 | rk3: |
| 577 | .quad 0x9d9d000000000000 |
| 578 | rk4: |
| 579 | .quad 0x7cf5000000000000 |
| 580 | rk5: |
| 581 | .quad 0x2d56000000000000 |
| 582 | rk6: |
| 583 | .quad 0x1368000000000000 |
| 584 | rk7: |
| 585 | .quad 0x00000001f65a57f8 |
| 586 | rk8: |
| 587 | .quad 0x000000018bb70000 |
| 588 | |
| 589 | rk9: |
| 590 | .quad 0xceae000000000000 |
| 591 | rk10: |
| 592 | .quad 0xbfd6000000000000 |
| 593 | rk11: |
| 594 | .quad 0x1e16000000000000 |
| 595 | rk12: |
| 596 | .quad 0x713c000000000000 |
| 597 | rk13: |
| 598 | .quad 0xf7f9000000000000 |
| 599 | rk14: |
| 600 | .quad 0x80a6000000000000 |
| 601 | rk15: |
| 602 | .quad 0x044c000000000000 |
| 603 | rk16: |
| 604 | .quad 0xe658000000000000 |
| 605 | rk17: |
| 606 | .quad 0xad18000000000000 |
| 607 | rk18: |
| 608 | .quad 0xa497000000000000 |
| 609 | rk19: |
| 610 | .quad 0x6ee3000000000000 |
| 611 | rk20: |
| 612 | .quad 0xe7b5000000000000 |
| 613 | |
| 614 | |
| 615 | |
| 616 | mask1: |
| 617 | .octa 0x80808080808080808080808080808080 |
| 618 | mask2: |
| 619 | .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF |
| 620 | |
| 621 | SHUF_MASK: |
| 622 | .octa 0x000102030405060708090A0B0C0D0E0F |
| 623 | |
| 624 | pshufb_shf_table: |
| 625 | # use these values for shift constants for the pshufb instruction |
| 626 | # different alignments result in values as shown (a 16-byte window read at pshufb_shf_table + 16 - n): |
| 627 | # DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 |
| 628 | # DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2 |
| 629 | # DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3 |
| 630 | # DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 |
| 631 | # DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 |
| 632 | # DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 |
| 633 | # DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 |
| 634 | # DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 |
| 635 | # DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 |
| 636 | # DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 |
| 637 | # DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 |
| 638 | # DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 |
| 639 | # DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 |
| 640 | # DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 |
| 641 | # DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 |
| 642 | .octa 0x8f8e8d8c8b8a89888786858483828100 |
| 643 | .octa 0x000e0d0c0b0a09080706050403020100 |