/*
 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
 *
 * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
 * downloaded from:
 * http://download.intel.com/design/intarch/papers/323405.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
 *
 * Authors:
 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
 *	James Guilford <james.guilford@intel.com>
 *	David Cote <david.m.cote@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
| 44 | |
| 45 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction |
| 46 | |
/*
 * LABEL prefix n
 *
 * Emits the label "<prefix><n>:".  The callers in this file switch on
 * .altmacro first and pass n as %expr, so the numeric value of the
 * expression becomes part of the label name (crc_128, less_than_64, ...).
 */
.macro LABEL prefix n
\prefix\n\():
.endm
| 50 | |
/*
 * JMPTBL_ENTRY i
 *
 * Emits one 16-bit jump-table slot holding the byte distance from
 * crc_array to the label crc_<i>.
 */
.macro JMPTBL_ENTRY i
	.word	crc_\i - crc_array
.endm
| 54 | |
/*
 * JNC_LESS_THAN j
 *
 * Branches to less_than_<j> when the carry flag is clear.  It is invoked
 * right after a one-bit left shift of the length, so carry clear means
 * the length bit that was just shifted out is zero.
 */
.macro JNC_LESS_THAN j
	jnc	less_than_\j
.endm
| 58 | |
## Threshold below which a buffer is considered "small" and is routed to
## the "by-1" code instead of the three-way block code.  The "by-1" code
## keeps the remaining length in 8 bits, so it handles at most 255 bytes
## and SMALL_SIZE can be no larger than 255.

#define SMALL_SIZE 200

## Build-time guard for the limit described above.
.if (SMALL_SIZE > 255)
.error "SMALL_SIZE must be < 256"
.endif
| 68 | |
/*
 * unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
 *
 * Runs the crc32 instruction over the len bytes at buffer, starting from
 * crc_init, and returns the updated value in %eax.
 *
 * Arguments are taken in prototype order:
 *	%rdi = buffer, %esi = len, %edx = crc_init
 *
 * Outline:
 *   1) Consume leading bytes until the pointer is 8-byte aligned.
 *   2) While at least SMALL_SIZE bytes remain, split up to 128*24 bytes
 *      into three equal streams and run three independent crc32q chains
 *      (crc_init, crc1, crc2) through the unrolled crc_array.
 *   3) Merge the three partial values with pclmulqdq and a K_table entry.
 *   4) Finish the tail by peeling 128/64/32/16/8/4/2/1-byte pieces,
 *      testing one bit of the remaining length at a time.
 *
 * %rbx, %rdi and %rsi are saved and restored.  %rax, %rcx, %rdx,
 * %r8-%r11, %xmm0-%xmm2 and the flags are overwritten.
 */

.global crc_pcl
crc_pcl:
#define    bufp			%rdi
#define    bufp_dw		%edi
#define    bufp_w		%di
#define    bufp_b		%dil
#define    bufptmp		%rcx
#define    block_0		%rcx
#define    block_1		%rdx
#define    block_2		%r11
#define    len			%rsi
#define    len_dw		%esi
#define    len_w		%si
#define    len_b		%sil
#define    crc_init_arg		%rdx
#define    crc_init_arg_dw	%edx
#define    tmp			%rbx
#define    crc_init		%r8
#define    crc_init_dw		%r8d
#define    crc1			%r9
#define    crc2			%r10

	pushq	%rbx
	pushq	%rdi
	pushq	%rsi

	## len and crc_init are 32-bit C arguments, so the upper halves of
	## their 64-bit registers are not guaranteed to be zero.  A 32-bit
	## mov zero-extends, which makes the 64-bit length arithmetic below
	## well defined.
	mov	len_dw, len_dw

	## Move crc_init out of rdx; rdx is reused below as block_1 and is
	## overwritten by mul.
	mov	crc_init_arg_dw, crc_init_dw

	################################################################
	## 1) ALIGN:
	################################################################

	mov	bufp, bufptmp		# bufptmp = buf
	neg	bufp
	and	$7, bufp		# bufp = number of bytes needed to
					# reach 8-byte alignment
	je	proc_block		# Skip if aligned

	## If len is less than 8 and the pointer is unaligned, jump to
	## special code to avoid reading beyond the end of the buffer
	cmp	$8, len
	jae	do_align
	# less_than_8 expects length in upper 3 bits of len_dw
	# less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30]
	shl	$32-3+1, len_dw
	jmp	less_than_8_post_shl1

do_align:
	#### Calculate CRC of unaligned bytes of the buffer (if any)
	movq	(bufptmp), tmp		# load a quadword from the buffer
	add	bufp, bufptmp		# align buffer pointer for quadword
					# processing
	sub	bufp, len		# update buffer length
align_loop:
	crc32b	%bl, crc_init_dw	# compute crc32 of 1-byte
	shr	$8, tmp			# get next byte
	dec	bufp
	jne	align_loop

proc_block:

	################################################################
	## 2) PROCESS BLOCKS:
	################################################################

	## compute num of bytes to be processed
	movq	len, tmp		# save num bytes in tmp

	cmpq	$128*24, len
	jae	full_block

continue_block:
	cmpq	$SMALL_SIZE, len
	jb	small

	## len < 128*24
	movq	$2731, %rax		# 2731 = ceil(2^16 / 24)
	mul	len_dw			# edx:eax = 2731 * len (rdx is free here)
	shrq	$16, %rax

	## eax contains floor(bytes / 24) = num 24-byte chunks to do

	## process rax 24-byte chunks (128 >= rax >= 0)

	## compute end address of each block
	## block 0 (base addr + RAX * 8)
	## block 1 (base addr + RAX * 16)
	## block 2 (base addr + RAX * 24)
	lea	(bufptmp, %rax, 8), block_0
	lea	(block_0, %rax, 8), block_1
	lea	(block_1, %rax, 8), block_2

	xor	crc1, crc1
	xor	crc2, crc2

	## branch into array: jump_table[rax] is the 16-bit distance from
	## crc_array to crc_<rax>
	lea	jump_table(%rip), bufp
	movzxw	(bufp, %rax, 2), len
	offset=crc_array-jump_table
	lea	offset(bufp, len, 1), bufp
	jmp	*bufp

	################################################################
	## 2a) PROCESS FULL BLOCKS:
	################################################################
full_block:
	movq	$128,%rax
	lea	128*8*2(block_0), block_1
	lea	128*8*3(block_0), block_2
	add	$128*8*1, block_0

	xor	crc1,crc1
	xor	crc2,crc2

	# Fall through into top of crc array (crc_128)

	################################################################
	## 3) CRC Array:
	################################################################

	## Unrolled body: crc_<i> folds the quadword that lies i*8 bytes
	## before the end of each of the three streams, then falls through
	## to crc_<i-1>.
crc_array:
	i=128
.rept 128-1
.altmacro
LABEL crc_ %i
.noaltmacro
	crc32q	-i*8(block_0), crc_init
	crc32q	-i*8(block_1), crc1
	crc32q	-i*8(block_2), crc2
	i=(i-1)
.endr

	## Last step (i = 1 here)
.altmacro
LABEL crc_ %i
.noaltmacro
	crc32q	-i*8(block_0), crc_init
	crc32q	-i*8(block_1), crc1
# SKIP  crc32  -i*8(block_2), crc2 ; Do not do this one yet; the last
#	quadword of stream 2 is folded in during the combine step below

	mov	block_2, block_0	# bufptmp = end of the processed data

	################################################################
	## 4) Combine three results:
	################################################################

	lea	(K_table-16)(%rip), bufp	# first entry is for idx 1
	shlq	$3, %rax			# rax *= 8
	subq	%rax, tmp			# tmp -= rax*8
	shlq	$1, %rax
	subq	%rax, tmp			# tmp -= rax*16
						# (total tmp -= rax*24)
	addq	%rax, bufp			# 16 bytes per K_table entry

	movdqa	(bufp), %xmm0			# 2 consts: K1:K2

	movq	crc_init, %xmm1			# CRC for block 1
	pclmulqdq $0x00,%xmm0,%xmm1		# Multiply by K2

	movq	crc1, %xmm2			# CRC for block 2
	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1

	pxor	%xmm2,%xmm1
	movq	%xmm1, %rax
	xor	-i*8(block_2), %rax		# last quadword of stream 2
	mov	crc2, crc_init
	crc32	%rax, crc_init

	################################################################
	## 5) Check for end:
	################################################################

LABEL crc_ 0
	mov	tmp, len
	cmp	$128*24, tmp
	jae	full_block
	cmp	$24, tmp
	jae	continue_block

less_than_24:
	shl	$32-4, len_dw			# less_than_16 expects length
						# in upper 4 bits of len_dw
	jnc	less_than_16
	crc32q	(bufptmp), crc_init
	crc32q	8(bufptmp), crc_init
	jz	do_return			# ZF still comes from the shl
	add	$16, bufptmp
	# len is less than 8 if we got here
	# less_than_8 expects length in upper 3 bits of len_dw
	# less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30]
	shl	$2, len_dw
	jmp	less_than_8_post_shl1

	#######################################################################
	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
	#######################################################################
small:
	shl	$32-8, len_dw		# Prepare len_dw for less_than_256
	j=256
.rept 5					# j = {256, 128, 64, 32, 16}
	## less_than_j: Length should be in upper lg(j) bits of len_dw
.altmacro
LABEL less_than_ %j
	j=(j/2)
	shl	$1, len_dw		# Get next MSB
	JNC_LESS_THAN %j
.noaltmacro
	i=0
.rept (j/8)
	crc32q	i(bufptmp), crc_init	# Compute crc32 of 8-byte data
	i=i+8
.endr
	jz	do_return		# Return if remaining length is zero
	add	$j, bufptmp		# Advance buf
.endr

less_than_8:				# Length should be stored in
					# upper 3 bits of len_dw
	shl	$1, len_dw
less_than_8_post_shl1:
	jnc	less_than_4
	crc32l	(bufptmp), crc_init_dw	# CRC of 4 bytes
	jz	do_return		# return if remaining data is zero
	add	$4, bufptmp
less_than_4:				# Length should be stored in
					# upper 2 bits of len_dw
	shl	$1, len_dw
	jnc	less_than_2
	crc32w	(bufptmp), crc_init_dw	# CRC of 2 bytes
	jz	do_return		# return if remaining data is zero
	add	$2, bufptmp
less_than_2:				# Length should be stored in the MSB
					# of len_dw
	shl	$1, len_dw
	jnc	less_than_1
	crc32b	(bufptmp), crc_init_dw	# CRC of 1 byte
less_than_1:				# Length should be zero
do_return:
	movq	crc_init, %rax
	popq	%rsi
	popq	%rdi
	popq	%rbx
	ret
| 313 | |
| 314 | ################################################################ |
| 315 | ## jump table Table is 129 entries x 2 bytes each |
| 316 | ################################################################ |
| 317 | .align 4 |
| 318 | jump_table: |
| 319 | i=0 |
| 320 | .rept 129 |
| 321 | .altmacro |
| 322 | JMPTBL_ENTRY %i |
| 323 | .noaltmacro |
| 324 | i=i+1 |
| 325 | .endr |
| 326 | ################################################################ |
| 327 | ## PCLMULQDQ tables |
| 328 | ## Table is 128 entries x 2 quad words each |
| 329 | ################################################################ |
| 330 | .data |
| 331 | .align 64 |
| 332 | K_table: |
| 333 | .quad 0x14cd00bd6,0x105ec76f0 |
| 334 | .quad 0x0ba4fc28e,0x14cd00bd6 |
| 335 | .quad 0x1d82c63da,0x0f20c0dfe |
| 336 | .quad 0x09e4addf8,0x0ba4fc28e |
| 337 | .quad 0x039d3b296,0x1384aa63a |
| 338 | .quad 0x102f9b8a2,0x1d82c63da |
| 339 | .quad 0x14237f5e6,0x01c291d04 |
| 340 | .quad 0x00d3b6092,0x09e4addf8 |
| 341 | .quad 0x0c96cfdc0,0x0740eef02 |
| 342 | .quad 0x18266e456,0x039d3b296 |
| 343 | .quad 0x0daece73e,0x0083a6eec |
| 344 | .quad 0x0ab7aff2a,0x102f9b8a2 |
| 345 | .quad 0x1248ea574,0x1c1733996 |
| 346 | .quad 0x083348832,0x14237f5e6 |
| 347 | .quad 0x12c743124,0x02ad91c30 |
| 348 | .quad 0x0b9e02b86,0x00d3b6092 |
| 349 | .quad 0x018b33a4e,0x06992cea2 |
| 350 | .quad 0x1b331e26a,0x0c96cfdc0 |
| 351 | .quad 0x17d35ba46,0x07e908048 |
| 352 | .quad 0x1bf2e8b8a,0x18266e456 |
| 353 | .quad 0x1a3e0968a,0x11ed1f9d8 |
| 354 | .quad 0x0ce7f39f4,0x0daece73e |
| 355 | .quad 0x061d82e56,0x0f1d0f55e |
| 356 | .quad 0x0d270f1a2,0x0ab7aff2a |
| 357 | .quad 0x1c3f5f66c,0x0a87ab8a8 |
| 358 | .quad 0x12ed0daac,0x1248ea574 |
| 359 | .quad 0x065863b64,0x08462d800 |
| 360 | .quad 0x11eef4f8e,0x083348832 |
| 361 | .quad 0x1ee54f54c,0x071d111a8 |
| 362 | .quad 0x0b3e32c28,0x12c743124 |
| 363 | .quad 0x0064f7f26,0x0ffd852c6 |
| 364 | .quad 0x0dd7e3b0c,0x0b9e02b86 |
| 365 | .quad 0x0f285651c,0x0dcb17aa4 |
| 366 | .quad 0x010746f3c,0x018b33a4e |
| 367 | .quad 0x1c24afea4,0x0f37c5aee |
| 368 | .quad 0x0271d9844,0x1b331e26a |
| 369 | .quad 0x08e766a0c,0x06051d5a2 |
| 370 | .quad 0x093a5f730,0x17d35ba46 |
| 371 | .quad 0x06cb08e5c,0x11d5ca20e |
| 372 | .quad 0x06b749fb2,0x1bf2e8b8a |
| 373 | .quad 0x1167f94f2,0x021f3d99c |
| 374 | .quad 0x0cec3662e,0x1a3e0968a |
| 375 | .quad 0x19329634a,0x08f158014 |
| 376 | .quad 0x0e6fc4e6a,0x0ce7f39f4 |
| 377 | .quad 0x08227bb8a,0x1a5e82106 |
| 378 | .quad 0x0b0cd4768,0x061d82e56 |
| 379 | .quad 0x13c2b89c4,0x188815ab2 |
| 380 | .quad 0x0d7a4825c,0x0d270f1a2 |
| 381 | .quad 0x10f5ff2ba,0x105405f3e |
| 382 | .quad 0x00167d312,0x1c3f5f66c |
| 383 | .quad 0x0f6076544,0x0e9adf796 |
| 384 | .quad 0x026f6a60a,0x12ed0daac |
| 385 | .quad 0x1a2adb74e,0x096638b34 |
| 386 | .quad 0x19d34af3a,0x065863b64 |
| 387 | .quad 0x049c3cc9c,0x1e50585a0 |
| 388 | .quad 0x068bce87a,0x11eef4f8e |
| 389 | .quad 0x1524fa6c6,0x19f1c69dc |
| 390 | .quad 0x16cba8aca,0x1ee54f54c |
| 391 | .quad 0x042d98888,0x12913343e |
| 392 | .quad 0x1329d9f7e,0x0b3e32c28 |
| 393 | .quad 0x1b1c69528,0x088f25a3a |
| 394 | .quad 0x02178513a,0x0064f7f26 |
| 395 | .quad 0x0e0ac139e,0x04e36f0b0 |
| 396 | .quad 0x0170076fa,0x0dd7e3b0c |
| 397 | .quad 0x141a1a2e2,0x0bd6f81f8 |
| 398 | .quad 0x16ad828b4,0x0f285651c |
| 399 | .quad 0x041d17b64,0x19425cbba |
| 400 | .quad 0x1fae1cc66,0x010746f3c |
| 401 | .quad 0x1a75b4b00,0x18db37e8a |
| 402 | .quad 0x0f872e54c,0x1c24afea4 |
| 403 | .quad 0x01e41e9fc,0x04c144932 |
| 404 | .quad 0x086d8e4d2,0x0271d9844 |
| 405 | .quad 0x160f7af7a,0x052148f02 |
| 406 | .quad 0x05bb8f1bc,0x08e766a0c |
| 407 | .quad 0x0a90fd27a,0x0a3c6f37a |
| 408 | .quad 0x0b3af077a,0x093a5f730 |
| 409 | .quad 0x04984d782,0x1d22c238e |
| 410 | .quad 0x0ca6ef3ac,0x06cb08e5c |
| 411 | .quad 0x0234e0b26,0x063ded06a |
| 412 | .quad 0x1d88abd4a,0x06b749fb2 |
| 413 | .quad 0x04597456a,0x04d56973c |
| 414 | .quad 0x0e9e28eb4,0x1167f94f2 |
| 415 | .quad 0x07b3ff57a,0x19385bf2e |
| 416 | .quad 0x0c9c8b782,0x0cec3662e |
| 417 | .quad 0x13a9cba9e,0x0e417f38a |
| 418 | .quad 0x093e106a4,0x19329634a |
| 419 | .quad 0x167001a9c,0x14e727980 |
| 420 | .quad 0x1ddffc5d4,0x0e6fc4e6a |
| 421 | .quad 0x00df04680,0x0d104b8fc |
| 422 | .quad 0x02342001e,0x08227bb8a |
| 423 | .quad 0x00a2a8d7e,0x05b397730 |
| 424 | .quad 0x168763fa6,0x0b0cd4768 |
| 425 | .quad 0x1ed5a407a,0x0e78eb416 |
| 426 | .quad 0x0d2c3ed1a,0x13c2b89c4 |
| 427 | .quad 0x0995a5724,0x1641378f0 |
| 428 | .quad 0x19b1afbc4,0x0d7a4825c |
| 429 | .quad 0x109ffedc0,0x08d96551c |
| 430 | .quad 0x0f2271e60,0x10f5ff2ba |
| 431 | .quad 0x00b0bf8ca,0x00bf80dd2 |
| 432 | .quad 0x123888b7a,0x00167d312 |
| 433 | .quad 0x1e888f7dc,0x18dcddd1c |
| 434 | .quad 0x002ee03b2,0x0f6076544 |
| 435 | .quad 0x183e8d8fe,0x06a45d2b2 |
| 436 | .quad 0x133d7a042,0x026f6a60a |
| 437 | .quad 0x116b0f50c,0x1dd3e10e8 |
| 438 | .quad 0x05fabe670,0x1a2adb74e |
| 439 | .quad 0x130004488,0x0de87806c |
| 440 | .quad 0x000bcf5f6,0x19d34af3a |
| 441 | .quad 0x18f0c7078,0x014338754 |
| 442 | .quad 0x017f27698,0x049c3cc9c |
| 443 | .quad 0x058ca5f00,0x15e3e77ee |
| 444 | .quad 0x1af900c24,0x068bce87a |
| 445 | .quad 0x0b5cfca28,0x0dd07448e |
| 446 | .quad 0x0ded288f8,0x1524fa6c6 |
| 447 | .quad 0x059f229bc,0x1d8048348 |
| 448 | .quad 0x06d390dec,0x16cba8aca |
| 449 | .quad 0x037170390,0x0a3e3e02c |
| 450 | .quad 0x06353c1cc,0x042d98888 |
| 451 | .quad 0x0c4584f5c,0x0d73c7bea |
| 452 | .quad 0x1f16a3418,0x1329d9f7e |
| 453 | .quad 0x0531377e2,0x185137662 |
| 454 | .quad 0x1d8d9ca7c,0x1b1c69528 |
| 455 | .quad 0x0b25b29f2,0x18a08b5bc |
| 456 | .quad 0x19fb2a8b0,0x02178513a |
| 457 | .quad 0x1a08fe6ac,0x1da758ae0 |
| 458 | .quad 0x045cddf4e,0x0e0ac139e |
| 459 | .quad 0x1a91647f2,0x169cf9eb0 |
| 460 | .quad 0x1a0f717c4,0x0170076fa |