/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
| 11 | |
#include <linux/linkage.h>
| 13 | |
/*
 * 16-byte constants used by the routines below. They are only ever read,
 * so they live in read-only, mergeable constant sections instead of .data.
 */

/* pshufb mask: rotate every 32-bit lane left by 8 bits */
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003

/* pshufb mask: rotate every 32-bit lane left by 16 bits */
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302

/* per-lane block counter increments 0, 1, 2, 3 (lane 0 is the low dword) */
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text
| 22 | |
ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state matrix
	# in four SSE registers. It performs matrix operation on four words in
	# parallel, but requires shuffling to rearrange the words after each
	# round. 8/16-bit word rotation is done with the slightly better
	# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
	# traditional shift+OR.
	#
	# Register use:
	#   %xmm0-3   working rows x0..3 of the state matrix
	#   %xmm4     ROT8 shuffle mask (input scratch after the rounds)
	#   %xmm5     ROT16 shuffle mask (input scratch after the rounds)
	#   %xmm6-7   temporaries for the shift+OR rotations
	#   %xmm8-11  copy of the initial rows s0..3 for the final addition
	#   %ecx      double-round counter
	#
	# The state at %rdi is only read (64 bytes) and is loaded with
	# unaligned moves, so it needs no particular alignment, same as the
	# input and output buffers. No stack memory is used.

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3
	# keep s0..3 for the feed-forward addition after the rounds
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

	# ten double rounds (column round + diagonal round each)
	mov		$10,%ecx

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# rotate rows 1..3 so the diagonals line up as columns
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# undo the row rotation applied before the diagonal round
	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	dec		%ecx
	jnz		.Ldoubleround

	# The shuffle masks are no longer needed, so %xmm4-7 are reused to
	# hold the input rows below.

	# o0 = i0 ^ (x0 + s0)
	movdqu		0x00(%rdx),%xmm4
	paddd		%xmm8,%xmm0
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu		0x10(%rdx),%xmm5
	paddd		%xmm9,%xmm1
	pxor		%xmm5,%xmm1
	movdqu		%xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu		0x20(%rdx),%xmm6
	paddd		%xmm10,%xmm2
	pxor		%xmm6,%xmm2
	movdqu		%xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu		0x30(%rdx),%xmm7
	paddd		%xmm11,%xmm3
	pxor		%xmm7,%xmm3
	movdqu		%xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)
Martin Willi | 274f938 | 2015-07-16 19:14:02 +0200 | [diff] [blame] | 144 | |
ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For final XORing step
	# we transpose the matrix by interleaving 32- and then 64-bit words,
	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
	# done with the slightly better performing SSSE3 byte shuffling,
	# 7/12-bit word rotation uses traditional shift+OR.
	#
	# Layout during the rounds: lane n of every vector belongs to block n.
	#   0x00-0x3f(%rsp)  x0..x3 (spilled, one 16-byte slot per word)
	#   %xmm4-15         x4..x15
	#   %xmm0            scratch
	#   %xmm1            CTRINC, kept until it is re-added after the rounds
	#   %xmm2            ROT8 shuffle mask
	#   %xmm3            ROT16 shuffle mask
	#   %ecx             double-round counter
	#   %r11             caller stack pointer, restored before returning
	#
	# The state at %rdi is only read, using 8-byte loads. Input and output
	# (256 bytes each) are accessed with unaligned moves.

	# Save the caller stack pointer, reserve 0x80 bytes and round the
	# frame down to a 64-byte boundary so the movdqa spills are aligned.
	# Only the first 0x40 bytes of the frame are used.
	mov		%rsp,%r11
	sub		$0x80,%rsp
	and		$~63,%rsp

	# x0..15[0-3] = s0..3[0..3]
	# (each movq fetches two state words, pshufd broadcasts one of them)
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# ten double rounds (column round + diagonal round each)
	mov		$10,%ecx

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# Diagonal round: same operations, applied to the word groups
	# (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	dec		%ecx
	jnz		.Ldoubleround4

	# Feed-forward: add the original state words back in. The shuffle
	# masks in %xmm2/%xmm3 are dead now, so both registers are reused to
	# broadcast the state words again; %xmm1 still holds CTRINC.

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	# (the per-block counter offsets belong to the initial state as well)
	paddd		%xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
	#
	# After the transpose every vector holds four consecutive words of a
	# single block. Within each group of four vectors the order is
	# block 0, block 2, block 1, block 3, which is why the offsets below
	# step through +0x00, +0x80, +0x40, +0xc0.
	movdqa		0x00(%rsp),%xmm0
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	movdqa		0x10(%rsp),%xmm0
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)
	movdqa		0x20(%rsp),%xmm0
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)
	movdqa		0x30(%rsp),%xmm0
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm4
	movdqu		%xmm4,0x10(%rsi)
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm5
	movdqu		%xmm5,0x90(%rsi)
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm6
	movdqu		%xmm6,0x50(%rsi)
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm7
	movdqu		%xmm7,0xd0(%rsi)
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm8
	movdqu		%xmm8,0x20(%rsi)
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm9
	movdqu		%xmm9,0xa0(%rsi)
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm10
	movdqu		%xmm10,0x60(%rsi)
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm11
	movdqu		%xmm11,0xe0(%rsi)
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm12
	movdqu		%xmm12,0x30(%rsi)
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm13
	movdqu		%xmm13,0xb0(%rsi)
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm14
	movdqu		%xmm14,0x70(%rsi)
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm15
	movdqu		%xmm15,0xf0(%rsi)

	# drop the aligned scratch frame
	mov		%r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)