/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000
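
# ROT8 and ROT16 are byte-permutation masks for pshufb: permuting the bytes
# of every 32-bit lane according to these patterns is equivalent to rotating
# each lane left by 8 or 16 bits, respectively. CTRINC holds the dwords
# 0, 1, 2, 3 that the four-block routine adds to the block counters so the
# four parallel states generate keystream for consecutive blocks.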

.text

ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state
	# matrix into four SSE registers. It performs matrix operations on
	# four words in parallel, but requires shuffling to rearrange the
	# words after each round. The 8/16-bit word rotations are done with
	# the slightly better performing SSSE3 byte shuffling; the 7/12-bit
	# word rotations use the traditional shift+OR.
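
	# For reference, each commented step in the loop below implements one
	# line of the RFC 7539 quarter-round, applied to four words at once
	# (all four columns, or all four diagonals, of the state matrix):
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
	#
	# The 16- and 8-bit rotations are done via pshufb with the ROT16/ROT8
	# masks; the 12- and 7-bit rotations are built from pslld/psrld/por,
	# as SSE provides no 32-bit vector rotate instruction.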

	# x0..3 = s0..3
	movdqa 0x00(%rdi),%xmm0
	movdqa 0x10(%rdi),%xmm1
	movdqa 0x20(%rdi),%xmm2
	movdqa 0x30(%rdi),%xmm3
	movdqa %xmm0,%xmm8
	movdqa %xmm1,%xmm9
	movdqa %xmm2,%xmm10
	movdqa %xmm3,%xmm11

	movdqa ROT8(%rip),%xmm4
	movdqa ROT16(%rip),%xmm5

	mov $10,%ecx

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm3,%xmm3
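
	# The pshufd rotations above turn the column arrangement into a
	# diagonal one: the next four quarter-round steps therefore operate
	# on the diagonals of the original state matrix. The inverse shuffles
	# at the end of the loop body restore the column arrangement.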

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm3,%xmm3

	dec %ecx
	jnz .Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	movdqu 0x00(%rdx),%xmm4
	paddd %xmm8,%xmm0
	pxor %xmm4,%xmm0
	movdqu %xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu 0x10(%rdx),%xmm5
	paddd %xmm9,%xmm1
	pxor %xmm5,%xmm1
	movdqu %xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu 0x20(%rdx),%xmm6
	paddd %xmm10,%xmm2
	pxor %xmm6,%xmm2
	movdqu %xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu 0x30(%rdx),%xmm7
	paddd %xmm11,%xmm3
	pxor %xmm7,%xmm3
	movdqu %xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XOR step we transpose the matrix by interleaving 32- and then
	# 64-bit words, which allows us to do the XOR in SSE registers.
	# 8/16-bit word rotation is done with the slightly better performing
	# SSSE3 byte shuffling; 7/12-bit word rotation uses the traditional
	# shift+OR.
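
	# Data layout: each xmm register (or, for x0..x3, its spill slot on
	# the stack) holds the same state word for all four blocks, i.e. lane
	# j of x4 is word 4 of block j's working state. The movq/pshufd pairs
	# below broadcast each 32-bit state word into all four lanes to set
	# this up; only the block counters will differ between the lanes.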

	mov %rsp,%r11
	sub $0x80,%rsp
	and $~63,%rsp
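	# %r11 preserves the caller's stack pointer for the epilogue; the
	# scratch area is over-allocated and then aligned down to 64 bytes so
	# that the 16-byte movdqa spills of x0..x3 land on aligned addresses.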

	# x0..15[0-3] = s0..3[0..3]
	movq 0x00(%rdi),%xmm1
	pshufd $0x00,%xmm1,%xmm0
	pshufd $0x55,%xmm1,%xmm1
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	movq 0x10(%rdi),%xmm5
	pshufd $0x00,%xmm5,%xmm4
	pshufd $0x55,%xmm5,%xmm5
	movq 0x18(%rdi),%xmm7
	pshufd $0x00,%xmm7,%xmm6
	pshufd $0x55,%xmm7,%xmm7
	movq 0x20(%rdi),%xmm9
	pshufd $0x00,%xmm9,%xmm8
	pshufd $0x55,%xmm9,%xmm9
	movq 0x28(%rdi),%xmm11
	pshufd $0x00,%xmm11,%xmm10
	pshufd $0x55,%xmm11,%xmm11
	movq 0x30(%rdi),%xmm13
	pshufd $0x00,%xmm13,%xmm12
	pshufd $0x55,%xmm13,%xmm13
	movq 0x38(%rdi),%xmm15
	pshufd $0x00,%xmm15,%xmm14
	pshufd $0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa %xmm0,0x00(%rsp)
	movdqa %xmm1,0x10(%rsp)
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm3,0x30(%rsp)

	movdqa CTRINC(%rip),%xmm1
	movdqa ROT8(%rip),%xmm2
	movdqa ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12
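	# At this point the four lane-parallel states are identical except
	# for their block counters, so they will produce the keystream for
	# four consecutive blocks.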

	mov $10,%ecx

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4

	dec %ecx
	jnz .Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq 0x00(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x00(%rsp),%xmm2
	movdqa %xmm2,0x00(%rsp)
	paddd 0x10(%rsp),%xmm3
	movdqa %xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x20(%rsp),%xmm2
	movdqa %xmm2,0x20(%rsp)
	paddd 0x30(%rsp),%xmm3
	movdqa %xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq 0x10(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq 0x18(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm6
	paddd %xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq 0x20(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm8
	paddd %xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq 0x28(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm10
	paddd %xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq 0x30(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm12
	paddd %xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq 0x38(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm14
	paddd %xmm3,%xmm15

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12
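
	# The two interleave passes below (32-bit, then 64-bit) perform a 4x4
	# dword transpose of each group of four registers, so that afterwards
	# every register (or spill slot) holds 16 consecutive bytes of one
	# output block rather than one word from each of the four blocks.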
	# interleave 32-bit words in state n, n+1
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x10(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x10(%rsp)
	movdqa 0x20(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpckldq %xmm5,%xmm4
	punpckhdq %xmm5,%xmm0
	movdqa %xmm0,%xmm5
	movdqa %xmm6,%xmm0
	punpckldq %xmm7,%xmm6
	punpckhdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm9,%xmm0
	movdqa %xmm0,%xmm9
	movdqa %xmm10,%xmm0
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpckldq %xmm13,%xmm12
	punpckhdq %xmm13,%xmm0
	movdqa %xmm0,%xmm13
	movdqa %xmm14,%xmm0
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x20(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x20(%rsp)
	movdqa 0x10(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x10(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpcklqdq %xmm6,%xmm4
	punpckhqdq %xmm6,%xmm0
	movdqa %xmm0,%xmm6
	movdqa %xmm5,%xmm0
	punpcklqdq %xmm7,%xmm5
	punpckhqdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpcklqdq %xmm10,%xmm8
	punpckhqdq %xmm10,%xmm0
	movdqa %xmm0,%xmm10
	movdqa %xmm9,%xmm0
	punpcklqdq %xmm11,%xmm9
	punpckhqdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpcklqdq %xmm14,%xmm12
	punpckhqdq %xmm14,%xmm0
	movdqa %xmm0,%xmm14
	movdqa %xmm13,%xmm0
	punpcklqdq %xmm15,%xmm13
	punpckhqdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

	# xor with corresponding input, write to output
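	# After the transpose, stack slots 0x00/0x20/0x10/0x30 hold bytes 0-15
	# of blocks 0/1/2/3, and each further group of four registers covers
	# the next 16-byte chunk of every block. That is why the input/output
	# offsets below step by 0x40 between blocks and by 0x10 within a block.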
	movdqa 0x00(%rsp),%xmm0
	movdqu 0x00(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x00(%rsi)
	movdqa 0x10(%rsp),%xmm0
	movdqu 0x80(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x80(%rsi)
	movdqa 0x20(%rsp),%xmm0
	movdqu 0x40(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x40(%rsi)
	movdqa 0x30(%rsp),%xmm0
	movdqu 0xc0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xc0(%rsi)
	movdqu 0x10(%rdx),%xmm1
	pxor %xmm1,%xmm4
	movdqu %xmm4,0x10(%rsi)
	movdqu 0x90(%rdx),%xmm1
	pxor %xmm1,%xmm5
	movdqu %xmm5,0x90(%rsi)
	movdqu 0x50(%rdx),%xmm1
	pxor %xmm1,%xmm6
	movdqu %xmm6,0x50(%rsi)
	movdqu 0xd0(%rdx),%xmm1
	pxor %xmm1,%xmm7
	movdqu %xmm7,0xd0(%rsi)
	movdqu 0x20(%rdx),%xmm1
	pxor %xmm1,%xmm8
	movdqu %xmm8,0x20(%rsi)
	movdqu 0xa0(%rdx),%xmm1
	pxor %xmm1,%xmm9
	movdqu %xmm9,0xa0(%rsi)
	movdqu 0x60(%rdx),%xmm1
	pxor %xmm1,%xmm10
	movdqu %xmm10,0x60(%rsi)
	movdqu 0xe0(%rdx),%xmm1
	pxor %xmm1,%xmm11
	movdqu %xmm11,0xe0(%rsi)
	movdqu 0x30(%rdx),%xmm1
	pxor %xmm1,%xmm12
	movdqu %xmm12,0x30(%rsi)
	movdqu 0xb0(%rdx),%xmm1
	pxor %xmm1,%xmm13
	movdqu %xmm13,0xb0(%rsi)
	movdqu 0x70(%rdx),%xmm1
	pxor %xmm1,%xmm14
	movdqu %xmm14,0x70(%rsi)
	movdqu 0xf0(%rdx),%xmm1
	pxor %xmm1,%xmm15
	movdqu %xmm15,0xf0(%rsi)

	mov %r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)