/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 32

ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
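# ROT16 and ROT8 are vpshufb masks that rotate every 32-bit word of a vector
# left by 16 and by 8 bits: within each dword, destination byte k takes
# source byte (k + 2) % 4 for ROT16 and (k + 3) % 4 for ROT8.
# CTRINC holds the per-lane block counter offsets 0..7.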

.text

ENTRY(chacha20_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: 8 data blocks output, o
	# %rdx: 8 data blocks input, i

	# This function encrypts eight consecutive ChaCha20 blocks at once by
	# broadcasting each of the 16 state words across the eight 32-bit lanes
	# of an AVX2 register, one register per word. As some scratch registers
	# are needed, the first four state rows are kept on the stack instead.
	# Every round operation then acts on the corresponding word of all
	# eight block states in parallel, so no word shuffling is needed inside
	# the rounds. For the final XOR step the eight states are transposed
	# into contiguous blocks by interleaving 32-, 64- and then 128-bit
	# words, which allows the XOR to be done directly in AVX2 registers.
	# Rotations by 8 and 16 bits use the slightly faster byte shuffle
	# (vpshufb); rotations by 7 and 12 bits use the traditional shift+OR.
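	#
	# For reference, each quarter-round below computes (illustrative C
	# sketch, not part of the build; rol32() is the usual 32-bit rotate):
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
	#
	# with (a, b, c, d) running over the four columns of the state in the
	# first half of a double round and over the four diagonals in the
	# second half. The four quarter-rounds of each half are interleaved
	# step by step below, using %ymm0 as the common scratch register.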

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	mov %rsp, %r8
	and $~31, %rsp
	sub $0x80, %rsp
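	# %r8 preserves the caller's %rsp; the 128 bytes reserved here hold the
	# four spilled state rows x0..x3 at 0x00, 0x20, 0x40 and 0x60(%rsp),
	# each 32-byte aligned as required by the vmovdqa accesses below.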

	# x0..15[0-7] = s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpbroadcastd 0x04(%rdi),%ymm1
	vpbroadcastd 0x08(%rdi),%ymm2
	vpbroadcastd 0x0c(%rdi),%ymm3
	vpbroadcastd 0x10(%rdi),%ymm4
	vpbroadcastd 0x14(%rdi),%ymm5
	vpbroadcastd 0x18(%rdi),%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm7
	vpbroadcastd 0x20(%rdi),%ymm8
	vpbroadcastd 0x24(%rdi),%ymm9
	vpbroadcastd 0x28(%rdi),%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm11
	vpbroadcastd 0x30(%rdi),%ymm12
	vpbroadcastd 0x34(%rdi),%ymm13
	vpbroadcastd 0x38(%rdi),%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa %ymm0,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm3,0x60(%rsp)
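	# Rows x0..x3 now live only in the stack slots above; %ymm0 becomes the
	# scratch register and %ymm1..%ymm3 are reloaded below with the
	# CTRINC, ROT8 and ROT16 constants.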

	vmovdqa CTRINC(%rip),%ymm1
	vmovdqa ROT8(%rip),%ymm2
	vmovdqa ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

	mov $10,%ecx
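	# Each iteration of the loop below is one ChaCha20 double round: four
	# quarter-rounds on the columns (x0,x4,x8,x12)..(x3,x7,x11,x15)
	# followed by four on the diagonals (x0,x5,x10,x15)..(x3,x4,x9,x14).
	# Ten double rounds give the 20 rounds specified by RFC 7539.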

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	dec %ecx
	jnz .Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpaddd 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpbroadcastd 0x04(%rdi),%ymm0
	vpaddd 0x20(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpbroadcastd 0x08(%rdi),%ymm0
	vpaddd 0x40(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpbroadcastd 0x0c(%rdi),%ymm0
	vpaddd 0x60(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpbroadcastd 0x10(%rdi),%ymm0
	vpaddd %ymm0,%ymm4,%ymm4
	vpbroadcastd 0x14(%rdi),%ymm0
	vpaddd %ymm0,%ymm5,%ymm5
	vpbroadcastd 0x18(%rdi),%ymm0
	vpaddd %ymm0,%ymm6,%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm0
	vpaddd %ymm0,%ymm7,%ymm7
	vpbroadcastd 0x20(%rdi),%ymm0
	vpaddd %ymm0,%ymm8,%ymm8
	vpbroadcastd 0x24(%rdi),%ymm0
	vpaddd %ymm0,%ymm9,%ymm9
	vpbroadcastd 0x28(%rdi),%ymm0
	vpaddd %ymm0,%ymm10,%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm0
	vpaddd %ymm0,%ymm11,%ymm11
	vpbroadcastd 0x30(%rdi),%ymm0
	vpaddd %ymm0,%ymm12,%ymm12
	vpbroadcastd 0x34(%rdi),%ymm0
	vpaddd %ymm0,%ymm13,%ymm13
	vpbroadcastd 0x38(%rdi),%ymm0
	vpaddd %ymm0,%ymm14,%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm0
	vpaddd %ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12
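	# Together with the broadcast add of s[12] above, lane n of x12 now
	# holds the round output plus that block's counter s[12] + n, i.e. the
	# same feed-forward a single-block implementation would apply.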

	# interleave 32-bit words in state n, n+1
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x20(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm1,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpckldq %ymm5,%ymm0,%ymm4
	vpunpckhdq %ymm5,%ymm0,%ymm5
	vmovdqa %ymm6,%ymm0
	vpunpckldq %ymm7,%ymm0,%ymm6
	vpunpckhdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpckldq %ymm9,%ymm0,%ymm8
	vpunpckhdq %ymm9,%ymm0,%ymm9
	vmovdqa %ymm10,%ymm0
	vpunpckldq %ymm11,%ymm0,%ymm10
	vpunpckhdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpckldq %ymm13,%ymm0,%ymm12
	vpunpckhdq %ymm13,%ymm0,%ymm13
	vmovdqa %ymm14,%ymm0
	vpunpckldq %ymm15,%ymm0,%ymm14
	vpunpckhdq %ymm15,%ymm0,%ymm15
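	# Transpose stage 1: within each 128-bit lane, the 32-bit words of row
	# pairs (0,1), (2,3), ..., (14,15) are now interleaved, e.g. the low
	# 128-bit lane of 0x00(%rsp) holds words x0 and x1 of block 0 followed
	# by x0 and x1 of block 1.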

	# interleave 64-bit words in state n, n+2
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x40(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpcklqdq %ymm6,%ymm0,%ymm4
	vpunpckhqdq %ymm6,%ymm0,%ymm6
	vmovdqa %ymm5,%ymm0
	vpunpcklqdq %ymm7,%ymm0,%ymm5
	vpunpckhqdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpcklqdq %ymm10,%ymm0,%ymm8
	vpunpckhqdq %ymm10,%ymm0,%ymm10
	vmovdqa %ymm9,%ymm0
	vpunpcklqdq %ymm11,%ymm0,%ymm9
	vpunpckhqdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpcklqdq %ymm14,%ymm0,%ymm12
	vpunpckhqdq %ymm14,%ymm0,%ymm14
	vmovdqa %ymm13,%ymm0
	vpunpcklqdq %ymm15,%ymm0,%ymm13
	vpunpckhqdq %ymm15,%ymm0,%ymm15
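	# Transpose stage 2: the 64-bit interleave combines pairs (n, n+2), so
	# each vector now carries four consecutive state words of one block in
	# its low 128-bit lane and of that block + 4 in its high lane, e.g.
	# 0x00(%rsp) holds words 0..3 of block 0 and of block 4.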

	# interleave 128-bit words in state n, n+4
	vmovdqa 0x00(%rsp),%ymm0
	vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
	vmovdqa %ymm1,0x40(%rsp)
	vmovdqa 0x60(%rsp),%ymm0
	vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
	vmovdqa %ymm1,0x60(%rsp)
	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
	vmovdqa %ymm0,%ymm8
	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
	vmovdqa %ymm0,%ymm9
	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
	vmovdqa %ymm0,%ymm10
	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
	vmovdqa %ymm0,%ymm11
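	# Transpose stage 3 complete: each 256-bit vector now holds one 32-byte
	# half of a single block. Words 0..7 of blocks 0, 2, 1, 3 sit in
	# 0x00/0x20/0x40/0x60(%rsp) and of blocks 4, 6, 5, 7 in %ymm4..%ymm7;
	# words 8..15 of the same blocks sit in %ymm8..%ymm11 and
	# %ymm12..%ymm15. Block b therefore maps to byte offset b * 0x40
	# (+ 0x20 for the upper half), which explains the offsets used below.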

	# xor with corresponding input, write to output
	vmovdqa 0x00(%rsp),%ymm0
	vpxor 0x0000(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0000(%rsi)
	vmovdqa 0x20(%rsp),%ymm0
	vpxor 0x0080(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0080(%rsi)
	vmovdqa 0x40(%rsp),%ymm0
	vpxor 0x0040(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0040(%rsi)
	vmovdqa 0x60(%rsp),%ymm0
	vpxor 0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00c0(%rsi)
	vpxor 0x0100(%rdx),%ymm4,%ymm4
	vmovdqu %ymm4,0x0100(%rsi)
	vpxor 0x0180(%rdx),%ymm5,%ymm5
	vmovdqu %ymm5,0x0180(%rsi)
	vpxor 0x0140(%rdx),%ymm6,%ymm6
	vmovdqu %ymm6,0x0140(%rsi)
	vpxor 0x01c0(%rdx),%ymm7,%ymm7
	vmovdqu %ymm7,0x01c0(%rsi)
	vpxor 0x0020(%rdx),%ymm8,%ymm8
	vmovdqu %ymm8,0x0020(%rsi)
	vpxor 0x00a0(%rdx),%ymm9,%ymm9
	vmovdqu %ymm9,0x00a0(%rsi)
	vpxor 0x0060(%rdx),%ymm10,%ymm10
	vmovdqu %ymm10,0x0060(%rsi)
	vpxor 0x00e0(%rdx),%ymm11,%ymm11
	vmovdqu %ymm11,0x00e0(%rsi)
	vpxor 0x0120(%rdx),%ymm12,%ymm12
	vmovdqu %ymm12,0x0120(%rsi)
	vpxor 0x01a0(%rdx),%ymm13,%ymm13
	vmovdqu %ymm13,0x01a0(%rsi)
	vpxor 0x0160(%rdx),%ymm14,%ymm14
	vmovdqu %ymm14,0x0160(%rsi)
	vpxor 0x01e0(%rdx),%ymm15,%ymm15
	vmovdqu %ymm15,0x01e0(%rsi)

	vzeroupper
	mov %r8,%rsp
	ret
ENDPROC(chacha20_8block_xor_avx2)