blob: 4e5c0f3584dd61befb930d3030acfb7b012b3a25 [file] [log] [blame]
David Benjaminf31229b2017-01-25 14:08:15 -05001#if defined(__x86_64__)
2.text
3
// Constant tables shared by the routines in this file, aligned to 64 bytes.
// NOTE(review): the numerals fused onto the start of every line look like
// residue from a web "blame" rendering rather than assembler input --
// confirm against the pristine file.
4.p2align 6
// The 16 ASCII bytes "expand 32-byte k", stored twice (32 bytes in total).
// _chacha20_poly1305_open loads this as the first state row and adds it back
// after the rounds.
5.chacha20_consts:
6.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
7.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
// pshufb control mask: in every 4-byte group, destination bytes 0..3 take
// source bytes 3,0,1,2, i.e. each 32-bit lane is rotated left by 8 bits.
// Stored twice (32 bytes).
8.rol8:
9.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
10.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
// pshufb control mask: in every 4-byte group, destination bytes 0..3 take
// source bytes 2,3,0,1, i.e. each 32-bit lane is rotated left by 16 bits.
// Stored twice (32 bytes).
11.rol16:
12.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
13.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
// Four zero dwords. Not referenced in the visible part of the file. It sits
// directly in front of .sse_inc, so a 32-byte read starting here would see
// 0,0,0,0,1,0,0,0 -- presumably intentional, TODO confirm.
14.avx2_init:
15.long 0,0,0,0
// Dword vector 1,0,0,0: added with paddd to step the low dword of a 16-byte
// row by one.
16.sse_inc:
17.long 1,0,0,0
// Dword vector 2,0,0,0 in each 16-byte half. Not referenced in the visible
// part of the file.
18.avx2_inc:
19.long 2,0,0,0,2,0,0,0
// First 16 bytes: mask that _chacha20_poly1305_open ANDs (pand) into the value
// it stores at 0(%rbp), which poly_hash_ad_internal then reads as its
// multiplier. The bit pattern equals the Poly1305 "r" clamp from RFC 8439.
// Second 16 bytes: all ones, so an AND with them leaves data unchanged.
20.clamp:
21.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
22.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
// 16-byte alignment for the mask rows that follow.
23.p2align 4
// Fifteen 16-byte rows; row k (k = 1..15) has its first k bytes set to 0xff
// and the remaining bytes zero. Not referenced in the visible part of the
// file; presumably used to keep only the first k bytes of a partial block --
// TODO confirm.
24.and_masks:
25.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
26.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
27.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
28.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
29.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
30.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
31.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
32.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
33.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
34.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
35.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
36.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
37.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
38.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
39.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
40
41
// poly_hash_ad_internal
//
// Absorbs a byte string into a three-limb accumulator. After each 16-byte
// block is added, the accumulator is multiplied by a 128-bit value and folded
// back modulo 2^130 - 5 (the Poly1305 block step). Only a partial reduction
// is done: there is no final compare-and-subtract in this routine.
//
// In:
//   %rcx     pointer to the bytes to absorb
//   %r8      number of bytes
//   0(%rbp)  low 64 bits of the multiplier
//   8(%rbp)  high 64 bits of the multiplier
// Out:
//   %r10 (low), %r11 (middle), %r12 (top) hold the accumulator. It is zeroed
//   on entry, so whatever the caller had in those registers is discarded.
// Clobbers: %rax, %rdx, %r9, %r13, %r14, %r15, flags; the general path also
//   modifies %rcx and %r8.
// Touches no stack besides the return address.
//
// NOTE(review): the numerals and author/date stamps fused onto the start of
// the lines look like residue from a web "blame" rendering rather than
// assembler input -- confirm against the pristine file.
42.p2align 6
43poly_hash_ad_internal:
44.cfi_startproc
// accumulator = 0
45 xorq %r10,%r10
46 xorq %r11,%r11
47 xorq %r12,%r12
// A length of exactly 13 bytes takes the straight-line path below; every
// other length goes through the generic loop.
48 cmpq $13,%r8
49 jne hash_ad_loop
// 13-byte fast path (the label name suggests the TLS additional-data size).
50poly_fast_tls_ad:
51
// %r10 = bytes 0..7. %r11 = bytes 5..12 shifted right by 24 bits, which drops
// bytes 5..7 and leaves bytes 8..12 in the low 40 bits. Both loads therefore
// stay inside the 13 input bytes.
52 movq (%rcx),%r10
53 movq 5(%rcx),%r11
54 shrq $24,%r11
// Top limb = 1, i.e. the 2^128 bit is set on top of the zero-extended input.
// The accumulator was zero, so this is simply "accumulator = block".
55 movq $1,%r12
// Multiply accumulator (%r10,%r11,%r12) by the multiplier at 0(%rbp)/8(%rbp).
// The product is gathered in %r13 (limb 0), %r14 (limb 1), %r15 (limb 2) and
// %r9 (limb 3). mulq writes %rdx:%rax.
56 movq 0+0(%rbp),%rax
57 movq %rax,%r15
58 mulq %r10
59 movq %rax,%r13
60 movq %rdx,%r14
61 movq 0+0(%rbp),%rax
62 mulq %r11
// %r15 = low multiplier word * top limb. The two-operand imulq leaves
// %rax/%rdx alone and sits before the addq, so its flag changes do not
// disturb the carry chain.
Robert Sloan4d1ac502017-02-06 08:36:14 -080063 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -050064 addq %rax,%r14
65 adcq %rdx,%r15
66 movq 8+0(%rbp),%rax
67 movq %rax,%r9
68 mulq %r10
69 addq %rax,%r14
70 adcq $0,%rdx
// %r10 is reused here as a temporary for the high half (plus carry).
71 movq %rdx,%r10
72 movq 8+0(%rbp),%rax
73 mulq %r11
74 addq %rax,%r15
75 adcq $0,%rdx
// %r9 = high multiplier word * top limb.
Robert Sloan4d1ac502017-02-06 08:36:14 -080076 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -050077 addq %r10,%r15
78 adcq %rdx,%r9
// Fold modulo 2^130 - 5. Keep the low 130 bits (%r13, %r14, low two bits of
// %r15) as the new accumulator, then add 5*c where c is the part of the
// product above bit 130: first 4*c (%r15 with its low two bits cleared,
// paired with %r9), then c itself (%r9:%r15 shifted right by 2).
79 movq %r13,%r10
80 movq %r14,%r11
81 movq %r15,%r12
82 andq $3,%r12
83 movq %r15,%r13
84 andq $-4,%r13
85 movq %r9,%r14
86 shrdq $2,%r9,%r15
87 shrq $2,%r9
88 addq %r13,%r10
89 adcq %r14,%r11
90 adcq $0,%r12
91 addq %r15,%r10
92 adcq %r9,%r11
93 adcq $0,%r12
94
// Bytes F3 C3: a "rep ret" return, emitted as raw bytes.
95 .byte 0xf3,0xc3
// Generic path: absorb whole 16-byte blocks while at least 16 bytes remain.
96hash_ad_loop:
97
98 cmpq $16,%r8
99 jb hash_ad_tail
// accumulator += 16 input bytes, with the 2^128 bit added via the "$1" on the
// top limb.
100 addq 0(%rcx),%r10
101 adcq 8+0(%rcx),%r11
102 adcq $1,%r12
// Multiply by the multiplier; product in %r13,%r14,%r15,%r9 (same sequence as
// in poly_fast_tls_ad).
103 movq 0+0(%rbp),%rax
104 movq %rax,%r15
105 mulq %r10
106 movq %rax,%r13
107 movq %rdx,%r14
108 movq 0+0(%rbp),%rax
109 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800110 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500111 addq %rax,%r14
112 adcq %rdx,%r15
113 movq 8+0(%rbp),%rax
114 movq %rax,%r9
115 mulq %r10
116 addq %rax,%r14
117 adcq $0,%rdx
118 movq %rdx,%r10
119 movq 8+0(%rbp),%rax
120 mulq %r11
121 addq %rax,%r15
122 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800123 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500124 addq %r10,%r15
125 adcq %rdx,%r9
// Fold modulo 2^130 - 5 back into %r10,%r11,%r12.
126 movq %r13,%r10
127 movq %r14,%r11
128 movq %r15,%r12
129 andq $3,%r12
130 movq %r15,%r13
131 andq $-4,%r13
132 movq %r9,%r14
133 shrdq $2,%r9,%r15
134 shrq $2,%r9
135 addq %r13,%r10
136 adcq %r14,%r11
137 adcq $0,%r12
138 addq %r15,%r10
139 adcq %r9,%r11
140 adcq $0,%r12
141
// Advance to the next block.
Robert Sloan4d1ac502017-02-06 08:36:14 -0800142 leaq 16(%rcx),%rcx
David Benjaminf31229b2017-01-25 14:08:15 -0500143 subq $16,%r8
144 jmp hash_ad_loop
// Fewer than 16 bytes remain. Nothing left means we are done.
145hash_ad_tail:
146 cmpq $0,%r8
147 je 1f
148
// Assemble the remaining 1..15 bytes into %r14:%r13 as a little-endian value
// with zero upper bytes. %rcx is moved one past the last byte and the bytes
// are consumed from last to first.
149 xorq %r13,%r13
150 xorq %r14,%r14
151 xorq %r15,%r15
152 addq %r8,%rcx
153hash_ad_tail_loop:
// Shift the 128-bit pair %r14:%r13 left by one byte, then put the next byte
// into the low 8 bits.
154 shldq $8,%r13,%r14
155 shlq $8,%r13
156 movzbq -1(%rcx),%r15
157 xorq %r15,%r13
158 decq %rcx
159 decq %r8
// Branches on the zero flag produced by the decq of %r8.
160 jne hash_ad_tail_loop
161
// accumulator += tail value, again with the 2^128 bit added, so the short
// tail is handled like a block zero-padded to 16 bytes.
162 addq %r13,%r10
163 adcq %r14,%r11
164 adcq $1,%r12
// Multiply by the multiplier; product in %r13,%r14,%r15,%r9 (same sequence as
// in poly_fast_tls_ad).
165 movq 0+0(%rbp),%rax
166 movq %rax,%r15
167 mulq %r10
168 movq %rax,%r13
169 movq %rdx,%r14
170 movq 0+0(%rbp),%rax
171 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800172 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500173 addq %rax,%r14
174 adcq %rdx,%r15
175 movq 8+0(%rbp),%rax
176 movq %rax,%r9
177 mulq %r10
178 addq %rax,%r14
179 adcq $0,%rdx
180 movq %rdx,%r10
181 movq 8+0(%rbp),%rax
182 mulq %r11
183 addq %rax,%r15
184 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800185 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500186 addq %r10,%r15
187 adcq %rdx,%r9
// Fold modulo 2^130 - 5 back into %r10,%r11,%r12.
188 movq %r13,%r10
189 movq %r14,%r11
190 movq %r15,%r12
191 andq $3,%r12
192 movq %r15,%r13
193 andq $-4,%r13
194 movq %r9,%r14
195 shrdq $2,%r9,%r15
196 shrq $2,%r9
197 addq %r13,%r10
198 adcq %r14,%r11
199 adcq $0,%r12
200 addq %r15,%r10
201 adcq %r9,%r11
202 adcq $0,%r12
203
204
// Common exit for the generic path.
2051:
// Bytes F3 C3: a "rep ret" return, emitted as raw bytes.
206 .byte 0xf3,0xc3
207.cfi_endproc
208
209
210.globl _chacha20_poly1305_open
211.private_extern _chacha20_poly1305_open
212
213.p2align 6
214_chacha20_poly1305_open:
215.cfi_startproc
216 pushq %rbp
217.cfi_adjust_cfa_offset 8
218 pushq %rbx
219.cfi_adjust_cfa_offset 8
220 pushq %r12
221.cfi_adjust_cfa_offset 8
222 pushq %r13
223.cfi_adjust_cfa_offset 8
224 pushq %r14
225.cfi_adjust_cfa_offset 8
226 pushq %r15
227.cfi_adjust_cfa_offset 8
228
229
230 pushq %r9
231.cfi_adjust_cfa_offset 8
232 subq $288 + 32,%rsp
233.cfi_adjust_cfa_offset 288 + 32
234.cfi_offset rbp, -16
235.cfi_offset rbx, -24
236.cfi_offset r12, -32
237.cfi_offset r13, -40
238.cfi_offset r14, -48
239.cfi_offset r15, -56
David Benjaminf31229b2017-01-25 14:08:15 -0500240 leaq 32(%rsp),%rbp
241 andq $-32,%rbp
242 movq %rdx,8+32(%rbp)
243 movq %r8,0+32(%rbp)
244 movq %rdx,%rbx
245
246 movl _OPENSSL_ia32cap_P+8(%rip),%eax
247 andl $288,%eax
248 xorl $288,%eax
249 jz chacha20_poly1305_open_avx2
250
2511:
252 cmpq $128,%rbx
253 jbe open_sse_128
254
255 movdqa .chacha20_consts(%rip),%xmm0
256 movdqu 0(%r9),%xmm4
257 movdqu 16(%r9),%xmm8
258 movdqu 32(%r9),%xmm12
259 movdqa %xmm12,%xmm7
260
261 movdqa %xmm4,48(%rbp)
262 movdqa %xmm8,64(%rbp)
263 movdqa %xmm12,96(%rbp)
264 movq $10,%r10
2651:
266 paddd %xmm4,%xmm0
267 pxor %xmm0,%xmm12
268 pshufb .rol16(%rip),%xmm12
269 paddd %xmm12,%xmm8
270 pxor %xmm8,%xmm4
271 movdqa %xmm4,%xmm3
272 pslld $12,%xmm3
273 psrld $20,%xmm4
274 pxor %xmm3,%xmm4
275 paddd %xmm4,%xmm0
276 pxor %xmm0,%xmm12
277 pshufb .rol8(%rip),%xmm12
278 paddd %xmm12,%xmm8
279 pxor %xmm8,%xmm4
280 movdqa %xmm4,%xmm3
281 pslld $7,%xmm3
282 psrld $25,%xmm4
283 pxor %xmm3,%xmm4
284.byte 102,15,58,15,228,4
285.byte 102,69,15,58,15,192,8
286.byte 102,69,15,58,15,228,12
287 paddd %xmm4,%xmm0
288 pxor %xmm0,%xmm12
289 pshufb .rol16(%rip),%xmm12
290 paddd %xmm12,%xmm8
291 pxor %xmm8,%xmm4
292 movdqa %xmm4,%xmm3
293 pslld $12,%xmm3
294 psrld $20,%xmm4
295 pxor %xmm3,%xmm4
296 paddd %xmm4,%xmm0
297 pxor %xmm0,%xmm12
298 pshufb .rol8(%rip),%xmm12
299 paddd %xmm12,%xmm8
300 pxor %xmm8,%xmm4
301 movdqa %xmm4,%xmm3
302 pslld $7,%xmm3
303 psrld $25,%xmm4
304 pxor %xmm3,%xmm4
305.byte 102,15,58,15,228,12
306.byte 102,69,15,58,15,192,8
307.byte 102,69,15,58,15,228,4
308
309 decq %r10
310 jne 1b
311
312 paddd .chacha20_consts(%rip),%xmm0
313 paddd 48(%rbp),%xmm4
314
315 pand .clamp(%rip),%xmm0
316 movdqa %xmm0,0(%rbp)
317 movdqa %xmm4,16(%rbp)
318
319 movq %r8,%r8
320 call poly_hash_ad_internal
321open_sse_main_loop:
322 cmpq $256,%rbx
323 jb 2f
324
325 movdqa .chacha20_consts(%rip),%xmm0
326 movdqa 48(%rbp),%xmm4
327 movdqa 64(%rbp),%xmm8
328 movdqa %xmm0,%xmm1
329 movdqa %xmm4,%xmm5
330 movdqa %xmm8,%xmm9
331 movdqa %xmm0,%xmm2
332 movdqa %xmm4,%xmm6
333 movdqa %xmm8,%xmm10
334 movdqa %xmm0,%xmm3
335 movdqa %xmm4,%xmm7
336 movdqa %xmm8,%xmm11
337 movdqa 96(%rbp),%xmm15
338 paddd .sse_inc(%rip),%xmm15
339 movdqa %xmm15,%xmm14
340 paddd .sse_inc(%rip),%xmm14
341 movdqa %xmm14,%xmm13
342 paddd .sse_inc(%rip),%xmm13
343 movdqa %xmm13,%xmm12
344 paddd .sse_inc(%rip),%xmm12
345 movdqa %xmm12,96(%rbp)
346 movdqa %xmm13,112(%rbp)
347 movdqa %xmm14,128(%rbp)
348 movdqa %xmm15,144(%rbp)
349
350
351
352 movq $4,%rcx
353 movq %rsi,%r8
3541:
355 movdqa %xmm8,80(%rbp)
356 movdqa .rol16(%rip),%xmm8
357 paddd %xmm7,%xmm3
358 paddd %xmm6,%xmm2
359 paddd %xmm5,%xmm1
360 paddd %xmm4,%xmm0
361 pxor %xmm3,%xmm15
362 pxor %xmm2,%xmm14
363 pxor %xmm1,%xmm13
364 pxor %xmm0,%xmm12
365.byte 102,69,15,56,0,248
366.byte 102,69,15,56,0,240
367.byte 102,69,15,56,0,232
368.byte 102,69,15,56,0,224
369 movdqa 80(%rbp),%xmm8
370 paddd %xmm15,%xmm11
371 paddd %xmm14,%xmm10
372 paddd %xmm13,%xmm9
373 paddd %xmm12,%xmm8
374 pxor %xmm11,%xmm7
375 addq 0(%r8),%r10
376 adcq 8+0(%r8),%r11
377 adcq $1,%r12
378
379 leaq 16(%r8),%r8
380 pxor %xmm10,%xmm6
381 pxor %xmm9,%xmm5
382 pxor %xmm8,%xmm4
383 movdqa %xmm8,80(%rbp)
384 movdqa %xmm7,%xmm8
385 psrld $20,%xmm8
386 pslld $32-20,%xmm7
387 pxor %xmm8,%xmm7
388 movdqa %xmm6,%xmm8
389 psrld $20,%xmm8
390 pslld $32-20,%xmm6
391 pxor %xmm8,%xmm6
392 movdqa %xmm5,%xmm8
393 psrld $20,%xmm8
394 pslld $32-20,%xmm5
395 pxor %xmm8,%xmm5
396 movdqa %xmm4,%xmm8
397 psrld $20,%xmm8
398 pslld $32-20,%xmm4
399 pxor %xmm8,%xmm4
400 movq 0+0(%rbp),%rax
401 movq %rax,%r15
402 mulq %r10
403 movq %rax,%r13
404 movq %rdx,%r14
405 movq 0+0(%rbp),%rax
406 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800407 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500408 addq %rax,%r14
409 adcq %rdx,%r15
410 movdqa .rol8(%rip),%xmm8
411 paddd %xmm7,%xmm3
412 paddd %xmm6,%xmm2
413 paddd %xmm5,%xmm1
414 paddd %xmm4,%xmm0
415 pxor %xmm3,%xmm15
416 pxor %xmm2,%xmm14
417 pxor %xmm1,%xmm13
418 pxor %xmm0,%xmm12
419.byte 102,69,15,56,0,248
420.byte 102,69,15,56,0,240
421.byte 102,69,15,56,0,232
422.byte 102,69,15,56,0,224
423 movdqa 80(%rbp),%xmm8
424 paddd %xmm15,%xmm11
425 paddd %xmm14,%xmm10
426 paddd %xmm13,%xmm9
427 paddd %xmm12,%xmm8
428 pxor %xmm11,%xmm7
429 pxor %xmm10,%xmm6
430 movq 8+0(%rbp),%rax
431 movq %rax,%r9
432 mulq %r10
433 addq %rax,%r14
434 adcq $0,%rdx
435 movq %rdx,%r10
436 movq 8+0(%rbp),%rax
437 mulq %r11
438 addq %rax,%r15
439 adcq $0,%rdx
440 pxor %xmm9,%xmm5
441 pxor %xmm8,%xmm4
442 movdqa %xmm8,80(%rbp)
443 movdqa %xmm7,%xmm8
444 psrld $25,%xmm8
445 pslld $32-25,%xmm7
446 pxor %xmm8,%xmm7
447 movdqa %xmm6,%xmm8
448 psrld $25,%xmm8
449 pslld $32-25,%xmm6
450 pxor %xmm8,%xmm6
451 movdqa %xmm5,%xmm8
452 psrld $25,%xmm8
453 pslld $32-25,%xmm5
454 pxor %xmm8,%xmm5
455 movdqa %xmm4,%xmm8
456 psrld $25,%xmm8
457 pslld $32-25,%xmm4
458 pxor %xmm8,%xmm4
459 movdqa 80(%rbp),%xmm8
Robert Sloan4d1ac502017-02-06 08:36:14 -0800460 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500461 addq %r10,%r15
462 adcq %rdx,%r9
463.byte 102,15,58,15,255,4
464.byte 102,69,15,58,15,219,8
465.byte 102,69,15,58,15,255,12
466.byte 102,15,58,15,246,4
467.byte 102,69,15,58,15,210,8
468.byte 102,69,15,58,15,246,12
469.byte 102,15,58,15,237,4
470.byte 102,69,15,58,15,201,8
471.byte 102,69,15,58,15,237,12
472.byte 102,15,58,15,228,4
473.byte 102,69,15,58,15,192,8
474.byte 102,69,15,58,15,228,12
475 movdqa %xmm8,80(%rbp)
476 movdqa .rol16(%rip),%xmm8
477 paddd %xmm7,%xmm3
478 paddd %xmm6,%xmm2
479 paddd %xmm5,%xmm1
480 paddd %xmm4,%xmm0
481 pxor %xmm3,%xmm15
482 pxor %xmm2,%xmm14
483 movq %r13,%r10
484 movq %r14,%r11
485 movq %r15,%r12
486 andq $3,%r12
487 movq %r15,%r13
488 andq $-4,%r13
489 movq %r9,%r14
490 shrdq $2,%r9,%r15
491 shrq $2,%r9
492 addq %r13,%r10
493 adcq %r14,%r11
494 adcq $0,%r12
495 addq %r15,%r10
496 adcq %r9,%r11
497 adcq $0,%r12
498 pxor %xmm1,%xmm13
499 pxor %xmm0,%xmm12
500.byte 102,69,15,56,0,248
501.byte 102,69,15,56,0,240
502.byte 102,69,15,56,0,232
503.byte 102,69,15,56,0,224
504 movdqa 80(%rbp),%xmm8
505 paddd %xmm15,%xmm11
506 paddd %xmm14,%xmm10
507 paddd %xmm13,%xmm9
508 paddd %xmm12,%xmm8
509 pxor %xmm11,%xmm7
510 pxor %xmm10,%xmm6
511 pxor %xmm9,%xmm5
512 pxor %xmm8,%xmm4
513 movdqa %xmm8,80(%rbp)
514 movdqa %xmm7,%xmm8
515 psrld $20,%xmm8
516 pslld $32-20,%xmm7
517 pxor %xmm8,%xmm7
518 movdqa %xmm6,%xmm8
519 psrld $20,%xmm8
520 pslld $32-20,%xmm6
521 pxor %xmm8,%xmm6
522 movdqa %xmm5,%xmm8
523 psrld $20,%xmm8
524 pslld $32-20,%xmm5
525 pxor %xmm8,%xmm5
526 movdqa %xmm4,%xmm8
527 psrld $20,%xmm8
528 pslld $32-20,%xmm4
529 pxor %xmm8,%xmm4
530 movdqa .rol8(%rip),%xmm8
531 paddd %xmm7,%xmm3
532 paddd %xmm6,%xmm2
533 paddd %xmm5,%xmm1
534 paddd %xmm4,%xmm0
535 pxor %xmm3,%xmm15
536 pxor %xmm2,%xmm14
537 pxor %xmm1,%xmm13
538 pxor %xmm0,%xmm12
539.byte 102,69,15,56,0,248
540.byte 102,69,15,56,0,240
541.byte 102,69,15,56,0,232
542.byte 102,69,15,56,0,224
543 movdqa 80(%rbp),%xmm8
544 paddd %xmm15,%xmm11
545 paddd %xmm14,%xmm10
546 paddd %xmm13,%xmm9
547 paddd %xmm12,%xmm8
548 pxor %xmm11,%xmm7
549 pxor %xmm10,%xmm6
550 pxor %xmm9,%xmm5
551 pxor %xmm8,%xmm4
552 movdqa %xmm8,80(%rbp)
553 movdqa %xmm7,%xmm8
554 psrld $25,%xmm8
555 pslld $32-25,%xmm7
556 pxor %xmm8,%xmm7
557 movdqa %xmm6,%xmm8
558 psrld $25,%xmm8
559 pslld $32-25,%xmm6
560 pxor %xmm8,%xmm6
561 movdqa %xmm5,%xmm8
562 psrld $25,%xmm8
563 pslld $32-25,%xmm5
564 pxor %xmm8,%xmm5
565 movdqa %xmm4,%xmm8
566 psrld $25,%xmm8
567 pslld $32-25,%xmm4
568 pxor %xmm8,%xmm4
569 movdqa 80(%rbp),%xmm8
570.byte 102,15,58,15,255,12
571.byte 102,69,15,58,15,219,8
572.byte 102,69,15,58,15,255,4
573.byte 102,15,58,15,246,12
574.byte 102,69,15,58,15,210,8
575.byte 102,69,15,58,15,246,4
576.byte 102,15,58,15,237,12
577.byte 102,69,15,58,15,201,8
578.byte 102,69,15,58,15,237,4
579.byte 102,15,58,15,228,12
580.byte 102,69,15,58,15,192,8
581.byte 102,69,15,58,15,228,4
582
583 decq %rcx
584 jge 1b
585 addq 0(%r8),%r10
586 adcq 8+0(%r8),%r11
587 adcq $1,%r12
588 movq 0+0(%rbp),%rax
589 movq %rax,%r15
590 mulq %r10
591 movq %rax,%r13
592 movq %rdx,%r14
593 movq 0+0(%rbp),%rax
594 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800595 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500596 addq %rax,%r14
597 adcq %rdx,%r15
598 movq 8+0(%rbp),%rax
599 movq %rax,%r9
600 mulq %r10
601 addq %rax,%r14
602 adcq $0,%rdx
603 movq %rdx,%r10
604 movq 8+0(%rbp),%rax
605 mulq %r11
606 addq %rax,%r15
607 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800608 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500609 addq %r10,%r15
610 adcq %rdx,%r9
611 movq %r13,%r10
612 movq %r14,%r11
613 movq %r15,%r12
614 andq $3,%r12
615 movq %r15,%r13
616 andq $-4,%r13
617 movq %r9,%r14
618 shrdq $2,%r9,%r15
619 shrq $2,%r9
620 addq %r13,%r10
621 adcq %r14,%r11
622 adcq $0,%r12
623 addq %r15,%r10
624 adcq %r9,%r11
625 adcq $0,%r12
626
627 leaq 16(%r8),%r8
628 cmpq $-6,%rcx
629 jg 1b
630 paddd .chacha20_consts(%rip),%xmm3
631 paddd 48(%rbp),%xmm7
632 paddd 64(%rbp),%xmm11
633 paddd 144(%rbp),%xmm15
634 paddd .chacha20_consts(%rip),%xmm2
635 paddd 48(%rbp),%xmm6
636 paddd 64(%rbp),%xmm10
637 paddd 128(%rbp),%xmm14
638 paddd .chacha20_consts(%rip),%xmm1
639 paddd 48(%rbp),%xmm5
640 paddd 64(%rbp),%xmm9
641 paddd 112(%rbp),%xmm13
642 paddd .chacha20_consts(%rip),%xmm0
643 paddd 48(%rbp),%xmm4
644 paddd 64(%rbp),%xmm8
645 paddd 96(%rbp),%xmm12
646 movdqa %xmm12,80(%rbp)
647 movdqu 0 + 0(%rsi),%xmm12
648 pxor %xmm3,%xmm12
649 movdqu %xmm12,0 + 0(%rdi)
650 movdqu 16 + 0(%rsi),%xmm12
651 pxor %xmm7,%xmm12
652 movdqu %xmm12,16 + 0(%rdi)
653 movdqu 32 + 0(%rsi),%xmm12
654 pxor %xmm11,%xmm12
655 movdqu %xmm12,32 + 0(%rdi)
656 movdqu 48 + 0(%rsi),%xmm12
657 pxor %xmm15,%xmm12
658 movdqu %xmm12,48 + 0(%rdi)
659 movdqu 0 + 64(%rsi),%xmm3
660 movdqu 16 + 64(%rsi),%xmm7
661 movdqu 32 + 64(%rsi),%xmm11
662 movdqu 48 + 64(%rsi),%xmm15
663 pxor %xmm3,%xmm2
664 pxor %xmm7,%xmm6
665 pxor %xmm11,%xmm10
666 pxor %xmm14,%xmm15
667 movdqu %xmm2,0 + 64(%rdi)
668 movdqu %xmm6,16 + 64(%rdi)
669 movdqu %xmm10,32 + 64(%rdi)
670 movdqu %xmm15,48 + 64(%rdi)
671 movdqu 0 + 128(%rsi),%xmm3
672 movdqu 16 + 128(%rsi),%xmm7
673 movdqu 32 + 128(%rsi),%xmm11
674 movdqu 48 + 128(%rsi),%xmm15
675 pxor %xmm3,%xmm1
676 pxor %xmm7,%xmm5
677 pxor %xmm11,%xmm9
678 pxor %xmm13,%xmm15
679 movdqu %xmm1,0 + 128(%rdi)
680 movdqu %xmm5,16 + 128(%rdi)
681 movdqu %xmm9,32 + 128(%rdi)
682 movdqu %xmm15,48 + 128(%rdi)
683 movdqu 0 + 192(%rsi),%xmm3
684 movdqu 16 + 192(%rsi),%xmm7
685 movdqu 32 + 192(%rsi),%xmm11
686 movdqu 48 + 192(%rsi),%xmm15
687 pxor %xmm3,%xmm0
688 pxor %xmm7,%xmm4
689 pxor %xmm11,%xmm8
690 pxor 80(%rbp),%xmm15
691 movdqu %xmm0,0 + 192(%rdi)
692 movdqu %xmm4,16 + 192(%rdi)
693 movdqu %xmm8,32 + 192(%rdi)
694 movdqu %xmm15,48 + 192(%rdi)
695
696 leaq 256(%rsi),%rsi
697 leaq 256(%rdi),%rdi
698 subq $256,%rbx
699 jmp open_sse_main_loop
7002:
701
702 testq %rbx,%rbx
703 jz open_sse_finalize
704 cmpq $64,%rbx
705 ja 3f
706 movdqa .chacha20_consts(%rip),%xmm0
707 movdqa 48(%rbp),%xmm4
708 movdqa 64(%rbp),%xmm8
709 movdqa 96(%rbp),%xmm12
710 paddd .sse_inc(%rip),%xmm12
711 movdqa %xmm12,96(%rbp)
712
713 xorq %r8,%r8
714 movq %rbx,%rcx
715 cmpq $16,%rcx
716 jb 2f
7171:
718 addq 0(%rsi,%r8), %r10
719 adcq 8+0(%rsi,%r8), %r11
720 adcq $1,%r12
721 movq 0+0(%rbp),%rax
722 movq %rax,%r15
723 mulq %r10
724 movq %rax,%r13
725 movq %rdx,%r14
726 movq 0+0(%rbp),%rax
727 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800728 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500729 addq %rax,%r14
730 adcq %rdx,%r15
731 movq 8+0(%rbp),%rax
732 movq %rax,%r9
733 mulq %r10
734 addq %rax,%r14
735 adcq $0,%rdx
736 movq %rdx,%r10
737 movq 8+0(%rbp),%rax
738 mulq %r11
739 addq %rax,%r15
740 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800741 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500742 addq %r10,%r15
743 adcq %rdx,%r9
744 movq %r13,%r10
745 movq %r14,%r11
746 movq %r15,%r12
747 andq $3,%r12
748 movq %r15,%r13
749 andq $-4,%r13
750 movq %r9,%r14
751 shrdq $2,%r9,%r15
752 shrq $2,%r9
753 addq %r13,%r10
754 adcq %r14,%r11
755 adcq $0,%r12
756 addq %r15,%r10
757 adcq %r9,%r11
758 adcq $0,%r12
759
760 subq $16,%rcx
7612:
762 addq $16,%r8
763 paddd %xmm4,%xmm0
764 pxor %xmm0,%xmm12
765 pshufb .rol16(%rip),%xmm12
766 paddd %xmm12,%xmm8
767 pxor %xmm8,%xmm4
768 movdqa %xmm4,%xmm3
769 pslld $12,%xmm3
770 psrld $20,%xmm4
771 pxor %xmm3,%xmm4
772 paddd %xmm4,%xmm0
773 pxor %xmm0,%xmm12
774 pshufb .rol8(%rip),%xmm12
775 paddd %xmm12,%xmm8
776 pxor %xmm8,%xmm4
777 movdqa %xmm4,%xmm3
778 pslld $7,%xmm3
779 psrld $25,%xmm4
780 pxor %xmm3,%xmm4
781.byte 102,15,58,15,228,4
782.byte 102,69,15,58,15,192,8
783.byte 102,69,15,58,15,228,12
784 paddd %xmm4,%xmm0
785 pxor %xmm0,%xmm12
786 pshufb .rol16(%rip),%xmm12
787 paddd %xmm12,%xmm8
788 pxor %xmm8,%xmm4
789 movdqa %xmm4,%xmm3
790 pslld $12,%xmm3
791 psrld $20,%xmm4
792 pxor %xmm3,%xmm4
793 paddd %xmm4,%xmm0
794 pxor %xmm0,%xmm12
795 pshufb .rol8(%rip),%xmm12
796 paddd %xmm12,%xmm8
797 pxor %xmm8,%xmm4
798 movdqa %xmm4,%xmm3
799 pslld $7,%xmm3
800 psrld $25,%xmm4
801 pxor %xmm3,%xmm4
802.byte 102,15,58,15,228,12
803.byte 102,69,15,58,15,192,8
804.byte 102,69,15,58,15,228,4
805
806 cmpq $16,%rcx
807 jae 1b
808 cmpq $160,%r8
809 jne 2b
810 paddd .chacha20_consts(%rip),%xmm0
811 paddd 48(%rbp),%xmm4
812 paddd 64(%rbp),%xmm8
813 paddd 96(%rbp),%xmm12
814
815 jmp open_sse_tail_64_dec_loop
8163:
817 cmpq $128,%rbx
818 ja 3f
819 movdqa .chacha20_consts(%rip),%xmm0
820 movdqa 48(%rbp),%xmm4
821 movdqa 64(%rbp),%xmm8
822 movdqa %xmm0,%xmm1
823 movdqa %xmm4,%xmm5
824 movdqa %xmm8,%xmm9
825 movdqa 96(%rbp),%xmm13
826 paddd .sse_inc(%rip),%xmm13
827 movdqa %xmm13,%xmm12
828 paddd .sse_inc(%rip),%xmm12
829 movdqa %xmm12,96(%rbp)
830 movdqa %xmm13,112(%rbp)
831
832 movq %rbx,%rcx
833 andq $-16,%rcx
834 xorq %r8,%r8
8351:
836 addq 0(%rsi,%r8), %r10
837 adcq 8+0(%rsi,%r8), %r11
838 adcq $1,%r12
839 movq 0+0(%rbp),%rax
840 movq %rax,%r15
841 mulq %r10
842 movq %rax,%r13
843 movq %rdx,%r14
844 movq 0+0(%rbp),%rax
845 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800846 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500847 addq %rax,%r14
848 adcq %rdx,%r15
849 movq 8+0(%rbp),%rax
850 movq %rax,%r9
851 mulq %r10
852 addq %rax,%r14
853 adcq $0,%rdx
854 movq %rdx,%r10
855 movq 8+0(%rbp),%rax
856 mulq %r11
857 addq %rax,%r15
858 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800859 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500860 addq %r10,%r15
861 adcq %rdx,%r9
862 movq %r13,%r10
863 movq %r14,%r11
864 movq %r15,%r12
865 andq $3,%r12
866 movq %r15,%r13
867 andq $-4,%r13
868 movq %r9,%r14
869 shrdq $2,%r9,%r15
870 shrq $2,%r9
871 addq %r13,%r10
872 adcq %r14,%r11
873 adcq $0,%r12
874 addq %r15,%r10
875 adcq %r9,%r11
876 adcq $0,%r12
877
8782:
879 addq $16,%r8
880 paddd %xmm4,%xmm0
881 pxor %xmm0,%xmm12
882 pshufb .rol16(%rip),%xmm12
883 paddd %xmm12,%xmm8
884 pxor %xmm8,%xmm4
885 movdqa %xmm4,%xmm3
886 pslld $12,%xmm3
887 psrld $20,%xmm4
888 pxor %xmm3,%xmm4
889 paddd %xmm4,%xmm0
890 pxor %xmm0,%xmm12
891 pshufb .rol8(%rip),%xmm12
892 paddd %xmm12,%xmm8
893 pxor %xmm8,%xmm4
894 movdqa %xmm4,%xmm3
895 pslld $7,%xmm3
896 psrld $25,%xmm4
897 pxor %xmm3,%xmm4
898.byte 102,15,58,15,228,4
899.byte 102,69,15,58,15,192,8
900.byte 102,69,15,58,15,228,12
901 paddd %xmm5,%xmm1
902 pxor %xmm1,%xmm13
903 pshufb .rol16(%rip),%xmm13
904 paddd %xmm13,%xmm9
905 pxor %xmm9,%xmm5
906 movdqa %xmm5,%xmm3
907 pslld $12,%xmm3
908 psrld $20,%xmm5
909 pxor %xmm3,%xmm5
910 paddd %xmm5,%xmm1
911 pxor %xmm1,%xmm13
912 pshufb .rol8(%rip),%xmm13
913 paddd %xmm13,%xmm9
914 pxor %xmm9,%xmm5
915 movdqa %xmm5,%xmm3
916 pslld $7,%xmm3
917 psrld $25,%xmm5
918 pxor %xmm3,%xmm5
919.byte 102,15,58,15,237,4
920.byte 102,69,15,58,15,201,8
921.byte 102,69,15,58,15,237,12
922 paddd %xmm4,%xmm0
923 pxor %xmm0,%xmm12
924 pshufb .rol16(%rip),%xmm12
925 paddd %xmm12,%xmm8
926 pxor %xmm8,%xmm4
927 movdqa %xmm4,%xmm3
928 pslld $12,%xmm3
929 psrld $20,%xmm4
930 pxor %xmm3,%xmm4
931 paddd %xmm4,%xmm0
932 pxor %xmm0,%xmm12
933 pshufb .rol8(%rip),%xmm12
934 paddd %xmm12,%xmm8
935 pxor %xmm8,%xmm4
936 movdqa %xmm4,%xmm3
937 pslld $7,%xmm3
938 psrld $25,%xmm4
939 pxor %xmm3,%xmm4
940.byte 102,15,58,15,228,12
941.byte 102,69,15,58,15,192,8
942.byte 102,69,15,58,15,228,4
943 paddd %xmm5,%xmm1
944 pxor %xmm1,%xmm13
945 pshufb .rol16(%rip),%xmm13
946 paddd %xmm13,%xmm9
947 pxor %xmm9,%xmm5
948 movdqa %xmm5,%xmm3
949 pslld $12,%xmm3
950 psrld $20,%xmm5
951 pxor %xmm3,%xmm5
952 paddd %xmm5,%xmm1
953 pxor %xmm1,%xmm13
954 pshufb .rol8(%rip),%xmm13
955 paddd %xmm13,%xmm9
956 pxor %xmm9,%xmm5
957 movdqa %xmm5,%xmm3
958 pslld $7,%xmm3
959 psrld $25,%xmm5
960 pxor %xmm3,%xmm5
961.byte 102,15,58,15,237,12
962.byte 102,69,15,58,15,201,8
963.byte 102,69,15,58,15,237,4
964
965 cmpq %rcx,%r8
966 jb 1b
967 cmpq $160,%r8
968 jne 2b
969 paddd .chacha20_consts(%rip),%xmm1
970 paddd 48(%rbp),%xmm5
971 paddd 64(%rbp),%xmm9
972 paddd 112(%rbp),%xmm13
973 paddd .chacha20_consts(%rip),%xmm0
974 paddd 48(%rbp),%xmm4
975 paddd 64(%rbp),%xmm8
976 paddd 96(%rbp),%xmm12
977 movdqu 0 + 0(%rsi),%xmm3
978 movdqu 16 + 0(%rsi),%xmm7
979 movdqu 32 + 0(%rsi),%xmm11
980 movdqu 48 + 0(%rsi),%xmm15
981 pxor %xmm3,%xmm1
982 pxor %xmm7,%xmm5
983 pxor %xmm11,%xmm9
984 pxor %xmm13,%xmm15
985 movdqu %xmm1,0 + 0(%rdi)
986 movdqu %xmm5,16 + 0(%rdi)
987 movdqu %xmm9,32 + 0(%rdi)
988 movdqu %xmm15,48 + 0(%rdi)
989
990 subq $64,%rbx
991 leaq 64(%rsi),%rsi
992 leaq 64(%rdi),%rdi
993 jmp open_sse_tail_64_dec_loop
9943:
995 cmpq $192,%rbx
996 ja 3f
997 movdqa .chacha20_consts(%rip),%xmm0
998 movdqa 48(%rbp),%xmm4
999 movdqa 64(%rbp),%xmm8
1000 movdqa %xmm0,%xmm1
1001 movdqa %xmm4,%xmm5
1002 movdqa %xmm8,%xmm9
1003 movdqa %xmm0,%xmm2
1004 movdqa %xmm4,%xmm6
1005 movdqa %xmm8,%xmm10
1006 movdqa 96(%rbp),%xmm14
1007 paddd .sse_inc(%rip),%xmm14
1008 movdqa %xmm14,%xmm13
1009 paddd .sse_inc(%rip),%xmm13
1010 movdqa %xmm13,%xmm12
1011 paddd .sse_inc(%rip),%xmm12
1012 movdqa %xmm12,96(%rbp)
1013 movdqa %xmm13,112(%rbp)
1014 movdqa %xmm14,128(%rbp)
1015
1016 movq %rbx,%rcx
1017 movq $160,%r8
1018 cmpq $160,%rcx
1019 cmovgq %r8,%rcx
1020 andq $-16,%rcx
1021 xorq %r8,%r8
10221:
1023 addq 0(%rsi,%r8), %r10
1024 adcq 8+0(%rsi,%r8), %r11
1025 adcq $1,%r12
1026 movq 0+0(%rbp),%rax
1027 movq %rax,%r15
1028 mulq %r10
1029 movq %rax,%r13
1030 movq %rdx,%r14
1031 movq 0+0(%rbp),%rax
1032 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001033 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001034 addq %rax,%r14
1035 adcq %rdx,%r15
1036 movq 8+0(%rbp),%rax
1037 movq %rax,%r9
1038 mulq %r10
1039 addq %rax,%r14
1040 adcq $0,%rdx
1041 movq %rdx,%r10
1042 movq 8+0(%rbp),%rax
1043 mulq %r11
1044 addq %rax,%r15
1045 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001046 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001047 addq %r10,%r15
1048 adcq %rdx,%r9
1049 movq %r13,%r10
1050 movq %r14,%r11
1051 movq %r15,%r12
1052 andq $3,%r12
1053 movq %r15,%r13
1054 andq $-4,%r13
1055 movq %r9,%r14
1056 shrdq $2,%r9,%r15
1057 shrq $2,%r9
1058 addq %r13,%r10
1059 adcq %r14,%r11
1060 adcq $0,%r12
1061 addq %r15,%r10
1062 adcq %r9,%r11
1063 adcq $0,%r12
1064
10652:
1066 addq $16,%r8
1067 paddd %xmm4,%xmm0
1068 pxor %xmm0,%xmm12
1069 pshufb .rol16(%rip),%xmm12
1070 paddd %xmm12,%xmm8
1071 pxor %xmm8,%xmm4
1072 movdqa %xmm4,%xmm3
1073 pslld $12,%xmm3
1074 psrld $20,%xmm4
1075 pxor %xmm3,%xmm4
1076 paddd %xmm4,%xmm0
1077 pxor %xmm0,%xmm12
1078 pshufb .rol8(%rip),%xmm12
1079 paddd %xmm12,%xmm8
1080 pxor %xmm8,%xmm4
1081 movdqa %xmm4,%xmm3
1082 pslld $7,%xmm3
1083 psrld $25,%xmm4
1084 pxor %xmm3,%xmm4
1085.byte 102,15,58,15,228,4
1086.byte 102,69,15,58,15,192,8
1087.byte 102,69,15,58,15,228,12
1088 paddd %xmm5,%xmm1
1089 pxor %xmm1,%xmm13
1090 pshufb .rol16(%rip),%xmm13
1091 paddd %xmm13,%xmm9
1092 pxor %xmm9,%xmm5
1093 movdqa %xmm5,%xmm3
1094 pslld $12,%xmm3
1095 psrld $20,%xmm5
1096 pxor %xmm3,%xmm5
1097 paddd %xmm5,%xmm1
1098 pxor %xmm1,%xmm13
1099 pshufb .rol8(%rip),%xmm13
1100 paddd %xmm13,%xmm9
1101 pxor %xmm9,%xmm5
1102 movdqa %xmm5,%xmm3
1103 pslld $7,%xmm3
1104 psrld $25,%xmm5
1105 pxor %xmm3,%xmm5
1106.byte 102,15,58,15,237,4
1107.byte 102,69,15,58,15,201,8
1108.byte 102,69,15,58,15,237,12
1109 paddd %xmm6,%xmm2
1110 pxor %xmm2,%xmm14
1111 pshufb .rol16(%rip),%xmm14
1112 paddd %xmm14,%xmm10
1113 pxor %xmm10,%xmm6
1114 movdqa %xmm6,%xmm3
1115 pslld $12,%xmm3
1116 psrld $20,%xmm6
1117 pxor %xmm3,%xmm6
1118 paddd %xmm6,%xmm2
1119 pxor %xmm2,%xmm14
1120 pshufb .rol8(%rip),%xmm14
1121 paddd %xmm14,%xmm10
1122 pxor %xmm10,%xmm6
1123 movdqa %xmm6,%xmm3
1124 pslld $7,%xmm3
1125 psrld $25,%xmm6
1126 pxor %xmm3,%xmm6
1127.byte 102,15,58,15,246,4
1128.byte 102,69,15,58,15,210,8
1129.byte 102,69,15,58,15,246,12
1130 paddd %xmm4,%xmm0
1131 pxor %xmm0,%xmm12
1132 pshufb .rol16(%rip),%xmm12
1133 paddd %xmm12,%xmm8
1134 pxor %xmm8,%xmm4
1135 movdqa %xmm4,%xmm3
1136 pslld $12,%xmm3
1137 psrld $20,%xmm4
1138 pxor %xmm3,%xmm4
1139 paddd %xmm4,%xmm0
1140 pxor %xmm0,%xmm12
1141 pshufb .rol8(%rip),%xmm12
1142 paddd %xmm12,%xmm8
1143 pxor %xmm8,%xmm4
1144 movdqa %xmm4,%xmm3
1145 pslld $7,%xmm3
1146 psrld $25,%xmm4
1147 pxor %xmm3,%xmm4
1148.byte 102,15,58,15,228,12
1149.byte 102,69,15,58,15,192,8
1150.byte 102,69,15,58,15,228,4
1151 paddd %xmm5,%xmm1
1152 pxor %xmm1,%xmm13
1153 pshufb .rol16(%rip),%xmm13
1154 paddd %xmm13,%xmm9
1155 pxor %xmm9,%xmm5
1156 movdqa %xmm5,%xmm3
1157 pslld $12,%xmm3
1158 psrld $20,%xmm5
1159 pxor %xmm3,%xmm5
1160 paddd %xmm5,%xmm1
1161 pxor %xmm1,%xmm13
1162 pshufb .rol8(%rip),%xmm13
1163 paddd %xmm13,%xmm9
1164 pxor %xmm9,%xmm5
1165 movdqa %xmm5,%xmm3
1166 pslld $7,%xmm3
1167 psrld $25,%xmm5
1168 pxor %xmm3,%xmm5
1169.byte 102,15,58,15,237,12
1170.byte 102,69,15,58,15,201,8
1171.byte 102,69,15,58,15,237,4
1172 paddd %xmm6,%xmm2
1173 pxor %xmm2,%xmm14
1174 pshufb .rol16(%rip),%xmm14
1175 paddd %xmm14,%xmm10
1176 pxor %xmm10,%xmm6
1177 movdqa %xmm6,%xmm3
1178 pslld $12,%xmm3
1179 psrld $20,%xmm6
1180 pxor %xmm3,%xmm6
1181 paddd %xmm6,%xmm2
1182 pxor %xmm2,%xmm14
1183 pshufb .rol8(%rip),%xmm14
1184 paddd %xmm14,%xmm10
1185 pxor %xmm10,%xmm6
1186 movdqa %xmm6,%xmm3
1187 pslld $7,%xmm3
1188 psrld $25,%xmm6
1189 pxor %xmm3,%xmm6
1190.byte 102,15,58,15,246,12
1191.byte 102,69,15,58,15,210,8
1192.byte 102,69,15,58,15,246,4
1193
1194 cmpq %rcx,%r8
1195 jb 1b
1196 cmpq $160,%r8
1197 jne 2b
1198 cmpq $176,%rbx
1199 jb 1f
1200 addq 160(%rsi),%r10
1201 adcq 8+160(%rsi),%r11
1202 adcq $1,%r12
1203 movq 0+0(%rbp),%rax
1204 movq %rax,%r15
1205 mulq %r10
1206 movq %rax,%r13
1207 movq %rdx,%r14
1208 movq 0+0(%rbp),%rax
1209 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001210 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001211 addq %rax,%r14
1212 adcq %rdx,%r15
1213 movq 8+0(%rbp),%rax
1214 movq %rax,%r9
1215 mulq %r10
1216 addq %rax,%r14
1217 adcq $0,%rdx
1218 movq %rdx,%r10
1219 movq 8+0(%rbp),%rax
1220 mulq %r11
1221 addq %rax,%r15
1222 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001223 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001224 addq %r10,%r15
1225 adcq %rdx,%r9
1226 movq %r13,%r10
1227 movq %r14,%r11
1228 movq %r15,%r12
1229 andq $3,%r12
1230 movq %r15,%r13
1231 andq $-4,%r13
1232 movq %r9,%r14
1233 shrdq $2,%r9,%r15
1234 shrq $2,%r9
1235 addq %r13,%r10
1236 adcq %r14,%r11
1237 adcq $0,%r12
1238 addq %r15,%r10
1239 adcq %r9,%r11
1240 adcq $0,%r12
1241
1242 cmpq $192,%rbx
1243 jb 1f
1244 addq 176(%rsi),%r10
1245 adcq 8+176(%rsi),%r11
1246 adcq $1,%r12
1247 movq 0+0(%rbp),%rax
1248 movq %rax,%r15
1249 mulq %r10
1250 movq %rax,%r13
1251 movq %rdx,%r14
1252 movq 0+0(%rbp),%rax
1253 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001254 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001255 addq %rax,%r14
1256 adcq %rdx,%r15
1257 movq 8+0(%rbp),%rax
1258 movq %rax,%r9
1259 mulq %r10
1260 addq %rax,%r14
1261 adcq $0,%rdx
1262 movq %rdx,%r10
1263 movq 8+0(%rbp),%rax
1264 mulq %r11
1265 addq %rax,%r15
1266 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001267 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001268 addq %r10,%r15
1269 adcq %rdx,%r9
1270 movq %r13,%r10
1271 movq %r14,%r11
1272 movq %r15,%r12
1273 andq $3,%r12
1274 movq %r15,%r13
1275 andq $-4,%r13
1276 movq %r9,%r14
1277 shrdq $2,%r9,%r15
1278 shrq $2,%r9
1279 addq %r13,%r10
1280 adcq %r14,%r11
1281 adcq $0,%r12
1282 addq %r15,%r10
1283 adcq %r9,%r11
1284 adcq $0,%r12
1285
12861:
1287 paddd .chacha20_consts(%rip),%xmm2
1288 paddd 48(%rbp),%xmm6
1289 paddd 64(%rbp),%xmm10
1290 paddd 128(%rbp),%xmm14
1291 paddd .chacha20_consts(%rip),%xmm1
1292 paddd 48(%rbp),%xmm5
1293 paddd 64(%rbp),%xmm9
1294 paddd 112(%rbp),%xmm13
1295 paddd .chacha20_consts(%rip),%xmm0
1296 paddd 48(%rbp),%xmm4
1297 paddd 64(%rbp),%xmm8
1298 paddd 96(%rbp),%xmm12
1299 movdqu 0 + 0(%rsi),%xmm3
1300 movdqu 16 + 0(%rsi),%xmm7
1301 movdqu 32 + 0(%rsi),%xmm11
1302 movdqu 48 + 0(%rsi),%xmm15
1303 pxor %xmm3,%xmm2
1304 pxor %xmm7,%xmm6
1305 pxor %xmm11,%xmm10
1306 pxor %xmm14,%xmm15
1307 movdqu %xmm2,0 + 0(%rdi)
1308 movdqu %xmm6,16 + 0(%rdi)
1309 movdqu %xmm10,32 + 0(%rdi)
1310 movdqu %xmm15,48 + 0(%rdi)
1311 movdqu 0 + 64(%rsi),%xmm3
1312 movdqu 16 + 64(%rsi),%xmm7
1313 movdqu 32 + 64(%rsi),%xmm11
1314 movdqu 48 + 64(%rsi),%xmm15
1315 pxor %xmm3,%xmm1
1316 pxor %xmm7,%xmm5
1317 pxor %xmm11,%xmm9
1318 pxor %xmm13,%xmm15
1319 movdqu %xmm1,0 + 64(%rdi)
1320 movdqu %xmm5,16 + 64(%rdi)
1321 movdqu %xmm9,32 + 64(%rdi)
1322 movdqu %xmm15,48 + 64(%rdi)
1323
1324 subq $128,%rbx
1325 leaq 128(%rsi),%rsi
1326 leaq 128(%rdi),%rdi
1327 jmp open_sse_tail_64_dec_loop
/*
 * Local label 3: four-block tail of the SSE open path.
 *
 * Builds four ChaCha20 states from the saved key rows at 48(%rbp)/64(%rbp)
 * and the counter row at 96(%rbp), runs ten double rounds over all four
 * while absorbing ciphertext from (%rsi) into the Poly1305 accumulator
 * (%r10:%r11:%r12), then decrypts 192 bytes from (%rsi) to (%rdi).
 * The fourth keystream block is left in xmm0/xmm4/xmm8/xmm12 and control
 * falls through to open_sse_tail_64_dec_loop to consume it.
 *
 * In:    rsi = input, rdi = output, rbx = bytes remaining,
 *        0(%rbp)/8(%rbp) = Poly1305 r (two 64-bit limbs).
 * Uses:  rax, rcx, rdx, r8, r9, r13-r15, xmm0-xmm15, 80(%rbp) as spill slot.
 * NOTE(review): the dispatch into this label is outside this view; the
 * second hashing loop below tests its bound only after hashing one block,
 * so this path presumably requires rbx > 192 - confirm against the caller.
 */
3:

	/* Replicate constants and key rows into four states. */
	movdqa .chacha20_consts(%rip),%xmm0
	movdqa 48(%rbp),%xmm4
	movdqa 64(%rbp),%xmm8
	movdqa %xmm0,%xmm1
	movdqa %xmm4,%xmm5
	movdqa %xmm8,%xmm9
	movdqa %xmm0,%xmm2
	movdqa %xmm4,%xmm6
	movdqa %xmm8,%xmm10
	movdqa %xmm0,%xmm3
	movdqa %xmm4,%xmm7
	movdqa %xmm8,%xmm11
	/* Counter rows: xmm15 = ctr+1, xmm14 = ctr+2, xmm13 = ctr+3, xmm12 = ctr+4. */
	movdqa 96(%rbp),%xmm15
	paddd .sse_inc(%rip),%xmm15
	movdqa %xmm15,%xmm14
	paddd .sse_inc(%rip),%xmm14
	movdqa %xmm14,%xmm13
	paddd .sse_inc(%rip),%xmm13
	movdqa %xmm13,%xmm12
	paddd .sse_inc(%rip),%xmm12
	/* Save the initial counter rows for the final feed-forward addition. */
	movdqa %xmm12,96(%rbp)
	movdqa %xmm13,112(%rbp)
	movdqa %xmm14,128(%rbp)
	movdqa %xmm15,144(%rbp)

	/* r8 = byte offset of the next ciphertext block to hash. */
	xorq %r8,%r8
1:
	/* Absorb 16 bytes of ciphertext plus the 2^128 pad bit. */
	addq 0(%rsi,%r8), %r10
	adcq 8+0(%rsi,%r8), %r11
	adcq $1,%r12
	/* All 16 xmm registers are live: spill xmm11 to use it as a temporary. */
	movdqa %xmm11,80(%rbp)
	/* Column quarter-round, state 0 (xmm0,xmm4,xmm8,xmm12). */
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol16(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm11
	pslld $12,%xmm11
	psrld $20,%xmm4
	pxor %xmm11,%xmm4
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol8(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm11
	pslld $7,%xmm11
	psrld $25,%xmm4
	pxor %xmm11,%xmm4
	/* palignr $4,%xmm4,%xmm4 ; $8,%xmm8,%xmm8 ; $12,%xmm12,%xmm12 (diagonalize). */
.byte 102,15,58,15,228,4
.byte 102,69,15,58,15,192,8
.byte 102,69,15,58,15,228,12
	/* Column quarter-round, state 1 (xmm1,xmm5,xmm9,xmm13). */
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol16(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm11
	pslld $12,%xmm11
	psrld $20,%xmm5
	pxor %xmm11,%xmm5
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol8(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm11
	pslld $7,%xmm11
	psrld $25,%xmm5
	pxor %xmm11,%xmm5
	/* palignr $4,%xmm5,%xmm5 ; $8,%xmm9,%xmm9 ; $12,%xmm13,%xmm13. */
.byte 102,15,58,15,237,4
.byte 102,69,15,58,15,201,8
.byte 102,69,15,58,15,237,12
	/* Column quarter-round, state 2 (xmm2,xmm6,xmm10,xmm14). */
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol16(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm11
	pslld $12,%xmm11
	psrld $20,%xmm6
	pxor %xmm11,%xmm6
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol8(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm11
	pslld $7,%xmm11
	psrld $25,%xmm6
	pxor %xmm11,%xmm6
	/* palignr $4,%xmm6,%xmm6 ; $8,%xmm10,%xmm10 ; $12,%xmm14,%xmm14. */
.byte 102,15,58,15,246,4
.byte 102,69,15,58,15,210,8
.byte 102,69,15,58,15,246,12
	movdqa 80(%rbp),%xmm11
	/* Poly1305 multiply, part 1: r13:r14:r15 = acc * r0 (r0 at 0(%rbp)). */
	movq 0+0(%rbp),%rax
	movq %rax,%r15
	mulq %r10
	movq %rax,%r13
	movq %rdx,%r14
	movq 0+0(%rbp),%rax
	mulq %r11
	imulq %r12,%r15
	addq %rax,%r14
	adcq %rdx,%r15
	/* Column quarter-round, state 3 (xmm3,xmm7,xmm11,xmm15); xmm9 is the temporary. */
	movdqa %xmm9,80(%rbp)
	paddd %xmm7,%xmm3
	pxor %xmm3,%xmm15
	pshufb .rol16(%rip),%xmm15
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm9
	pslld $12,%xmm9
	psrld $20,%xmm7
	pxor %xmm9,%xmm7
	paddd %xmm7,%xmm3
	pxor %xmm3,%xmm15
	pshufb .rol8(%rip),%xmm15
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm9
	pslld $7,%xmm9
	psrld $25,%xmm7
	pxor %xmm9,%xmm7
	/* palignr $4,%xmm7,%xmm7 ; $8,%xmm11,%xmm11 ; $12,%xmm15,%xmm15. */
.byte 102,15,58,15,255,4
.byte 102,69,15,58,15,219,8
.byte 102,69,15,58,15,255,12
	movdqa 80(%rbp),%xmm9
	/* Poly1305 multiply, part 2: accumulate acc * r1 (r1 at 8(%rbp)). */
	movq 8+0(%rbp),%rax
	movq %rax,%r9
	mulq %r10
	addq %rax,%r14
	adcq $0,%rdx
	movq %rdx,%r10
	movq 8+0(%rbp),%rax
	mulq %r11
	addq %rax,%r15
	adcq $0,%rdx
	/* Diagonal quarter-round, state 0. */
	movdqa %xmm11,80(%rbp)
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol16(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm11
	pslld $12,%xmm11
	psrld $20,%xmm4
	pxor %xmm11,%xmm4
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol8(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm11
	pslld $7,%xmm11
	psrld $25,%xmm4
	pxor %xmm11,%xmm4
	/* palignr $12,%xmm4,%xmm4 ; $8,%xmm8,%xmm8 ; $4,%xmm12,%xmm12 (undo diagonalization). */
.byte 102,15,58,15,228,12
.byte 102,69,15,58,15,192,8
.byte 102,69,15,58,15,228,4
	/* Diagonal quarter-round, state 1. */
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol16(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm11
	pslld $12,%xmm11
	psrld $20,%xmm5
	pxor %xmm11,%xmm5
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol8(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm11
	pslld $7,%xmm11
	psrld $25,%xmm5
	pxor %xmm11,%xmm5
	/* palignr $12,%xmm5,%xmm5 ; $8,%xmm9,%xmm9 ; $4,%xmm13,%xmm13. */
.byte 102,15,58,15,237,12
.byte 102,69,15,58,15,201,8
.byte 102,69,15,58,15,237,4
	/* Poly1305 multiply, part 3: fold in the top limb product. */
	imulq %r12,%r9
	addq %r10,%r15
	adcq %rdx,%r9
	/* Diagonal quarter-round, state 2. */
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol16(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm11
	pslld $12,%xmm11
	psrld $20,%xmm6
	pxor %xmm11,%xmm6
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol8(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm11
	pslld $7,%xmm11
	psrld $25,%xmm6
	pxor %xmm11,%xmm6
	/* palignr $12,%xmm6,%xmm6 ; $8,%xmm10,%xmm10 ; $4,%xmm14,%xmm14. */
.byte 102,15,58,15,246,12
.byte 102,69,15,58,15,210,8
.byte 102,69,15,58,15,246,4
	movdqa 80(%rbp),%xmm11
	/*
	 * Poly1305 reduction mod 2^130 - 5: keep the low 130 bits in
	 * r10:r11:r12 and add 5 * (product >> 130), computed as 4*c + c.
	 */
	movq %r13,%r10
	movq %r14,%r11
	movq %r15,%r12
	andq $3,%r12
	movq %r15,%r13
	andq $-4,%r13
	movq %r9,%r14
	shrdq $2,%r9,%r15
	shrq $2,%r9
	addq %r13,%r10
	adcq %r14,%r11
	adcq $0,%r12
	addq %r15,%r10
	adcq %r9,%r11
	adcq $0,%r12
	/* Diagonal quarter-round, state 3; xmm9 is the temporary. */
	movdqa %xmm9,80(%rbp)
	paddd %xmm7,%xmm3
	pxor %xmm3,%xmm15
	pshufb .rol16(%rip),%xmm15
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm9
	pslld $12,%xmm9
	psrld $20,%xmm7
	pxor %xmm9,%xmm7
	paddd %xmm7,%xmm3
	pxor %xmm3,%xmm15
	pshufb .rol8(%rip),%xmm15
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm9
	pslld $7,%xmm9
	psrld $25,%xmm7
	pxor %xmm9,%xmm7
	/* palignr $12,%xmm7,%xmm7 ; $8,%xmm11,%xmm11 ; $4,%xmm15,%xmm15. */
.byte 102,15,58,15,255,12
.byte 102,69,15,58,15,219,8
.byte 102,69,15,58,15,255,4
	movdqa 80(%rbp),%xmm9

	/* Ten iterations (16 bytes hashed each) complete the ten double rounds. */
	addq $16,%r8
	cmpq $160,%r8
	jb 1b
	/* rcx = rbx rounded down to a multiple of 16: end offset for hashing. */
	movq %rbx,%rcx
	andq $-16,%rcx
1:
	/* Hash the remaining whole 16-byte blocks (no ChaCha work left to overlap). */
	addq 0(%rsi,%r8), %r10
	adcq 8+0(%rsi,%r8), %r11
	adcq $1,%r12
	movq 0+0(%rbp),%rax
	movq %rax,%r15
	mulq %r10
	movq %rax,%r13
	movq %rdx,%r14
	movq 0+0(%rbp),%rax
	mulq %r11
	imulq %r12,%r15
	addq %rax,%r14
	adcq %rdx,%r15
	movq 8+0(%rbp),%rax
	movq %rax,%r9
	mulq %r10
	addq %rax,%r14
	adcq $0,%rdx
	movq %rdx,%r10
	movq 8+0(%rbp),%rax
	mulq %r11
	addq %rax,%r15
	adcq $0,%rdx
	imulq %r12,%r9
	addq %r10,%r15
	adcq %rdx,%r9
	movq %r13,%r10
	movq %r14,%r11
	movq %r15,%r12
	andq $3,%r12
	movq %r15,%r13
	andq $-4,%r13
	movq %r9,%r14
	shrdq $2,%r9,%r15
	shrq $2,%r9
	addq %r13,%r10
	adcq %r14,%r11
	adcq $0,%r12
	addq %r15,%r10
	adcq %r9,%r11
	adcq $0,%r12

	addq $16,%r8
	cmpq %rcx,%r8
	jb 1b
	/* Feed-forward: add the initial state rows back into each block. */
	paddd .chacha20_consts(%rip),%xmm3
	paddd 48(%rbp),%xmm7
	paddd 64(%rbp),%xmm11
	paddd 144(%rbp),%xmm15
	paddd .chacha20_consts(%rip),%xmm2
	paddd 48(%rbp),%xmm6
	paddd 64(%rbp),%xmm10
	paddd 128(%rbp),%xmm14
	paddd .chacha20_consts(%rip),%xmm1
	paddd 48(%rbp),%xmm5
	paddd 64(%rbp),%xmm9
	paddd 112(%rbp),%xmm13
	paddd .chacha20_consts(%rip),%xmm0
	paddd 48(%rbp),%xmm4
	paddd 64(%rbp),%xmm8
	paddd 96(%rbp),%xmm12
	/* Park xmm12 so it can serve as the load temporary for bytes 0..63. */
	movdqa %xmm12,80(%rbp)
	/* Bytes 0..63: keystream block with the lowest counter (xmm3/7/11/15). */
	movdqu 0 + 0(%rsi),%xmm12
	pxor %xmm3,%xmm12
	movdqu %xmm12,0 + 0(%rdi)
	movdqu 16 + 0(%rsi),%xmm12
	pxor %xmm7,%xmm12
	movdqu %xmm12,16 + 0(%rdi)
	movdqu 32 + 0(%rsi),%xmm12
	pxor %xmm11,%xmm12
	movdqu %xmm12,32 + 0(%rdi)
	movdqu 48 + 0(%rsi),%xmm12
	pxor %xmm15,%xmm12
	movdqu %xmm12,48 + 0(%rdi)
	/* Bytes 64..127: block xmm2/6/10/14. */
	movdqu 0 + 64(%rsi),%xmm3
	movdqu 16 + 64(%rsi),%xmm7
	movdqu 32 + 64(%rsi),%xmm11
	movdqu 48 + 64(%rsi),%xmm15
	pxor %xmm3,%xmm2
	pxor %xmm7,%xmm6
	pxor %xmm11,%xmm10
	pxor %xmm14,%xmm15
	movdqu %xmm2,0 + 64(%rdi)
	movdqu %xmm6,16 + 64(%rdi)
	movdqu %xmm10,32 + 64(%rdi)
	movdqu %xmm15,48 + 64(%rdi)
	/* Bytes 128..191: block xmm1/5/9/13. */
	movdqu 0 + 128(%rsi),%xmm3
	movdqu 16 + 128(%rsi),%xmm7
	movdqu 32 + 128(%rsi),%xmm11
	movdqu 48 + 128(%rsi),%xmm15
	pxor %xmm3,%xmm1
	pxor %xmm7,%xmm5
	pxor %xmm11,%xmm9
	pxor %xmm13,%xmm15
	movdqu %xmm1,0 + 128(%rdi)
	movdqu %xmm5,16 + 128(%rdi)
	movdqu %xmm9,32 + 128(%rdi)
	movdqu %xmm15,48 + 128(%rdi)

	/* Restore the last row of the fourth block and account for 192 bytes. */
	movdqa 80(%rbp),%xmm12
	subq $192,%rbx
	leaq 192(%rsi),%rsi
	leaq 192(%rdi),%rdi
1684
1685
/*
 * open_sse_tail_64_dec_loop: decrypt whole 16-byte blocks from a single
 * keystream block held in xmm0, xmm4, xmm8, xmm12 (consumed in that order).
 *
 * In:   rsi = input, rdi = output, rbx = bytes remaining.
 * Out:  rsi/rdi advanced, rbx < 16, xmm1 = next unused 16 keystream bytes.
 * No Poly1305 work happens here; only xmm0/xmm3/xmm4/xmm8 are written.
 * Falls through to open_sse_tail_16.
 */
open_sse_tail_64_dec_loop:
	cmpq $16,%rbx
	jb 1f
	subq $16,%rbx
	movdqu (%rsi),%xmm3
	pxor %xmm3,%xmm0
	movdqu %xmm0,(%rdi)
	leaq 16(%rsi),%rsi
	leaq 16(%rdi),%rdi
	/* Shift the keystream queue down one slot: xmm0 <- xmm4 <- xmm8 <- xmm12. */
	movdqa %xmm4,%xmm0
	movdqa %xmm8,%xmm4
	movdqa %xmm12,%xmm8
	jmp open_sse_tail_64_dec_loop
1:
	/* open_sse_tail_16 takes its keystream from xmm1. */
	movdqa %xmm0,%xmm1
1701
1702
/*
 * open_sse_tail_16: handle the final partial block (rbx < 16 bytes).
 *
 * In:   rsi = input, rdi = output, rbx = bytes remaining,
 *       xmm1 = 16 bytes of keystream, r10:r11:r12 = Poly1305 accumulator,
 *       0(%rbp)/8(%rbp) = Poly1305 r.
 * If rbx is zero this jumps straight to open_sse_finalize; otherwise the
 * remaining input bytes are gathered zero-padded into xmm3, hashed as one
 * 16-byte block, XORed with xmm1 and written out byte by byte, and control
 * falls through to open_sse_finalize.
 */
open_sse_tail_16:
	testq %rbx,%rbx
	jz open_sse_finalize



	/*
	 * Build the block back to front: start at the last input byte and
	 * shift each earlier byte into lane 0, so xmm3 ends up holding the
	 * bytes in memory order with zeros above them.
	 */
	pxor %xmm3,%xmm3
	leaq -1(%rsi,%rbx), %rsi
	movq %rbx,%r8
2:
	pslldq $1,%xmm3
	pinsrb $0,(%rsi),%xmm3
	subq $1,%rsi
	subq $1,%r8
	jnz 2b

3:
	/* movq %xmm3,%r13 (byte-encoded); r13:r14 = padded ciphertext block. */
.byte 102,73,15,126,221
	pextrq $1,%xmm3,%r14

	pxor %xmm1,%xmm3


	/* Emit exactly rbx output bytes. */
2:
	pextrb $0,%xmm3,(%rdi)
	psrldq $1,%xmm3
	addq $1,%rdi
	subq $1,%rbx
	jne 2b

	/* Absorb the zero-padded block plus the 2^128 pad bit, then acc *= r. */
	addq %r13,%r10
	adcq %r14,%r11
	adcq $1,%r12
	movq 0+0(%rbp),%rax
	movq %rax,%r15
	mulq %r10
	movq %rax,%r13
	movq %rdx,%r14
	movq 0+0(%rbp),%rax
	mulq %r11
	imulq %r12,%r15
	addq %rax,%r14
	adcq %rdx,%r15
	movq 8+0(%rbp),%rax
	movq %rax,%r9
	mulq %r10
	addq %rax,%r14
	adcq $0,%rdx
	movq %rdx,%r10
	movq 8+0(%rbp),%rax
	mulq %r11
	addq %rax,%r15
	adcq $0,%rdx
	imulq %r12,%r9
	addq %r10,%r15
	adcq %rdx,%r9
	/* Reduce mod 2^130 - 5: low 130 bits plus 5 * (product >> 130). */
	movq %r13,%r10
	movq %r14,%r11
	movq %r15,%r12
	andq $3,%r12
	movq %r15,%r13
	andq $-4,%r13
	movq %r9,%r14
	shrdq $2,%r9,%r15
	shrq $2,%r9
	addq %r13,%r10
	adcq %r14,%r11
	adcq $0,%r12
	addq %r15,%r10
	adcq %r9,%r11
	adcq $0,%r12
1774
1775
/*
 * open_sse_finalize: finish the Poly1305 tag and return.
 *
 * In:   r10:r11:r12 = Poly1305 accumulator,
 *       0(%rbp)  = r (two 64-bit limbs), 16(%rbp) = s,
 *       32(%rbp) = 16-byte length block (presumably AD length then data
 *                  length, as the seal prologue stores them - the open
 *                  prologue is outside this view).
 * Out:  16-byte tag written through the pointer popped into r9 (the value
 *       pushed last by the prologue); callee-saved registers restored.
 */
open_sse_finalize:
	/* Absorb the length block plus the 2^128 pad bit, then acc *= r. */
	addq 32(%rbp),%r10
	adcq 8+32(%rbp),%r11
	adcq $1,%r12
	movq 0+0(%rbp),%rax
	movq %rax,%r15
	mulq %r10
	movq %rax,%r13
	movq %rdx,%r14
	movq 0+0(%rbp),%rax
	mulq %r11
	imulq %r12,%r15
	addq %rax,%r14
	adcq %rdx,%r15
	movq 8+0(%rbp),%rax
	movq %rax,%r9
	mulq %r10
	addq %rax,%r14
	adcq $0,%rdx
	movq %rdx,%r10
	movq 8+0(%rbp),%rax
	mulq %r11
	addq %rax,%r15
	adcq $0,%rdx
	imulq %r12,%r9
	addq %r10,%r15
	adcq %rdx,%r9
	/* Reduce mod 2^130 - 5: low 130 bits plus 5 * (product >> 130). */
	movq %r13,%r10
	movq %r14,%r11
	movq %r15,%r12
	andq $3,%r12
	movq %r15,%r13
	andq $-4,%r13
	movq %r9,%r14
	shrdq $2,%r9,%r15
	shrq $2,%r9
	addq %r13,%r10
	adcq %r14,%r11
	adcq $0,%r12
	addq %r15,%r10
	adcq %r9,%r11
	adcq $0,%r12


	/*
	 * Final conditional subtraction of p = 2^130 - 5 (limbs 3 : -1 : -5).
	 * A borrow means acc < p, so the saved copy is restored via cmovc.
	 */
	movq %r10,%r13
	movq %r11,%r14
	movq %r12,%r15
	subq $-5,%r10
	sbbq $-1,%r11
	sbbq $3,%r12
	cmovcq %r13,%r10
	cmovcq %r14,%r11
	cmovcq %r15,%r12

	/* tag = (acc + s) mod 2^128. */
	addq 0+16(%rbp),%r10
	adcq 8+16(%rbp),%r11

	/* Release the scratch area, then recover the saved output pointer. */
	addq $288 + 32,%rsp
.cfi_adjust_cfa_offset -(288 + 32)
	popq %r9
.cfi_adjust_cfa_offset -8
	movq %r10,(%r9)
	movq %r11,8(%r9)

	popq %r15
.cfi_adjust_cfa_offset -8
	popq %r14
.cfi_adjust_cfa_offset -8
	popq %r13
.cfi_adjust_cfa_offset -8
	popq %r12
.cfi_adjust_cfa_offset -8
	popq %rbx
.cfi_adjust_cfa_offset -8
	popq %rbp
.cfi_adjust_cfa_offset -8
	/* rep ret */
	.byte 0xf3,0xc3
	/*
	 * The code that follows is entered by jumps taken while the frame is
	 * still fully set up, so give back everything the epilogue removed:
	 * seven 8-byte pops (r9, r15, r14, r13, r12, rbx, rbp) plus the
	 * 288 + 32 byte scratch area.
	 */
.cfi_adjust_cfa_offset (8 * 7) + 288 + 32
1854
/*
 * open_sse_128: short-input open path using three ChaCha20 blocks.
 *
 * In:   r9  = pointer to 48 bytes of state: 0(%r9) and 16(%r9) are the two
 *             key rows, 32(%r9) the counter/nonce row,
 *       rsi = input, rdi = output, rbx = bytes remaining,
 *       r8  = passed through unchanged to poly_hash_ad_internal
 *             (presumably the AD length - callee is outside this view).
 * Block 0 (counter as given) supplies the Poly1305 key: its first 16 bytes
 * are clamped into r at 0(%rbp) and the next 16 become s at 16(%rbp).
 * Blocks 1 and 2 (counter+1, counter+2) supply up to 128 bytes of
 * keystream, queued as xmm1,xmm5,xmm9,xmm13,xmm2,xmm6,xmm10,xmm14.
 * Exits through open_sse_tail_16 once fewer than 16 bytes remain.
 * NOTE(review): the dispatch that selects this path is not visible here;
 * the keystream queue only covers 128 bytes, so callers must not exceed it.
 */
open_sse_128:
	movdqu .chacha20_consts(%rip),%xmm0
	movdqa %xmm0,%xmm1
	movdqa %xmm0,%xmm2
	movdqu 0(%r9),%xmm4
	movdqa %xmm4,%xmm5
	movdqa %xmm4,%xmm6
	movdqu 16(%r9),%xmm8
	movdqa %xmm8,%xmm9
	movdqa %xmm8,%xmm10
	movdqu 32(%r9),%xmm12
	movdqa %xmm12,%xmm13
	paddd .sse_inc(%rip),%xmm13
	movdqa %xmm13,%xmm14
	paddd .sse_inc(%rip),%xmm14
	/* Keep the initial rows for the feed-forward: xmm7/xmm11 = key, xmm15 = counter+1. */
	movdqa %xmm4,%xmm7
	movdqa %xmm8,%xmm11
	movdqa %xmm13,%xmm15
	/* r10 = double-round counter (ten double rounds). */
	movq $10,%r10
1:
	/* Column quarter-round, block 0; xmm3 is the temporary throughout. */
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol16(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm3
	pslld $12,%xmm3
	psrld $20,%xmm4
	pxor %xmm3,%xmm4
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol8(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm3
	pslld $7,%xmm3
	psrld $25,%xmm4
	pxor %xmm3,%xmm4
	/* palignr $4,%xmm4,%xmm4 ; $8,%xmm8,%xmm8 ; $12,%xmm12,%xmm12 (diagonalize). */
.byte 102,15,58,15,228,4
.byte 102,69,15,58,15,192,8
.byte 102,69,15,58,15,228,12
	/* Column quarter-round, block 1. */
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol16(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm3
	pslld $12,%xmm3
	psrld $20,%xmm5
	pxor %xmm3,%xmm5
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol8(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm3
	pslld $7,%xmm3
	psrld $25,%xmm5
	pxor %xmm3,%xmm5
	/* palignr $4,%xmm5,%xmm5 ; $8,%xmm9,%xmm9 ; $12,%xmm13,%xmm13. */
.byte 102,15,58,15,237,4
.byte 102,69,15,58,15,201,8
.byte 102,69,15,58,15,237,12
	/* Column quarter-round, block 2. */
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol16(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm3
	pslld $12,%xmm3
	psrld $20,%xmm6
	pxor %xmm3,%xmm6
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol8(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm3
	pslld $7,%xmm3
	psrld $25,%xmm6
	pxor %xmm3,%xmm6
	/* palignr $4,%xmm6,%xmm6 ; $8,%xmm10,%xmm10 ; $12,%xmm14,%xmm14. */
.byte 102,15,58,15,246,4
.byte 102,69,15,58,15,210,8
.byte 102,69,15,58,15,246,12
	/* Diagonal quarter-round, block 0. */
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol16(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm3
	pslld $12,%xmm3
	psrld $20,%xmm4
	pxor %xmm3,%xmm4
	paddd %xmm4,%xmm0
	pxor %xmm0,%xmm12
	pshufb .rol8(%rip),%xmm12
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm3
	pslld $7,%xmm3
	psrld $25,%xmm4
	pxor %xmm3,%xmm4
	/* palignr $12,%xmm4,%xmm4 ; $8,%xmm8,%xmm8 ; $4,%xmm12,%xmm12 (undo diagonalization). */
.byte 102,15,58,15,228,12
.byte 102,69,15,58,15,192,8
.byte 102,69,15,58,15,228,4
	/* Diagonal quarter-round, block 1. */
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol16(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm3
	pslld $12,%xmm3
	psrld $20,%xmm5
	pxor %xmm3,%xmm5
	paddd %xmm5,%xmm1
	pxor %xmm1,%xmm13
	pshufb .rol8(%rip),%xmm13
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm3
	pslld $7,%xmm3
	psrld $25,%xmm5
	pxor %xmm3,%xmm5
	/* palignr $12,%xmm5,%xmm5 ; $8,%xmm9,%xmm9 ; $4,%xmm13,%xmm13. */
.byte 102,15,58,15,237,12
.byte 102,69,15,58,15,201,8
.byte 102,69,15,58,15,237,4
	/* Diagonal quarter-round, block 2. */
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol16(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm3
	pslld $12,%xmm3
	psrld $20,%xmm6
	pxor %xmm3,%xmm6
	paddd %xmm6,%xmm2
	pxor %xmm2,%xmm14
	pshufb .rol8(%rip),%xmm14
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm3
	pslld $7,%xmm3
	psrld $25,%xmm6
	pxor %xmm3,%xmm6
	/* palignr $12,%xmm6,%xmm6 ; $8,%xmm10,%xmm10 ; $4,%xmm14,%xmm14. */
.byte 102,15,58,15,246,12
.byte 102,69,15,58,15,210,8
.byte 102,69,15,58,15,246,4

	decq %r10
	jnz 1b
	/*
	 * Feed-forward. Only the rows that are used later are completed:
	 * block 0 needs just its first two rows (the Poly1305 key), so xmm8
	 * and xmm12 are left as they are.
	 */
	paddd .chacha20_consts(%rip),%xmm0
	paddd .chacha20_consts(%rip),%xmm1
	paddd .chacha20_consts(%rip),%xmm2
	paddd %xmm7,%xmm4
	paddd %xmm7,%xmm5
	paddd %xmm7,%xmm6
	paddd %xmm11,%xmm9
	paddd %xmm11,%xmm10
	paddd %xmm15,%xmm13
	paddd .sse_inc(%rip),%xmm15
	paddd %xmm15,%xmm14

	/* Poly1305 key: r = clamp(block0[0..15]), s = block0[16..31]. */
	pand .clamp(%rip),%xmm0
	movdqa %xmm0,0(%rbp)
	movdqa %xmm4,16(%rbp)

	/*
	 * Hash the additional data. The callee presumably also initializes
	 * the accumulator in r10:r11:r12 (r10 was the round counter above) -
	 * TODO confirm against poly_hash_ad_internal.
	 */
	call poly_hash_ad_internal
1:
	cmpq $16,%rbx
	jb open_sse_tail_16
	subq $16,%rbx
	/* Absorb the ciphertext block before it is decrypted. */
	addq 0(%rsi),%r10
	adcq 8+0(%rsi),%r11
	adcq $1,%r12


	movdqu 0(%rsi),%xmm3
	pxor %xmm3,%xmm1
	movdqu %xmm1,0(%rdi)
	leaq 16(%rsi),%rsi
	leaq 16(%rdi),%rdi
	/* acc *= r, then reduce mod 2^130 - 5. */
	movq 0+0(%rbp),%rax
	movq %rax,%r15
	mulq %r10
	movq %rax,%r13
	movq %rdx,%r14
	movq 0+0(%rbp),%rax
	mulq %r11
	imulq %r12,%r15
	addq %rax,%r14
	adcq %rdx,%r15
	movq 8+0(%rbp),%rax
	movq %rax,%r9
	mulq %r10
	addq %rax,%r14
	adcq $0,%rdx
	movq %rdx,%r10
	movq 8+0(%rbp),%rax
	mulq %r11
	addq %rax,%r15
	adcq $0,%rdx
	imulq %r12,%r9
	addq %r10,%r15
	adcq %rdx,%r9
	movq %r13,%r10
	movq %r14,%r11
	movq %r15,%r12
	andq $3,%r12
	movq %r15,%r13
	andq $-4,%r13
	movq %r9,%r14
	shrdq $2,%r9,%r15
	shrq $2,%r9
	addq %r13,%r10
	adcq %r14,%r11
	adcq $0,%r12
	addq %r15,%r10
	adcq %r9,%r11
	adcq $0,%r12


	/* Advance the keystream queue so xmm1 holds the next 16 bytes. */
	movdqa %xmm5,%xmm1
	movdqa %xmm9,%xmm5
	movdqa %xmm13,%xmm9
	movdqa %xmm2,%xmm13
	movdqa %xmm6,%xmm2
	movdqa %xmm10,%xmm6
	movdqa %xmm14,%xmm10
	jmp 1b
2085
2086.cfi_endproc
2087
2088
2089
2090
2091.globl _chacha20_poly1305_seal
2092.private_extern _chacha20_poly1305_seal
2093
2094.p2align 6
2095_chacha20_poly1305_seal:
2096.cfi_startproc
2097 pushq %rbp
2098.cfi_adjust_cfa_offset 8
2099 pushq %rbx
2100.cfi_adjust_cfa_offset 8
2101 pushq %r12
2102.cfi_adjust_cfa_offset 8
2103 pushq %r13
2104.cfi_adjust_cfa_offset 8
2105 pushq %r14
2106.cfi_adjust_cfa_offset 8
2107 pushq %r15
2108.cfi_adjust_cfa_offset 8
2109
2110
2111 pushq %r9
2112.cfi_adjust_cfa_offset 8
2113 subq $288 + 32,%rsp
2114.cfi_adjust_cfa_offset 288 + 32
2115.cfi_offset rbp, -16
2116.cfi_offset rbx, -24
2117.cfi_offset r12, -32
2118.cfi_offset r13, -40
2119.cfi_offset r14, -48
2120.cfi_offset r15, -56
David Benjaminf31229b2017-01-25 14:08:15 -05002121 leaq 32(%rsp),%rbp
2122 andq $-32,%rbp
2123 movq %rdx,8+32(%rbp)
2124 movq %r8,0+32(%rbp)
2125 movq %rdx,%rbx
2126
2127 movl _OPENSSL_ia32cap_P+8(%rip),%eax
2128 andl $288,%eax
2129 xorl $288,%eax
2130 jz chacha20_poly1305_seal_avx2
2131
2132 cmpq $128,%rbx
2133 jbe seal_sse_128
2134
2135 movdqa .chacha20_consts(%rip),%xmm0
2136 movdqu 0(%r9),%xmm4
2137 movdqu 16(%r9),%xmm8
2138 movdqu 32(%r9),%xmm12
2139 movdqa %xmm0,%xmm1
2140 movdqa %xmm0,%xmm2
2141 movdqa %xmm0,%xmm3
2142 movdqa %xmm4,%xmm5
2143 movdqa %xmm4,%xmm6
2144 movdqa %xmm4,%xmm7
2145 movdqa %xmm8,%xmm9
2146 movdqa %xmm8,%xmm10
2147 movdqa %xmm8,%xmm11
2148 movdqa %xmm12,%xmm15
2149 paddd .sse_inc(%rip),%xmm12
2150 movdqa %xmm12,%xmm14
2151 paddd .sse_inc(%rip),%xmm12
2152 movdqa %xmm12,%xmm13
2153 paddd .sse_inc(%rip),%xmm12
2154
2155 movdqa %xmm4,48(%rbp)
2156 movdqa %xmm8,64(%rbp)
2157 movdqa %xmm12,96(%rbp)
2158 movdqa %xmm13,112(%rbp)
2159 movdqa %xmm14,128(%rbp)
2160 movdqa %xmm15,144(%rbp)
2161 movq $10,%r10
21621:
2163 movdqa %xmm8,80(%rbp)
2164 movdqa .rol16(%rip),%xmm8
2165 paddd %xmm7,%xmm3
2166 paddd %xmm6,%xmm2
2167 paddd %xmm5,%xmm1
2168 paddd %xmm4,%xmm0
2169 pxor %xmm3,%xmm15
2170 pxor %xmm2,%xmm14
2171 pxor %xmm1,%xmm13
2172 pxor %xmm0,%xmm12
2173.byte 102,69,15,56,0,248
2174.byte 102,69,15,56,0,240
2175.byte 102,69,15,56,0,232
2176.byte 102,69,15,56,0,224
2177 movdqa 80(%rbp),%xmm8
2178 paddd %xmm15,%xmm11
2179 paddd %xmm14,%xmm10
2180 paddd %xmm13,%xmm9
2181 paddd %xmm12,%xmm8
2182 pxor %xmm11,%xmm7
2183 pxor %xmm10,%xmm6
2184 pxor %xmm9,%xmm5
2185 pxor %xmm8,%xmm4
2186 movdqa %xmm8,80(%rbp)
2187 movdqa %xmm7,%xmm8
2188 psrld $20,%xmm8
2189 pslld $32-20,%xmm7
2190 pxor %xmm8,%xmm7
2191 movdqa %xmm6,%xmm8
2192 psrld $20,%xmm8
2193 pslld $32-20,%xmm6
2194 pxor %xmm8,%xmm6
2195 movdqa %xmm5,%xmm8
2196 psrld $20,%xmm8
2197 pslld $32-20,%xmm5
2198 pxor %xmm8,%xmm5
2199 movdqa %xmm4,%xmm8
2200 psrld $20,%xmm8
2201 pslld $32-20,%xmm4
2202 pxor %xmm8,%xmm4
2203 movdqa .rol8(%rip),%xmm8
2204 paddd %xmm7,%xmm3
2205 paddd %xmm6,%xmm2
2206 paddd %xmm5,%xmm1
2207 paddd %xmm4,%xmm0
2208 pxor %xmm3,%xmm15
2209 pxor %xmm2,%xmm14
2210 pxor %xmm1,%xmm13
2211 pxor %xmm0,%xmm12
2212.byte 102,69,15,56,0,248
2213.byte 102,69,15,56,0,240
2214.byte 102,69,15,56,0,232
2215.byte 102,69,15,56,0,224
2216 movdqa 80(%rbp),%xmm8
2217 paddd %xmm15,%xmm11
2218 paddd %xmm14,%xmm10
2219 paddd %xmm13,%xmm9
2220 paddd %xmm12,%xmm8
2221 pxor %xmm11,%xmm7
2222 pxor %xmm10,%xmm6
2223 pxor %xmm9,%xmm5
2224 pxor %xmm8,%xmm4
2225 movdqa %xmm8,80(%rbp)
2226 movdqa %xmm7,%xmm8
2227 psrld $25,%xmm8
2228 pslld $32-25,%xmm7
2229 pxor %xmm8,%xmm7
2230 movdqa %xmm6,%xmm8
2231 psrld $25,%xmm8
2232 pslld $32-25,%xmm6
2233 pxor %xmm8,%xmm6
2234 movdqa %xmm5,%xmm8
2235 psrld $25,%xmm8
2236 pslld $32-25,%xmm5
2237 pxor %xmm8,%xmm5
2238 movdqa %xmm4,%xmm8
2239 psrld $25,%xmm8
2240 pslld $32-25,%xmm4
2241 pxor %xmm8,%xmm4
2242 movdqa 80(%rbp),%xmm8
2243.byte 102,15,58,15,255,4
2244.byte 102,69,15,58,15,219,8
2245.byte 102,69,15,58,15,255,12
2246.byte 102,15,58,15,246,4
2247.byte 102,69,15,58,15,210,8
2248.byte 102,69,15,58,15,246,12
2249.byte 102,15,58,15,237,4
2250.byte 102,69,15,58,15,201,8
2251.byte 102,69,15,58,15,237,12
2252.byte 102,15,58,15,228,4
2253.byte 102,69,15,58,15,192,8
2254.byte 102,69,15,58,15,228,12
2255 movdqa %xmm8,80(%rbp)
2256 movdqa .rol16(%rip),%xmm8
2257 paddd %xmm7,%xmm3
2258 paddd %xmm6,%xmm2
2259 paddd %xmm5,%xmm1
2260 paddd %xmm4,%xmm0
2261 pxor %xmm3,%xmm15
2262 pxor %xmm2,%xmm14
2263 pxor %xmm1,%xmm13
2264 pxor %xmm0,%xmm12
2265.byte 102,69,15,56,0,248
2266.byte 102,69,15,56,0,240
2267.byte 102,69,15,56,0,232
2268.byte 102,69,15,56,0,224
2269 movdqa 80(%rbp),%xmm8
2270 paddd %xmm15,%xmm11
2271 paddd %xmm14,%xmm10
2272 paddd %xmm13,%xmm9
2273 paddd %xmm12,%xmm8
2274 pxor %xmm11,%xmm7
2275 pxor %xmm10,%xmm6
2276 pxor %xmm9,%xmm5
2277 pxor %xmm8,%xmm4
2278 movdqa %xmm8,80(%rbp)
2279 movdqa %xmm7,%xmm8
2280 psrld $20,%xmm8
2281 pslld $32-20,%xmm7
2282 pxor %xmm8,%xmm7
2283 movdqa %xmm6,%xmm8
2284 psrld $20,%xmm8
2285 pslld $32-20,%xmm6
2286 pxor %xmm8,%xmm6
2287 movdqa %xmm5,%xmm8
2288 psrld $20,%xmm8
2289 pslld $32-20,%xmm5
2290 pxor %xmm8,%xmm5
2291 movdqa %xmm4,%xmm8
2292 psrld $20,%xmm8
2293 pslld $32-20,%xmm4
2294 pxor %xmm8,%xmm4
2295 movdqa .rol8(%rip),%xmm8
2296 paddd %xmm7,%xmm3
2297 paddd %xmm6,%xmm2
2298 paddd %xmm5,%xmm1
2299 paddd %xmm4,%xmm0
2300 pxor %xmm3,%xmm15
2301 pxor %xmm2,%xmm14
2302 pxor %xmm1,%xmm13
2303 pxor %xmm0,%xmm12
2304.byte 102,69,15,56,0,248
2305.byte 102,69,15,56,0,240
2306.byte 102,69,15,56,0,232
2307.byte 102,69,15,56,0,224
2308 movdqa 80(%rbp),%xmm8
2309 paddd %xmm15,%xmm11
2310 paddd %xmm14,%xmm10
2311 paddd %xmm13,%xmm9
2312 paddd %xmm12,%xmm8
2313 pxor %xmm11,%xmm7
2314 pxor %xmm10,%xmm6
2315 pxor %xmm9,%xmm5
2316 pxor %xmm8,%xmm4
2317 movdqa %xmm8,80(%rbp)
2318 movdqa %xmm7,%xmm8
2319 psrld $25,%xmm8
2320 pslld $32-25,%xmm7
2321 pxor %xmm8,%xmm7
2322 movdqa %xmm6,%xmm8
2323 psrld $25,%xmm8
2324 pslld $32-25,%xmm6
2325 pxor %xmm8,%xmm6
2326 movdqa %xmm5,%xmm8
2327 psrld $25,%xmm8
2328 pslld $32-25,%xmm5
2329 pxor %xmm8,%xmm5
2330 movdqa %xmm4,%xmm8
2331 psrld $25,%xmm8
2332 pslld $32-25,%xmm4
2333 pxor %xmm8,%xmm4
2334 movdqa 80(%rbp),%xmm8
2335.byte 102,15,58,15,255,12
2336.byte 102,69,15,58,15,219,8
2337.byte 102,69,15,58,15,255,4
2338.byte 102,15,58,15,246,12
2339.byte 102,69,15,58,15,210,8
2340.byte 102,69,15,58,15,246,4
2341.byte 102,15,58,15,237,12
2342.byte 102,69,15,58,15,201,8
2343.byte 102,69,15,58,15,237,4
2344.byte 102,15,58,15,228,12
2345.byte 102,69,15,58,15,192,8
2346.byte 102,69,15,58,15,228,4
2347
2348 decq %r10
2349 jnz 1b
2350 paddd .chacha20_consts(%rip),%xmm3
2351 paddd 48(%rbp),%xmm7
2352 paddd 64(%rbp),%xmm11
2353 paddd 144(%rbp),%xmm15
2354 paddd .chacha20_consts(%rip),%xmm2
2355 paddd 48(%rbp),%xmm6
2356 paddd 64(%rbp),%xmm10
2357 paddd 128(%rbp),%xmm14
2358 paddd .chacha20_consts(%rip),%xmm1
2359 paddd 48(%rbp),%xmm5
2360 paddd 64(%rbp),%xmm9
2361 paddd 112(%rbp),%xmm13
2362 paddd .chacha20_consts(%rip),%xmm0
2363 paddd 48(%rbp),%xmm4
2364 paddd 64(%rbp),%xmm8
2365 paddd 96(%rbp),%xmm12
2366
2367
2368 pand .clamp(%rip),%xmm3
2369 movdqa %xmm3,0(%rbp)
2370 movdqa %xmm7,16(%rbp)
2371
2372 movq %r8,%r8
2373 call poly_hash_ad_internal
2374 movdqu 0 + 0(%rsi),%xmm3
2375 movdqu 16 + 0(%rsi),%xmm7
2376 movdqu 32 + 0(%rsi),%xmm11
2377 movdqu 48 + 0(%rsi),%xmm15
2378 pxor %xmm3,%xmm2
2379 pxor %xmm7,%xmm6
2380 pxor %xmm11,%xmm10
2381 pxor %xmm14,%xmm15
2382 movdqu %xmm2,0 + 0(%rdi)
2383 movdqu %xmm6,16 + 0(%rdi)
2384 movdqu %xmm10,32 + 0(%rdi)
2385 movdqu %xmm15,48 + 0(%rdi)
2386 movdqu 0 + 64(%rsi),%xmm3
2387 movdqu 16 + 64(%rsi),%xmm7
2388 movdqu 32 + 64(%rsi),%xmm11
2389 movdqu 48 + 64(%rsi),%xmm15
2390 pxor %xmm3,%xmm1
2391 pxor %xmm7,%xmm5
2392 pxor %xmm11,%xmm9
2393 pxor %xmm13,%xmm15
2394 movdqu %xmm1,0 + 64(%rdi)
2395 movdqu %xmm5,16 + 64(%rdi)
2396 movdqu %xmm9,32 + 64(%rdi)
2397 movdqu %xmm15,48 + 64(%rdi)
2398
2399 cmpq $192,%rbx
2400 ja 1f
2401 movq $128,%rcx
2402 subq $128,%rbx
2403 leaq 128(%rsi),%rsi
2404 jmp seal_sse_128_seal_hash
24051:
2406 movdqu 0 + 128(%rsi),%xmm3
2407 movdqu 16 + 128(%rsi),%xmm7
2408 movdqu 32 + 128(%rsi),%xmm11
2409 movdqu 48 + 128(%rsi),%xmm15
2410 pxor %xmm3,%xmm0
2411 pxor %xmm7,%xmm4
2412 pxor %xmm11,%xmm8
2413 pxor %xmm12,%xmm15
2414 movdqu %xmm0,0 + 128(%rdi)
2415 movdqu %xmm4,16 + 128(%rdi)
2416 movdqu %xmm8,32 + 128(%rdi)
2417 movdqu %xmm15,48 + 128(%rdi)
2418
2419 movq $192,%rcx
2420 subq $192,%rbx
2421 leaq 192(%rsi),%rsi
2422 movq $2,%rcx
2423 movq $8,%r8
2424 cmpq $64,%rbx
2425 jbe seal_sse_tail_64
2426 cmpq $128,%rbx
2427 jbe seal_sse_tail_128
2428 cmpq $192,%rbx
2429 jbe seal_sse_tail_192
2430
24311:
2432 movdqa .chacha20_consts(%rip),%xmm0
2433 movdqa 48(%rbp),%xmm4
2434 movdqa 64(%rbp),%xmm8
2435 movdqa %xmm0,%xmm1
2436 movdqa %xmm4,%xmm5
2437 movdqa %xmm8,%xmm9
2438 movdqa %xmm0,%xmm2
2439 movdqa %xmm4,%xmm6
2440 movdqa %xmm8,%xmm10
2441 movdqa %xmm0,%xmm3
2442 movdqa %xmm4,%xmm7
2443 movdqa %xmm8,%xmm11
2444 movdqa 96(%rbp),%xmm15
2445 paddd .sse_inc(%rip),%xmm15
2446 movdqa %xmm15,%xmm14
2447 paddd .sse_inc(%rip),%xmm14
2448 movdqa %xmm14,%xmm13
2449 paddd .sse_inc(%rip),%xmm13
2450 movdqa %xmm13,%xmm12
2451 paddd .sse_inc(%rip),%xmm12
2452 movdqa %xmm12,96(%rbp)
2453 movdqa %xmm13,112(%rbp)
2454 movdqa %xmm14,128(%rbp)
2455 movdqa %xmm15,144(%rbp)
2456
24572:
2458 movdqa %xmm8,80(%rbp)
2459 movdqa .rol16(%rip),%xmm8
2460 paddd %xmm7,%xmm3
2461 paddd %xmm6,%xmm2
2462 paddd %xmm5,%xmm1
2463 paddd %xmm4,%xmm0
2464 pxor %xmm3,%xmm15
2465 pxor %xmm2,%xmm14
2466 pxor %xmm1,%xmm13
2467 pxor %xmm0,%xmm12
2468.byte 102,69,15,56,0,248
2469.byte 102,69,15,56,0,240
2470.byte 102,69,15,56,0,232
2471.byte 102,69,15,56,0,224
2472 movdqa 80(%rbp),%xmm8
2473 paddd %xmm15,%xmm11
2474 paddd %xmm14,%xmm10
2475 paddd %xmm13,%xmm9
2476 paddd %xmm12,%xmm8
2477 pxor %xmm11,%xmm7
2478 addq 0(%rdi),%r10
2479 adcq 8+0(%rdi),%r11
2480 adcq $1,%r12
2481 pxor %xmm10,%xmm6
2482 pxor %xmm9,%xmm5
2483 pxor %xmm8,%xmm4
2484 movdqa %xmm8,80(%rbp)
2485 movdqa %xmm7,%xmm8
2486 psrld $20,%xmm8
2487 pslld $32-20,%xmm7
2488 pxor %xmm8,%xmm7
2489 movdqa %xmm6,%xmm8
2490 psrld $20,%xmm8
2491 pslld $32-20,%xmm6
2492 pxor %xmm8,%xmm6
2493 movdqa %xmm5,%xmm8
2494 psrld $20,%xmm8
2495 pslld $32-20,%xmm5
2496 pxor %xmm8,%xmm5
2497 movdqa %xmm4,%xmm8
2498 psrld $20,%xmm8
2499 pslld $32-20,%xmm4
2500 pxor %xmm8,%xmm4
2501 movq 0+0(%rbp),%rax
2502 movq %rax,%r15
2503 mulq %r10
2504 movq %rax,%r13
2505 movq %rdx,%r14
2506 movq 0+0(%rbp),%rax
2507 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002508 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002509 addq %rax,%r14
2510 adcq %rdx,%r15
2511 movdqa .rol8(%rip),%xmm8
2512 paddd %xmm7,%xmm3
2513 paddd %xmm6,%xmm2
2514 paddd %xmm5,%xmm1
2515 paddd %xmm4,%xmm0
2516 pxor %xmm3,%xmm15
2517 pxor %xmm2,%xmm14
2518 pxor %xmm1,%xmm13
2519 pxor %xmm0,%xmm12
2520.byte 102,69,15,56,0,248
2521.byte 102,69,15,56,0,240
2522.byte 102,69,15,56,0,232
2523.byte 102,69,15,56,0,224
2524 movdqa 80(%rbp),%xmm8
2525 paddd %xmm15,%xmm11
2526 paddd %xmm14,%xmm10
2527 paddd %xmm13,%xmm9
2528 paddd %xmm12,%xmm8
2529 pxor %xmm11,%xmm7
2530 pxor %xmm10,%xmm6
2531 movq 8+0(%rbp),%rax
2532 movq %rax,%r9
2533 mulq %r10
2534 addq %rax,%r14
2535 adcq $0,%rdx
2536 movq %rdx,%r10
2537 movq 8+0(%rbp),%rax
2538 mulq %r11
2539 addq %rax,%r15
2540 adcq $0,%rdx
2541 pxor %xmm9,%xmm5
2542 pxor %xmm8,%xmm4
2543 movdqa %xmm8,80(%rbp)
2544 movdqa %xmm7,%xmm8
2545 psrld $25,%xmm8
2546 pslld $32-25,%xmm7
2547 pxor %xmm8,%xmm7
2548 movdqa %xmm6,%xmm8
2549 psrld $25,%xmm8
2550 pslld $32-25,%xmm6
2551 pxor %xmm8,%xmm6
2552 movdqa %xmm5,%xmm8
2553 psrld $25,%xmm8
2554 pslld $32-25,%xmm5
2555 pxor %xmm8,%xmm5
2556 movdqa %xmm4,%xmm8
2557 psrld $25,%xmm8
2558 pslld $32-25,%xmm4
2559 pxor %xmm8,%xmm4
2560 movdqa 80(%rbp),%xmm8
Robert Sloan4d1ac502017-02-06 08:36:14 -08002561 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002562 addq %r10,%r15
2563 adcq %rdx,%r9
2564.byte 102,15,58,15,255,4
2565.byte 102,69,15,58,15,219,8
2566.byte 102,69,15,58,15,255,12
2567.byte 102,15,58,15,246,4
2568.byte 102,69,15,58,15,210,8
2569.byte 102,69,15,58,15,246,12
2570.byte 102,15,58,15,237,4
2571.byte 102,69,15,58,15,201,8
2572.byte 102,69,15,58,15,237,12
2573.byte 102,15,58,15,228,4
2574.byte 102,69,15,58,15,192,8
2575.byte 102,69,15,58,15,228,12
2576 movdqa %xmm8,80(%rbp)
2577 movdqa .rol16(%rip),%xmm8
2578 paddd %xmm7,%xmm3
2579 paddd %xmm6,%xmm2
2580 paddd %xmm5,%xmm1
2581 paddd %xmm4,%xmm0
2582 pxor %xmm3,%xmm15
2583 pxor %xmm2,%xmm14
2584 movq %r13,%r10
2585 movq %r14,%r11
2586 movq %r15,%r12
2587 andq $3,%r12
2588 movq %r15,%r13
2589 andq $-4,%r13
2590 movq %r9,%r14
2591 shrdq $2,%r9,%r15
2592 shrq $2,%r9
2593 addq %r13,%r10
2594 adcq %r14,%r11
2595 adcq $0,%r12
2596 addq %r15,%r10
2597 adcq %r9,%r11
2598 adcq $0,%r12
2599 pxor %xmm1,%xmm13
2600 pxor %xmm0,%xmm12
2601.byte 102,69,15,56,0,248
2602.byte 102,69,15,56,0,240
2603.byte 102,69,15,56,0,232
2604.byte 102,69,15,56,0,224
2605 movdqa 80(%rbp),%xmm8
2606 paddd %xmm15,%xmm11
2607 paddd %xmm14,%xmm10
2608 paddd %xmm13,%xmm9
2609 paddd %xmm12,%xmm8
2610 pxor %xmm11,%xmm7
2611 pxor %xmm10,%xmm6
2612 pxor %xmm9,%xmm5
2613 pxor %xmm8,%xmm4
2614 movdqa %xmm8,80(%rbp)
2615 movdqa %xmm7,%xmm8
2616 psrld $20,%xmm8
2617 pslld $32-20,%xmm7
2618 pxor %xmm8,%xmm7
2619 movdqa %xmm6,%xmm8
2620 psrld $20,%xmm8
2621 pslld $32-20,%xmm6
2622 pxor %xmm8,%xmm6
2623 movdqa %xmm5,%xmm8
2624 psrld $20,%xmm8
2625 pslld $32-20,%xmm5
2626 pxor %xmm8,%xmm5
2627 movdqa %xmm4,%xmm8
2628 psrld $20,%xmm8
2629 pslld $32-20,%xmm4
2630 pxor %xmm8,%xmm4
2631 movdqa .rol8(%rip),%xmm8
2632 paddd %xmm7,%xmm3
2633 paddd %xmm6,%xmm2
2634 paddd %xmm5,%xmm1
2635 paddd %xmm4,%xmm0
2636 pxor %xmm3,%xmm15
2637 pxor %xmm2,%xmm14
2638 pxor %xmm1,%xmm13
2639 pxor %xmm0,%xmm12
2640.byte 102,69,15,56,0,248
2641.byte 102,69,15,56,0,240
2642.byte 102,69,15,56,0,232
2643.byte 102,69,15,56,0,224
2644 movdqa 80(%rbp),%xmm8
2645 paddd %xmm15,%xmm11
2646 paddd %xmm14,%xmm10
2647 paddd %xmm13,%xmm9
2648 paddd %xmm12,%xmm8
2649 pxor %xmm11,%xmm7
2650 pxor %xmm10,%xmm6
2651 pxor %xmm9,%xmm5
2652 pxor %xmm8,%xmm4
2653 movdqa %xmm8,80(%rbp)
2654 movdqa %xmm7,%xmm8
2655 psrld $25,%xmm8
2656 pslld $32-25,%xmm7
2657 pxor %xmm8,%xmm7
2658 movdqa %xmm6,%xmm8
2659 psrld $25,%xmm8
2660 pslld $32-25,%xmm6
2661 pxor %xmm8,%xmm6
2662 movdqa %xmm5,%xmm8
2663 psrld $25,%xmm8
2664 pslld $32-25,%xmm5
2665 pxor %xmm8,%xmm5
2666 movdqa %xmm4,%xmm8
2667 psrld $25,%xmm8
2668 pslld $32-25,%xmm4
2669 pxor %xmm8,%xmm4
2670 movdqa 80(%rbp),%xmm8
2671.byte 102,15,58,15,255,12
2672.byte 102,69,15,58,15,219,8
2673.byte 102,69,15,58,15,255,4
2674.byte 102,15,58,15,246,12
2675.byte 102,69,15,58,15,210,8
2676.byte 102,69,15,58,15,246,4
2677.byte 102,15,58,15,237,12
2678.byte 102,69,15,58,15,201,8
2679.byte 102,69,15,58,15,237,4
2680.byte 102,15,58,15,228,12
2681.byte 102,69,15,58,15,192,8
2682.byte 102,69,15,58,15,228,4
2683
2684 leaq 16(%rdi),%rdi
2685 decq %r8
2686 jge 2b
2687 addq 0(%rdi),%r10
2688 adcq 8+0(%rdi),%r11
2689 adcq $1,%r12
2690 movq 0+0(%rbp),%rax
2691 movq %rax,%r15
2692 mulq %r10
2693 movq %rax,%r13
2694 movq %rdx,%r14
2695 movq 0+0(%rbp),%rax
2696 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002697 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002698 addq %rax,%r14
2699 adcq %rdx,%r15
2700 movq 8+0(%rbp),%rax
2701 movq %rax,%r9
2702 mulq %r10
2703 addq %rax,%r14
2704 adcq $0,%rdx
2705 movq %rdx,%r10
2706 movq 8+0(%rbp),%rax
2707 mulq %r11
2708 addq %rax,%r15
2709 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002710 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002711 addq %r10,%r15
2712 adcq %rdx,%r9
2713 movq %r13,%r10
2714 movq %r14,%r11
2715 movq %r15,%r12
2716 andq $3,%r12
2717 movq %r15,%r13
2718 andq $-4,%r13
2719 movq %r9,%r14
2720 shrdq $2,%r9,%r15
2721 shrq $2,%r9
2722 addq %r13,%r10
2723 adcq %r14,%r11
2724 adcq $0,%r12
2725 addq %r15,%r10
2726 adcq %r9,%r11
2727 adcq $0,%r12
2728
2729 leaq 16(%rdi),%rdi
2730 decq %rcx
2731 jg 2b
2732 paddd .chacha20_consts(%rip),%xmm3
2733 paddd 48(%rbp),%xmm7
2734 paddd 64(%rbp),%xmm11
2735 paddd 144(%rbp),%xmm15
2736 paddd .chacha20_consts(%rip),%xmm2
2737 paddd 48(%rbp),%xmm6
2738 paddd 64(%rbp),%xmm10
2739 paddd 128(%rbp),%xmm14
2740 paddd .chacha20_consts(%rip),%xmm1
2741 paddd 48(%rbp),%xmm5
2742 paddd 64(%rbp),%xmm9
2743 paddd 112(%rbp),%xmm13
2744 paddd .chacha20_consts(%rip),%xmm0
2745 paddd 48(%rbp),%xmm4
2746 paddd 64(%rbp),%xmm8
2747 paddd 96(%rbp),%xmm12
2748
2749 movdqa %xmm14,80(%rbp)
2750 movdqa %xmm14,80(%rbp)
2751 movdqu 0 + 0(%rsi),%xmm14
2752 pxor %xmm3,%xmm14
2753 movdqu %xmm14,0 + 0(%rdi)
2754 movdqu 16 + 0(%rsi),%xmm14
2755 pxor %xmm7,%xmm14
2756 movdqu %xmm14,16 + 0(%rdi)
2757 movdqu 32 + 0(%rsi),%xmm14
2758 pxor %xmm11,%xmm14
2759 movdqu %xmm14,32 + 0(%rdi)
2760 movdqu 48 + 0(%rsi),%xmm14
2761 pxor %xmm15,%xmm14
2762 movdqu %xmm14,48 + 0(%rdi)
2763
2764 movdqa 80(%rbp),%xmm14
2765 movdqu 0 + 64(%rsi),%xmm3
2766 movdqu 16 + 64(%rsi),%xmm7
2767 movdqu 32 + 64(%rsi),%xmm11
2768 movdqu 48 + 64(%rsi),%xmm15
2769 pxor %xmm3,%xmm2
2770 pxor %xmm7,%xmm6
2771 pxor %xmm11,%xmm10
2772 pxor %xmm14,%xmm15
2773 movdqu %xmm2,0 + 64(%rdi)
2774 movdqu %xmm6,16 + 64(%rdi)
2775 movdqu %xmm10,32 + 64(%rdi)
2776 movdqu %xmm15,48 + 64(%rdi)
2777 movdqu 0 + 128(%rsi),%xmm3
2778 movdqu 16 + 128(%rsi),%xmm7
2779 movdqu 32 + 128(%rsi),%xmm11
2780 movdqu 48 + 128(%rsi),%xmm15
2781 pxor %xmm3,%xmm1
2782 pxor %xmm7,%xmm5
2783 pxor %xmm11,%xmm9
2784 pxor %xmm13,%xmm15
2785 movdqu %xmm1,0 + 128(%rdi)
2786 movdqu %xmm5,16 + 128(%rdi)
2787 movdqu %xmm9,32 + 128(%rdi)
2788 movdqu %xmm15,48 + 128(%rdi)
2789
2790 cmpq $256,%rbx
2791 ja 3f
2792
2793 movq $192,%rcx
2794 subq $192,%rbx
2795 leaq 192(%rsi),%rsi
2796 jmp seal_sse_128_seal_hash
27973:
2798 movdqu 0 + 192(%rsi),%xmm3
2799 movdqu 16 + 192(%rsi),%xmm7
2800 movdqu 32 + 192(%rsi),%xmm11
2801 movdqu 48 + 192(%rsi),%xmm15
2802 pxor %xmm3,%xmm0
2803 pxor %xmm7,%xmm4
2804 pxor %xmm11,%xmm8
2805 pxor %xmm12,%xmm15
2806 movdqu %xmm0,0 + 192(%rdi)
2807 movdqu %xmm4,16 + 192(%rdi)
2808 movdqu %xmm8,32 + 192(%rdi)
2809 movdqu %xmm15,48 + 192(%rdi)
2810
2811 leaq 256(%rsi),%rsi
2812 subq $256,%rbx
2813 movq $6,%rcx
2814 movq $4,%r8
2815 cmpq $192,%rbx
2816 jg 1b
2817 movq %rbx,%rcx
2818 testq %rbx,%rbx
2819 je seal_sse_128_seal_hash
2820 movq $6,%rcx
2821 cmpq $64,%rbx
2822 jg 3f
2823
/*
 * seal_sse_tail_64: compute one more ChaCha20 block in xmm0/xmm4/xmm8/xmm12
 * while folding 16-byte blocks read at (%rdi) into the Poly1305 accumulator
 * held in r10 (low limb), r11 and r12 (high limb).
 *
 * State setup: xmm0 = .chacha20_consts, xmm4 = 48(%rbp), xmm8 = 64(%rbp),
 * xmm12 = 96(%rbp) + .sse_inc. The incremented row is written back to
 * 96(%rbp) so that it can be re-added after the rounds.
 *
 * rcx and r8 are loop counters that must be set before entry. A pass that
 * starts at label 1 hashes two blocks around one double round; once rcx is
 * exhausted, r8 drives further passes from label 2 (one double round and one
 * hashed block each).
 *
 * On exit the saved input words are added back to the state and control
 * jumps to seal_sse_128_seal with the keystream block in xmm0/4/8/12.
 */
2824seal_sse_tail_64:
2825 movdqa .chacha20_consts(%rip),%xmm0
2826 movdqa 48(%rbp),%xmm4
2827 movdqa 64(%rbp),%xmm8
2828 movdqa 96(%rbp),%xmm12
2829 paddd .sse_inc(%rip),%xmm12
2830 movdqa %xmm12,96(%rbp)
2831
28321:
/*
 * Poly1305 step: add the 16 bytes at (%rdi) plus 2^128 to r12:r11:r10,
 * multiply by the 128-bit value at 0(%rbp)/8(%rbp) (product limbs land in
 * r13, r14, r15, r9), then fold everything above bit 130 back in as
 * 4*hi + hi, i.e. a partial reduction modulo 2^130 - 5.
 */
2833 addq 0(%rdi),%r10
2834 adcq 8+0(%rdi),%r11
2835 adcq $1,%r12
2836 movq 0+0(%rbp),%rax
2837 movq %rax,%r15
2838 mulq %r10
2839 movq %rax,%r13
2840 movq %rdx,%r14
2841 movq 0+0(%rbp),%rax
2842 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002843 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002844 addq %rax,%r14
2845 adcq %rdx,%r15
2846 movq 8+0(%rbp),%rax
2847 movq %rax,%r9
2848 mulq %r10
2849 addq %rax,%r14
2850 adcq $0,%rdx
2851 movq %rdx,%r10
2852 movq 8+0(%rbp),%rax
2853 mulq %r11
2854 addq %rax,%r15
2855 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002856 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002857 addq %r10,%r15
2858 adcq %rdx,%r9
2859 movq %r13,%r10
2860 movq %r14,%r11
2861 movq %r15,%r12
2862 andq $3,%r12
2863 movq %r15,%r13
2864 andq $-4,%r13
2865 movq %r9,%r14
2866 shrdq $2,%r9,%r15
2867 shrq $2,%r9
2868 addq %r13,%r10
2869 adcq %r14,%r11
2870 adcq $0,%r12
2871 addq %r15,%r10
2872 adcq %r9,%r11
2873 adcq $0,%r12
2874
2875 leaq 16(%rdi),%rdi
28762:
/*
 * ChaCha20 quarter-round pattern on rows xmm0/xmm4/xmm8/xmm12 (xmm3 is
 * scratch): add, xor, rotate by 16 / 12 / 8 / 7. The 16- and 8-bit rotates
 * use pshufb with .rol16/.rol8; the others use a shift pair.
 */
2877 paddd %xmm4,%xmm0
2878 pxor %xmm0,%xmm12
2879 pshufb .rol16(%rip),%xmm12
2880 paddd %xmm12,%xmm8
2881 pxor %xmm8,%xmm4
2882 movdqa %xmm4,%xmm3
2883 pslld $12,%xmm3
2884 psrld $20,%xmm4
2885 pxor %xmm3,%xmm4
2886 paddd %xmm4,%xmm0
2887 pxor %xmm0,%xmm12
2888 pshufb .rol8(%rip),%xmm12
2889 paddd %xmm12,%xmm8
2890 pxor %xmm8,%xmm4
2891 movdqa %xmm4,%xmm3
2892 pslld $7,%xmm3
2893 psrld $25,%xmm4
2894 pxor %xmm3,%xmm4
/* Hand-encoded palignr $4,%xmm4,%xmm4 / $8,%xmm8,%xmm8 / $12,%xmm12,%xmm12: rotate rows so the next pass works on diagonals. */
2895.byte 102,15,58,15,228,4
2896.byte 102,69,15,58,15,192,8
2897.byte 102,69,15,58,15,228,12
2898 paddd %xmm4,%xmm0
2899 pxor %xmm0,%xmm12
2900 pshufb .rol16(%rip),%xmm12
2901 paddd %xmm12,%xmm8
2902 pxor %xmm8,%xmm4
2903 movdqa %xmm4,%xmm3
2904 pslld $12,%xmm3
2905 psrld $20,%xmm4
2906 pxor %xmm3,%xmm4
2907 paddd %xmm4,%xmm0
2908 pxor %xmm0,%xmm12
2909 pshufb .rol8(%rip),%xmm12
2910 paddd %xmm12,%xmm8
2911 pxor %xmm8,%xmm4
2912 movdqa %xmm4,%xmm3
2913 pslld $7,%xmm3
2914 psrld $25,%xmm4
2915 pxor %xmm3,%xmm4
/* Hand-encoded palignr $12 / $8 / $4 on xmm4 / xmm8 / xmm12: undo the row rotation. */
2916.byte 102,15,58,15,228,12
2917.byte 102,69,15,58,15,192,8
2918.byte 102,69,15,58,15,228,4
/* Poly1305 step on the next 16 bytes at (%rdi); same sequence as at label 1. */
2919 addq 0(%rdi),%r10
2920 adcq 8+0(%rdi),%r11
2921 adcq $1,%r12
2922 movq 0+0(%rbp),%rax
2923 movq %rax,%r15
2924 mulq %r10
2925 movq %rax,%r13
2926 movq %rdx,%r14
2927 movq 0+0(%rbp),%rax
2928 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002929 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002930 addq %rax,%r14
2931 adcq %rdx,%r15
2932 movq 8+0(%rbp),%rax
2933 movq %rax,%r9
2934 mulq %r10
2935 addq %rax,%r14
2936 adcq $0,%rdx
2937 movq %rdx,%r10
2938 movq 8+0(%rbp),%rax
2939 mulq %r11
2940 addq %rax,%r15
2941 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002942 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002943 addq %r10,%r15
2944 adcq %rdx,%r9
2945 movq %r13,%r10
2946 movq %r14,%r11
2947 movq %r15,%r12
2948 andq $3,%r12
2949 movq %r15,%r13
2950 andq $-4,%r13
2951 movq %r9,%r14
2952 shrdq $2,%r9,%r15
2953 shrq $2,%r9
2954 addq %r13,%r10
2955 adcq %r14,%r11
2956 adcq $0,%r12
2957 addq %r15,%r10
2958 adcq %r9,%r11
2959 adcq $0,%r12
2960
2961 leaq 16(%rdi),%rdi
/* Signed tests: rcx keeps being decremented below zero while r8 counts down the remaining passes. */
2962 decq %rcx
2963 jg 1b
2964 decq %r8
2965 jge 2b
/* Add the input words back to the state to form the keystream block. */
2966 paddd .chacha20_consts(%rip),%xmm0
2967 paddd 48(%rbp),%xmm4
2968 paddd 64(%rbp),%xmm8
2969 paddd 96(%rbp),%xmm12
2970
2971 jmp seal_sse_128_seal
/* Dispatch: with more than 128 bytes left in rbx go on to the next local label 3, otherwise fall into seal_sse_tail_128. */
29723:
2973 cmpq $128,%rbx
2974 jg 3f
2975
/*
 * seal_sse_tail_128: compute two ChaCha20 blocks (state 0 in
 * xmm0/4/8/12, state 1 in xmm1/5/9/13) while folding 16-byte blocks read at
 * (%rdi) into the Poly1305 accumulator r12:r11:r10.
 *
 * Both states start from .chacha20_consts, 48(%rbp) and 64(%rbp). The last
 * row is 96(%rbp) + .sse_inc for state 1 and one further .sse_inc for
 * state 0; the rows are saved to 112(%rbp) and 96(%rbp) respectively.
 *
 * rcx and r8 are loop counters that must be set before entry (rcx passes
 * start at label 1, r8 passes at label 2).
 *
 * After the rounds, 64 bytes at (%rsi) are XORed with state 1 and stored at
 * (%rdi). rcx is then set to 64 (bytes for seal_sse_128_seal_hash to
 * absorb), rbx is reduced by 64, rsi advances by 64, and state 0 stays in
 * xmm0/4/8/12.
 */
2976seal_sse_tail_128:
2977 movdqa .chacha20_consts(%rip),%xmm0
2978 movdqa 48(%rbp),%xmm4
2979 movdqa 64(%rbp),%xmm8
2980 movdqa %xmm0,%xmm1
2981 movdqa %xmm4,%xmm5
2982 movdqa %xmm8,%xmm9
2983 movdqa 96(%rbp),%xmm13
2984 paddd .sse_inc(%rip),%xmm13
2985 movdqa %xmm13,%xmm12
2986 paddd .sse_inc(%rip),%xmm12
2987 movdqa %xmm12,96(%rbp)
2988 movdqa %xmm13,112(%rbp)
2989
29901:
/*
 * Poly1305 step: add the 16 bytes at (%rdi) plus 2^128 to r12:r11:r10,
 * multiply by the 128-bit value at 0(%rbp)/8(%rbp) (product limbs in r13,
 * r14, r15, r9), then fold the bits above 2^130 back in as 4*hi + hi
 * (partial reduction modulo 2^130 - 5).
 */
2991 addq 0(%rdi),%r10
2992 adcq 8+0(%rdi),%r11
2993 adcq $1,%r12
2994 movq 0+0(%rbp),%rax
2995 movq %rax,%r15
2996 mulq %r10
2997 movq %rax,%r13
2998 movq %rdx,%r14
2999 movq 0+0(%rbp),%rax
3000 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003001 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003002 addq %rax,%r14
3003 adcq %rdx,%r15
3004 movq 8+0(%rbp),%rax
3005 movq %rax,%r9
3006 mulq %r10
3007 addq %rax,%r14
3008 adcq $0,%rdx
3009 movq %rdx,%r10
3010 movq 8+0(%rbp),%rax
3011 mulq %r11
3012 addq %rax,%r15
3013 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003014 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003015 addq %r10,%r15
3016 adcq %rdx,%r9
3017 movq %r13,%r10
3018 movq %r14,%r11
3019 movq %r15,%r12
3020 andq $3,%r12
3021 movq %r15,%r13
3022 andq $-4,%r13
3023 movq %r9,%r14
3024 shrdq $2,%r9,%r15
3025 shrq $2,%r9
3026 addq %r13,%r10
3027 adcq %r14,%r11
3028 adcq $0,%r12
3029 addq %r15,%r10
3030 adcq %r9,%r11
3031 adcq $0,%r12
3032
3033 leaq 16(%rdi),%rdi
30342:
/* First half of the double round on state 0 (xmm3 is scratch). */
3035 paddd %xmm4,%xmm0
3036 pxor %xmm0,%xmm12
3037 pshufb .rol16(%rip),%xmm12
3038 paddd %xmm12,%xmm8
3039 pxor %xmm8,%xmm4
3040 movdqa %xmm4,%xmm3
3041 pslld $12,%xmm3
3042 psrld $20,%xmm4
3043 pxor %xmm3,%xmm4
3044 paddd %xmm4,%xmm0
3045 pxor %xmm0,%xmm12
3046 pshufb .rol8(%rip),%xmm12
3047 paddd %xmm12,%xmm8
3048 pxor %xmm8,%xmm4
3049 movdqa %xmm4,%xmm3
3050 pslld $7,%xmm3
3051 psrld $25,%xmm4
3052 pxor %xmm3,%xmm4
/* Hand-encoded palignr $4,%xmm4,%xmm4 / $8,%xmm8,%xmm8 / $12,%xmm12,%xmm12. */
3053.byte 102,15,58,15,228,4
3054.byte 102,69,15,58,15,192,8
3055.byte 102,69,15,58,15,228,12
/* First half of the double round on state 1. */
3056 paddd %xmm5,%xmm1
3057 pxor %xmm1,%xmm13
3058 pshufb .rol16(%rip),%xmm13
3059 paddd %xmm13,%xmm9
3060 pxor %xmm9,%xmm5
3061 movdqa %xmm5,%xmm3
3062 pslld $12,%xmm3
3063 psrld $20,%xmm5
3064 pxor %xmm3,%xmm5
3065 paddd %xmm5,%xmm1
3066 pxor %xmm1,%xmm13
3067 pshufb .rol8(%rip),%xmm13
3068 paddd %xmm13,%xmm9
3069 pxor %xmm9,%xmm5
3070 movdqa %xmm5,%xmm3
3071 pslld $7,%xmm3
3072 psrld $25,%xmm5
3073 pxor %xmm3,%xmm5
/* Hand-encoded palignr $4,%xmm5,%xmm5 / $8,%xmm9,%xmm9 / $12,%xmm13,%xmm13. */
3074.byte 102,15,58,15,237,4
3075.byte 102,69,15,58,15,201,8
3076.byte 102,69,15,58,15,237,12
/* Poly1305 step on the 16 bytes at (%rdi); same sequence as at label 1. */
3077 addq 0(%rdi),%r10
3078 adcq 8+0(%rdi),%r11
3079 adcq $1,%r12
3080 movq 0+0(%rbp),%rax
3081 movq %rax,%r15
3082 mulq %r10
3083 movq %rax,%r13
3084 movq %rdx,%r14
3085 movq 0+0(%rbp),%rax
3086 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003087 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003088 addq %rax,%r14
3089 adcq %rdx,%r15
3090 movq 8+0(%rbp),%rax
3091 movq %rax,%r9
3092 mulq %r10
3093 addq %rax,%r14
3094 adcq $0,%rdx
3095 movq %rdx,%r10
3096 movq 8+0(%rbp),%rax
3097 mulq %r11
3098 addq %rax,%r15
3099 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003100 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003101 addq %r10,%r15
3102 adcq %rdx,%r9
3103 movq %r13,%r10
3104 movq %r14,%r11
3105 movq %r15,%r12
3106 andq $3,%r12
3107 movq %r15,%r13
3108 andq $-4,%r13
3109 movq %r9,%r14
3110 shrdq $2,%r9,%r15
3111 shrq $2,%r9
3112 addq %r13,%r10
3113 adcq %r14,%r11
3114 adcq $0,%r12
3115 addq %r15,%r10
3116 adcq %r9,%r11
3117 adcq $0,%r12
/* Second half of the double round on state 0 (rows are in rotated position). */
3118 paddd %xmm4,%xmm0
3119 pxor %xmm0,%xmm12
3120 pshufb .rol16(%rip),%xmm12
3121 paddd %xmm12,%xmm8
3122 pxor %xmm8,%xmm4
3123 movdqa %xmm4,%xmm3
3124 pslld $12,%xmm3
3125 psrld $20,%xmm4
3126 pxor %xmm3,%xmm4
3127 paddd %xmm4,%xmm0
3128 pxor %xmm0,%xmm12
3129 pshufb .rol8(%rip),%xmm12
3130 paddd %xmm12,%xmm8
3131 pxor %xmm8,%xmm4
3132 movdqa %xmm4,%xmm3
3133 pslld $7,%xmm3
3134 psrld $25,%xmm4
3135 pxor %xmm3,%xmm4
/* Hand-encoded palignr $12 / $8 / $4 on xmm4 / xmm8 / xmm12: undo the row rotation. */
3136.byte 102,15,58,15,228,12
3137.byte 102,69,15,58,15,192,8
3138.byte 102,69,15,58,15,228,4
/* Second half of the double round on state 1. */
3139 paddd %xmm5,%xmm1
3140 pxor %xmm1,%xmm13
3141 pshufb .rol16(%rip),%xmm13
3142 paddd %xmm13,%xmm9
3143 pxor %xmm9,%xmm5
3144 movdqa %xmm5,%xmm3
3145 pslld $12,%xmm3
3146 psrld $20,%xmm5
3147 pxor %xmm3,%xmm5
3148 paddd %xmm5,%xmm1
3149 pxor %xmm1,%xmm13
3150 pshufb .rol8(%rip),%xmm13
3151 paddd %xmm13,%xmm9
3152 pxor %xmm9,%xmm5
3153 movdqa %xmm5,%xmm3
3154 pslld $7,%xmm3
3155 psrld $25,%xmm5
3156 pxor %xmm3,%xmm5
/* Hand-encoded palignr $12 / $8 / $4 on xmm5 / xmm9 / xmm13. */
3157.byte 102,15,58,15,237,12
3158.byte 102,69,15,58,15,201,8
3159.byte 102,69,15,58,15,237,4
3160
3161 leaq 16(%rdi),%rdi
3162 decq %rcx
3163 jg 1b
3164 decq %r8
3165 jge 2b
/* Add the input words back to both states. */
3166 paddd .chacha20_consts(%rip),%xmm1
3167 paddd 48(%rbp),%xmm5
3168 paddd 64(%rbp),%xmm9
3169 paddd 112(%rbp),%xmm13
3170 paddd .chacha20_consts(%rip),%xmm0
3171 paddd 48(%rbp),%xmm4
3172 paddd 64(%rbp),%xmm8
3173 paddd 96(%rbp),%xmm12
/* XOR 64 input bytes at (%rsi) with state 1 and store them at (%rdi); xmm3/7/11/15 are scratch. */
3174 movdqu 0 + 0(%rsi),%xmm3
3175 movdqu 16 + 0(%rsi),%xmm7
3176 movdqu 32 + 0(%rsi),%xmm11
3177 movdqu 48 + 0(%rsi),%xmm15
3178 pxor %xmm3,%xmm1
3179 pxor %xmm7,%xmm5
3180 pxor %xmm11,%xmm9
3181 pxor %xmm13,%xmm15
3182 movdqu %xmm1,0 + 0(%rdi)
3183 movdqu %xmm5,16 + 0(%rdi)
3184 movdqu %xmm9,32 + 0(%rdi)
3185 movdqu %xmm15,48 + 0(%rdi)
3186
/* rcx = number of just-written bytes that seal_sse_128_seal_hash still has to absorb. */
3187 movq $64,%rcx
3188 subq $64,%rbx
3189 leaq 64(%rsi),%rsi
3190 jmp seal_sse_128_seal_hash
/* Local label 3: forward-branch target ("3f") leading into seal_sse_tail_192. */
31913:
3192
/*
 * seal_sse_tail_192: compute three ChaCha20 blocks (state 0 in
 * xmm0/4/8/12, state 1 in xmm1/5/9/13, state 2 in xmm2/6/10/14) while
 * folding 16-byte blocks read at (%rdi) into the Poly1305 accumulator
 * r12:r11:r10.
 *
 * All states start from .chacha20_consts, 48(%rbp) and 64(%rbp). The last
 * row is 96(%rbp) plus one, two and three .sse_inc steps for states 2, 1
 * and 0; the rows are saved to 128(%rbp), 112(%rbp) and 96(%rbp).
 *
 * rcx and r8 are loop counters that must be set before entry (rcx passes
 * start at label 1, r8 passes at label 2).
 *
 * After the rounds, 128 bytes at (%rsi) are XORed with states 2 and 1 and
 * stored at (%rdi). rcx is set to 128, rbx is reduced by 128, rsi advances
 * by 128, and execution falls through into seal_sse_128_seal_hash with
 * state 0 still in xmm0/4/8/12.
 */
3193seal_sse_tail_192:
3194 movdqa .chacha20_consts(%rip),%xmm0
3195 movdqa 48(%rbp),%xmm4
3196 movdqa 64(%rbp),%xmm8
3197 movdqa %xmm0,%xmm1
3198 movdqa %xmm4,%xmm5
3199 movdqa %xmm8,%xmm9
3200 movdqa %xmm0,%xmm2
3201 movdqa %xmm4,%xmm6
3202 movdqa %xmm8,%xmm10
3203 movdqa 96(%rbp),%xmm14
3204 paddd .sse_inc(%rip),%xmm14
3205 movdqa %xmm14,%xmm13
3206 paddd .sse_inc(%rip),%xmm13
3207 movdqa %xmm13,%xmm12
3208 paddd .sse_inc(%rip),%xmm12
3209 movdqa %xmm12,96(%rbp)
3210 movdqa %xmm13,112(%rbp)
3211 movdqa %xmm14,128(%rbp)
3212
32131:
/*
 * Poly1305 step: add the 16 bytes at (%rdi) plus 2^128 to r12:r11:r10,
 * multiply by the 128-bit value at 0(%rbp)/8(%rbp) (product limbs in r13,
 * r14, r15, r9), then fold the bits above 2^130 back in as 4*hi + hi
 * (partial reduction modulo 2^130 - 5).
 */
3214 addq 0(%rdi),%r10
3215 adcq 8+0(%rdi),%r11
3216 adcq $1,%r12
3217 movq 0+0(%rbp),%rax
3218 movq %rax,%r15
3219 mulq %r10
3220 movq %rax,%r13
3221 movq %rdx,%r14
3222 movq 0+0(%rbp),%rax
3223 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003224 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003225 addq %rax,%r14
3226 adcq %rdx,%r15
3227 movq 8+0(%rbp),%rax
3228 movq %rax,%r9
3229 mulq %r10
3230 addq %rax,%r14
3231 adcq $0,%rdx
3232 movq %rdx,%r10
3233 movq 8+0(%rbp),%rax
3234 mulq %r11
3235 addq %rax,%r15
3236 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003237 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003238 addq %r10,%r15
3239 adcq %rdx,%r9
3240 movq %r13,%r10
3241 movq %r14,%r11
3242 movq %r15,%r12
3243 andq $3,%r12
3244 movq %r15,%r13
3245 andq $-4,%r13
3246 movq %r9,%r14
3247 shrdq $2,%r9,%r15
3248 shrq $2,%r9
3249 addq %r13,%r10
3250 adcq %r14,%r11
3251 adcq $0,%r12
3252 addq %r15,%r10
3253 adcq %r9,%r11
3254 adcq $0,%r12
3255
3256 leaq 16(%rdi),%rdi
32572:
/* First half of the double round on state 0 (xmm3 is scratch). */
3258 paddd %xmm4,%xmm0
3259 pxor %xmm0,%xmm12
3260 pshufb .rol16(%rip),%xmm12
3261 paddd %xmm12,%xmm8
3262 pxor %xmm8,%xmm4
3263 movdqa %xmm4,%xmm3
3264 pslld $12,%xmm3
3265 psrld $20,%xmm4
3266 pxor %xmm3,%xmm4
3267 paddd %xmm4,%xmm0
3268 pxor %xmm0,%xmm12
3269 pshufb .rol8(%rip),%xmm12
3270 paddd %xmm12,%xmm8
3271 pxor %xmm8,%xmm4
3272 movdqa %xmm4,%xmm3
3273 pslld $7,%xmm3
3274 psrld $25,%xmm4
3275 pxor %xmm3,%xmm4
/* Hand-encoded palignr $4,%xmm4,%xmm4 / $8,%xmm8,%xmm8 / $12,%xmm12,%xmm12. */
3276.byte 102,15,58,15,228,4
3277.byte 102,69,15,58,15,192,8
3278.byte 102,69,15,58,15,228,12
/* First half of the double round on state 1. */
3279 paddd %xmm5,%xmm1
3280 pxor %xmm1,%xmm13
3281 pshufb .rol16(%rip),%xmm13
3282 paddd %xmm13,%xmm9
3283 pxor %xmm9,%xmm5
3284 movdqa %xmm5,%xmm3
3285 pslld $12,%xmm3
3286 psrld $20,%xmm5
3287 pxor %xmm3,%xmm5
3288 paddd %xmm5,%xmm1
3289 pxor %xmm1,%xmm13
3290 pshufb .rol8(%rip),%xmm13
3291 paddd %xmm13,%xmm9
3292 pxor %xmm9,%xmm5
3293 movdqa %xmm5,%xmm3
3294 pslld $7,%xmm3
3295 psrld $25,%xmm5
3296 pxor %xmm3,%xmm5
/* Hand-encoded palignr $4,%xmm5,%xmm5 / $8,%xmm9,%xmm9 / $12,%xmm13,%xmm13. */
3297.byte 102,15,58,15,237,4
3298.byte 102,69,15,58,15,201,8
3299.byte 102,69,15,58,15,237,12
/* First half of the double round on state 2. */
3300 paddd %xmm6,%xmm2
3301 pxor %xmm2,%xmm14
3302 pshufb .rol16(%rip),%xmm14
3303 paddd %xmm14,%xmm10
3304 pxor %xmm10,%xmm6
3305 movdqa %xmm6,%xmm3
3306 pslld $12,%xmm3
3307 psrld $20,%xmm6
3308 pxor %xmm3,%xmm6
3309 paddd %xmm6,%xmm2
3310 pxor %xmm2,%xmm14
3311 pshufb .rol8(%rip),%xmm14
3312 paddd %xmm14,%xmm10
3313 pxor %xmm10,%xmm6
3314 movdqa %xmm6,%xmm3
3315 pslld $7,%xmm3
3316 psrld $25,%xmm6
3317 pxor %xmm3,%xmm6
/* Hand-encoded palignr $4,%xmm6,%xmm6 / $8,%xmm10,%xmm10 / $12,%xmm14,%xmm14. */
3318.byte 102,15,58,15,246,4
3319.byte 102,69,15,58,15,210,8
3320.byte 102,69,15,58,15,246,12
/* Poly1305 step on the 16 bytes at (%rdi); same sequence as at label 1. */
3321 addq 0(%rdi),%r10
3322 adcq 8+0(%rdi),%r11
3323 adcq $1,%r12
3324 movq 0+0(%rbp),%rax
3325 movq %rax,%r15
3326 mulq %r10
3327 movq %rax,%r13
3328 movq %rdx,%r14
3329 movq 0+0(%rbp),%rax
3330 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003331 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003332 addq %rax,%r14
3333 adcq %rdx,%r15
3334 movq 8+0(%rbp),%rax
3335 movq %rax,%r9
3336 mulq %r10
3337 addq %rax,%r14
3338 adcq $0,%rdx
3339 movq %rdx,%r10
3340 movq 8+0(%rbp),%rax
3341 mulq %r11
3342 addq %rax,%r15
3343 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003344 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003345 addq %r10,%r15
3346 adcq %rdx,%r9
3347 movq %r13,%r10
3348 movq %r14,%r11
3349 movq %r15,%r12
3350 andq $3,%r12
3351 movq %r15,%r13
3352 andq $-4,%r13
3353 movq %r9,%r14
3354 shrdq $2,%r9,%r15
3355 shrq $2,%r9
3356 addq %r13,%r10
3357 adcq %r14,%r11
3358 adcq $0,%r12
3359 addq %r15,%r10
3360 adcq %r9,%r11
3361 adcq $0,%r12
/* Second half of the double round on state 0 (rows are in rotated position). */
3362 paddd %xmm4,%xmm0
3363 pxor %xmm0,%xmm12
3364 pshufb .rol16(%rip),%xmm12
3365 paddd %xmm12,%xmm8
3366 pxor %xmm8,%xmm4
3367 movdqa %xmm4,%xmm3
3368 pslld $12,%xmm3
3369 psrld $20,%xmm4
3370 pxor %xmm3,%xmm4
3371 paddd %xmm4,%xmm0
3372 pxor %xmm0,%xmm12
3373 pshufb .rol8(%rip),%xmm12
3374 paddd %xmm12,%xmm8
3375 pxor %xmm8,%xmm4
3376 movdqa %xmm4,%xmm3
3377 pslld $7,%xmm3
3378 psrld $25,%xmm4
3379 pxor %xmm3,%xmm4
/* Hand-encoded palignr $12 / $8 / $4 on xmm4 / xmm8 / xmm12: undo the row rotation. */
3380.byte 102,15,58,15,228,12
3381.byte 102,69,15,58,15,192,8
3382.byte 102,69,15,58,15,228,4
/* Second half of the double round on state 1. */
3383 paddd %xmm5,%xmm1
3384 pxor %xmm1,%xmm13
3385 pshufb .rol16(%rip),%xmm13
3386 paddd %xmm13,%xmm9
3387 pxor %xmm9,%xmm5
3388 movdqa %xmm5,%xmm3
3389 pslld $12,%xmm3
3390 psrld $20,%xmm5
3391 pxor %xmm3,%xmm5
3392 paddd %xmm5,%xmm1
3393 pxor %xmm1,%xmm13
3394 pshufb .rol8(%rip),%xmm13
3395 paddd %xmm13,%xmm9
3396 pxor %xmm9,%xmm5
3397 movdqa %xmm5,%xmm3
3398 pslld $7,%xmm3
3399 psrld $25,%xmm5
3400 pxor %xmm3,%xmm5
/* Hand-encoded palignr $12 / $8 / $4 on xmm5 / xmm9 / xmm13. */
3401.byte 102,15,58,15,237,12
3402.byte 102,69,15,58,15,201,8
3403.byte 102,69,15,58,15,237,4
/* Second half of the double round on state 2. */
3404 paddd %xmm6,%xmm2
3405 pxor %xmm2,%xmm14
3406 pshufb .rol16(%rip),%xmm14
3407 paddd %xmm14,%xmm10
3408 pxor %xmm10,%xmm6
3409 movdqa %xmm6,%xmm3
3410 pslld $12,%xmm3
3411 psrld $20,%xmm6
3412 pxor %xmm3,%xmm6
3413 paddd %xmm6,%xmm2
3414 pxor %xmm2,%xmm14
3415 pshufb .rol8(%rip),%xmm14
3416 paddd %xmm14,%xmm10
3417 pxor %xmm10,%xmm6
3418 movdqa %xmm6,%xmm3
3419 pslld $7,%xmm3
3420 psrld $25,%xmm6
3421 pxor %xmm3,%xmm6
/* Hand-encoded palignr $12 / $8 / $4 on xmm6 / xmm10 / xmm14. */
3422.byte 102,15,58,15,246,12
3423.byte 102,69,15,58,15,210,8
3424.byte 102,69,15,58,15,246,4
3425
3426 leaq 16(%rdi),%rdi
3427 decq %rcx
3428 jg 1b
3429 decq %r8
3430 jge 2b
/* Add the input words back to all three states. */
3431 paddd .chacha20_consts(%rip),%xmm2
3432 paddd 48(%rbp),%xmm6
3433 paddd 64(%rbp),%xmm10
3434 paddd 128(%rbp),%xmm14
3435 paddd .chacha20_consts(%rip),%xmm1
3436 paddd 48(%rbp),%xmm5
3437 paddd 64(%rbp),%xmm9
3438 paddd 112(%rbp),%xmm13
3439 paddd .chacha20_consts(%rip),%xmm0
3440 paddd 48(%rbp),%xmm4
3441 paddd 64(%rbp),%xmm8
3442 paddd 96(%rbp),%xmm12
/* XOR input bytes 0..63 at (%rsi) with state 2 and store at (%rdi); xmm3/7/11/15 are scratch. */
3443 movdqu 0 + 0(%rsi),%xmm3
3444 movdqu 16 + 0(%rsi),%xmm7
3445 movdqu 32 + 0(%rsi),%xmm11
3446 movdqu 48 + 0(%rsi),%xmm15
3447 pxor %xmm3,%xmm2
3448 pxor %xmm7,%xmm6
3449 pxor %xmm11,%xmm10
3450 pxor %xmm14,%xmm15
3451 movdqu %xmm2,0 + 0(%rdi)
3452 movdqu %xmm6,16 + 0(%rdi)
3453 movdqu %xmm10,32 + 0(%rdi)
3454 movdqu %xmm15,48 + 0(%rdi)
/* XOR input bytes 64..127 with state 1 and store at 64(%rdi). */
3455 movdqu 0 + 64(%rsi),%xmm3
3456 movdqu 16 + 64(%rsi),%xmm7
3457 movdqu 32 + 64(%rsi),%xmm11
3458 movdqu 48 + 64(%rsi),%xmm15
3459 pxor %xmm3,%xmm1
3460 pxor %xmm7,%xmm5
3461 pxor %xmm11,%xmm9
3462 pxor %xmm13,%xmm15
3463 movdqu %xmm1,0 + 64(%rdi)
3464 movdqu %xmm5,16 + 64(%rdi)
3465 movdqu %xmm9,32 + 64(%rdi)
3466 movdqu %xmm15,48 + 64(%rdi)
3467
/* rcx = number of just-written bytes still to be absorbed by the code that follows. */
3468 movq $128,%rcx
3469 subq $128,%rbx
3470 leaq 128(%rsi),%rsi
3471
/*
 * seal_sse_128_seal_hash: absorb rcx bytes starting at (%rdi) into the
 * Poly1305 accumulator r12:r11:r10, 16 bytes per iteration, advancing rdi.
 * When fewer than 16 bytes remain in rcx (unsigned compare), control moves
 * on to seal_sse_128_seal.
 * Scratch: rax, rdx, r9, r13, r14, r15.
 */
3472seal_sse_128_seal_hash:
3473 cmpq $16,%rcx
3474 jb seal_sse_128_seal
/* acc += 16 bytes at (%rdi) plus 2^128. */
3475 addq 0(%rdi),%r10
3476 adcq 8+0(%rdi),%r11
3477 adcq $1,%r12
/* acc *= 128-bit value at 0(%rbp)/8(%rbp); product limbs end up in r13, r14, r15, r9. */
3478 movq 0+0(%rbp),%rax
3479 movq %rax,%r15
3480 mulq %r10
3481 movq %rax,%r13
3482 movq %rdx,%r14
3483 movq 0+0(%rbp),%rax
3484 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003485 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003486 addq %rax,%r14
3487 adcq %rdx,%r15
3488 movq 8+0(%rbp),%rax
3489 movq %rax,%r9
3490 mulq %r10
3491 addq %rax,%r14
3492 adcq $0,%rdx
3493 movq %rdx,%r10
3494 movq 8+0(%rbp),%rax
3495 mulq %r11
3496 addq %rax,%r15
3497 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003498 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003499 addq %r10,%r15
3500 adcq %rdx,%r9
/*
 * Partial reduction modulo 2^130 - 5: keep bits 0..129 in r12:r11:r10 and
 * add back the part above bit 130 once shifted left by 2 (the andq $-4
 * copy) and once unshifted (the shrdq/shrq copy), i.e. 5 * hi.
 */
3501 movq %r13,%r10
3502 movq %r14,%r11
3503 movq %r15,%r12
3504 andq $3,%r12
3505 movq %r15,%r13
3506 andq $-4,%r13
3507 movq %r9,%r14
3508 shrdq $2,%r9,%r15
3509 shrq $2,%r9
3510 addq %r13,%r10
3511 adcq %r14,%r11
3512 adcq $0,%r12
3513 addq %r15,%r10
3514 adcq %r9,%r11
3515 adcq $0,%r12
3516
3517 subq $16,%rcx
3518 leaq 16(%rdi),%rdi
3519 jmp seal_sse_128_seal_hash
3520
/*
 * seal_sse_128_seal: process the remaining input 16 bytes at a time.
 * While rbx >= 16 (unsigned): XOR 16 bytes at (%rsi) with the keystream in
 * xmm0, store the result at (%rdi), absorb those 16 output bytes into the
 * Poly1305 accumulator r12:r11:r10, advance rsi/rdi, and shift the queue of
 * keystream registers (xmm4, xmm8, xmm12, xmm1, xmm5, xmm9, xmm13) down by
 * one so the next 16 keystream bytes are in xmm0.
 * With fewer than 16 bytes left, control goes to seal_sse_tail_16.
 * Scratch: xmm3, rax, rdx, r9, r13, r14, r15.
 */
3521seal_sse_128_seal:
3522 cmpq $16,%rbx
3523 jb seal_sse_tail_16
3524 subq $16,%rbx
3525
3526 movdqu 0(%rsi),%xmm3
3527 pxor %xmm3,%xmm0
3528 movdqu %xmm0,0(%rdi)
3529
/* acc += the 16 bytes just written plus 2^128; the pointers advance before the multiply. */
3530 addq 0(%rdi),%r10
3531 adcq 8(%rdi),%r11
3532 adcq $1,%r12
3533 leaq 16(%rsi),%rsi
3534 leaq 16(%rdi),%rdi
/* acc *= 128-bit value at 0(%rbp)/8(%rbp); product limbs end up in r13, r14, r15, r9. */
3535 movq 0+0(%rbp),%rax
3536 movq %rax,%r15
3537 mulq %r10
3538 movq %rax,%r13
3539 movq %rdx,%r14
3540 movq 0+0(%rbp),%rax
3541 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003542 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003543 addq %rax,%r14
3544 adcq %rdx,%r15
3545 movq 8+0(%rbp),%rax
3546 movq %rax,%r9
3547 mulq %r10
3548 addq %rax,%r14
3549 adcq $0,%rdx
3550 movq %rdx,%r10
3551 movq 8+0(%rbp),%rax
3552 mulq %r11
3553 addq %rax,%r15
3554 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003555 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003556 addq %r10,%r15
3557 adcq %rdx,%r9
/* Partial reduction modulo 2^130 - 5: low 130 bits plus 5 * (bits above 130). */
3558 movq %r13,%r10
3559 movq %r14,%r11
3560 movq %r15,%r12
3561 andq $3,%r12
3562 movq %r15,%r13
3563 andq $-4,%r13
3564 movq %r9,%r14
3565 shrdq $2,%r9,%r15
3566 shrq $2,%r9
3567 addq %r13,%r10
3568 adcq %r14,%r11
3569 adcq $0,%r12
3570 addq %r15,%r10
3571 adcq %r9,%r11
3572 adcq $0,%r12
3573
3574
/* Shift the keystream queue: xmm0 <- xmm4 <- xmm8 <- xmm12 <- xmm1 <- xmm5 <- xmm9 <- xmm13. */
3575 movdqa %xmm4,%xmm0
3576 movdqa %xmm8,%xmm4
3577 movdqa %xmm12,%xmm8
3578 movdqa %xmm1,%xmm12
3579 movdqa %xmm5,%xmm1
3580 movdqa %xmm9,%xmm5
3581 movdqa %xmm13,%xmm9
3582 jmp seal_sse_128_seal
3583
/*
 * seal_sse_tail_16: handle the final partial block of rbx bytes (reached
 * from seal_sse_128_seal when rbx < 16).
 * If rbx is zero, go straight to seal_sse_finalize. Otherwise:
 *   1. gather the rbx input bytes at (%rsi) into the low bytes of xmm15,
 *      reading from the last byte backwards;
 *   2. XOR with the keystream in xmm0;
 *   3. write the low rbx bytes to (%rdi) one byte at a time, advancing rdi;
 *   4. clear the bytes beyond rbx with the .and_masks entry for this
 *      length and absorb the zero-padded block (plus 2^128) into the
 *      Poly1305 accumulator r12:r11:r10.
 * Execution then continues at the next label in the file.
 * Scratch: rcx, r8, r13, r14, xmm0, xmm15, rax, rdx, r9, r15; rsi is moved.
 */
3584seal_sse_tail_16:
3585 testq %rbx,%rbx
3586 jz seal_sse_finalize
3587
/* r8 = rbx * 16: byte offset just past the (rbx-1)-th 16-byte mask entry. */
3588 movq %rbx,%r8
3589 shlq $4,%r8
3590 leaq .and_masks(%rip),%r13
3591 movq %rbx,%rcx
/* Point rsi at the last input byte and build xmm15 by shifting in one byte per iteration. */
3592 leaq -1(%rsi,%rbx), %rsi
3593 pxor %xmm15,%xmm15
35941:
3595 pslldq $1,%xmm15
3596 pinsrb $0,(%rsi),%xmm15
3597 leaq -1(%rsi),%rsi
3598 decq %rcx
3599 jne 1b
3600
3601
/* Apply the keystream. Bytes above rbx now hold raw keystream and are masked off further down. */
3602 pxor %xmm0,%xmm15
3603
3604
3605 movq %rbx,%rcx
3606 movdqu %xmm15,%xmm0
36072:
/* Emit the low byte of xmm0, then shift the next byte into place. */
3608 pextrb $0,%xmm0,(%rdi)
3609 psrldq $1,%xmm0
3610 addq $1,%rdi
3611 subq $1,%rcx
3612 jnz 2b
3613
/* Keep only the low rbx bytes of xmm15 (mask entry rbx-1 of .and_masks). */
3614 pand -16(%r13,%r8), %xmm15
/* Hand-encoded movq %xmm15,%r13 (low 64 bits); pextrq below fetches the high 64 bits. */
3615.byte 102,77,15,126,253
3616 pextrq $1,%xmm15,%r14
/* acc += padded block plus 2^128. */
3617 addq %r13,%r10
3618 adcq %r14,%r11
3619 adcq $1,%r12
/* acc *= 128-bit value at 0(%rbp)/8(%rbp); product limbs end up in r13, r14, r15, r9. */
3620 movq 0+0(%rbp),%rax
3621 movq %rax,%r15
3622 mulq %r10
3623 movq %rax,%r13
3624 movq %rdx,%r14
3625 movq 0+0(%rbp),%rax
3626 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003627 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003628 addq %rax,%r14
3629 adcq %rdx,%r15
3630 movq 8+0(%rbp),%rax
3631 movq %rax,%r9
3632 mulq %r10
3633 addq %rax,%r14
3634 adcq $0,%rdx
3635 movq %rdx,%r10
3636 movq 8+0(%rbp),%rax
3637 mulq %r11
3638 addq %rax,%r15
3639 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003640 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003641 addq %r10,%r15
3642 adcq %rdx,%r9
/* Partial reduction modulo 2^130 - 5: low 130 bits plus 5 * (bits above 130). */
3643 movq %r13,%r10
3644 movq %r14,%r11
3645 movq %r15,%r12
3646 andq $3,%r12
3647 movq %r15,%r13
3648 andq $-4,%r13
3649 movq %r9,%r14
3650 shrdq $2,%r9,%r15
3651 shrq $2,%r9
3652 addq %r13,%r10
3653 adcq %r14,%r11
3654 adcq $0,%r12
3655 addq %r15,%r10
3656 adcq %r9,%r11
3657 adcq $0,%r12
3658
/*
 * seal_sse_finalize: finish the Poly1305 computation and return.
 *   1. Absorb the 16-byte block stored at 32(%rbp) (plus 2^128) into the
 *      accumulator r12:r11:r10 and multiply/reduce as for every other block.
 *      NOTE(review): 32(%rbp) presumably holds the length block written by
 *      the prologue - confirm against the function entry.
 *   2. Fully reduce: subtract 2^130 - 5 and keep the difference unless the
 *      subtraction borrows.
 *   3. Add the 128-bit value at 16(%rbp) (carry out of bit 127 is dropped).
 *   4. Release the frame, pop the saved output pointer into r9, store the
 *      16-byte result there, restore r15, r14, r13, r12, rbx, rbp and return.
 */
3659seal_sse_finalize:
3660 addq 32(%rbp),%r10
3661 adcq 8+32(%rbp),%r11
3662 adcq $1,%r12
/* acc *= 128-bit value at 0(%rbp)/8(%rbp); product limbs end up in r13, r14, r15, r9. */
3663 movq 0+0(%rbp),%rax
3664 movq %rax,%r15
3665 mulq %r10
3666 movq %rax,%r13
3667 movq %rdx,%r14
3668 movq 0+0(%rbp),%rax
3669 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003670 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003671 addq %rax,%r14
3672 adcq %rdx,%r15
3673 movq 8+0(%rbp),%rax
3674 movq %rax,%r9
3675 mulq %r10
3676 addq %rax,%r14
3677 adcq $0,%rdx
3678 movq %rdx,%r10
3679 movq 8+0(%rbp),%rax
3680 mulq %r11
3681 addq %rax,%r15
3682 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003683 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003684 addq %r10,%r15
3685 adcq %rdx,%r9
/* Partial reduction modulo 2^130 - 5: low 130 bits plus 5 * (bits above 130). */
3686 movq %r13,%r10
3687 movq %r14,%r11
3688 movq %r15,%r12
3689 andq $3,%r12
3690 movq %r15,%r13
3691 andq $-4,%r13
3692 movq %r9,%r14
3693 shrdq $2,%r9,%r15
3694 shrq $2,%r9
3695 addq %r13,%r10
3696 adcq %r14,%r11
3697 adcq $0,%r12
3698 addq %r15,%r10
3699 adcq %r9,%r11
3700 adcq $0,%r12
3701
3702
/*
 * Final reduction: the limbs (-5, -1, 3) spell 2^130 - 5. Subtract it and,
 * if that borrows (CF set), restore the saved copy with cmovc so no branch
 * depends on the accumulator value.
 */
3703 movq %r10,%r13
3704 movq %r11,%r14
3705 movq %r12,%r15
3706 subq $-5,%r10
3707 sbbq $-1,%r11
3708 sbbq $3,%r12
3709 cmovcq %r13,%r10
3710 cmovcq %r14,%r11
3711 cmovcq %r15,%r12
3712
/* Add the second 128-bit key half stored at 16(%rbp); only the low 128 bits are kept. */
3713 addq 0+16(%rbp),%r10
3714 adcq 8+16(%rbp),%r11
3715
/* Epilogue: drop the 288+32 byte frame, then pop the saved output pointer and write the result through it. */
3716 addq $288 + 32,%rsp
3717.cfi_adjust_cfa_offset -(288 + 32)
3718 popq %r9
3719.cfi_adjust_cfa_offset -8
3720 movq %r10,0(%r9)
3721 movq %r11,8(%r9)
3722
3723 popq %r15
3724.cfi_adjust_cfa_offset -8
3725 popq %r14
3726.cfi_adjust_cfa_offset -8
3727 popq %r13
3728.cfi_adjust_cfa_offset -8
3729 popq %r12
3730.cfi_adjust_cfa_offset -8
3731 popq %rbx
3732.cfi_adjust_cfa_offset -8
3733 popq %rbp
3734.cfi_adjust_cfa_offset -8
/* 0xf3,0xc3 is "rep ret". */
3735 .byte 0xf3,0xc3
/*
 * The code that follows the return runs on the full frame again and later
 * re-enters this epilogue, so the CFA offset must be raised by exactly what
 * the epilogue removed: the 288+32 byte frame plus seven 8-byte pops
 * (r9, r15, r14, r13, r12, rbx, rbp).
 */
3736.cfi_adjust_cfa_offset (8 * 7) + 288 + 32
3737
3737
/*
 * seal_sse_128: short-input path. Runs three ChaCha20 blocks side by side
 * and derives the Poly1305 key from one of them.
 *
 * Inputs: r9 points to 48 bytes of state (rows at 0, 16 and 32); rbp is the
 * frame base used for the Poly1305 key slots.
 *
 * State layout: state 0 = xmm0/4/8/12, state 1 = xmm1/5/9/13,
 * state 2 = xmm2/6/10/14. State 2 uses the last row as loaded from 32(%r9);
 * state 0 uses it plus one .sse_inc, state 1 plus two.
 * xmm7, xmm11 and xmm15 keep copies of the input rows for the final add;
 * xmm3 is scratch; r10 counts 10 double rounds.
 *
 * After the rounds only the words needed later are completed: xmm10 and
 * xmm14 are not added back. xmm2 masked with .clamp is stored at 0(%rbp)
 * and xmm6 at 16(%rbp). poly_hash_ad_internal (defined elsewhere) is then
 * called and control continues at seal_sse_128_seal with states 0 and 1 as
 * keystream.
 */
3738seal_sse_128:
3739 movdqu .chacha20_consts(%rip),%xmm0
3740 movdqa %xmm0,%xmm1
3741 movdqa %xmm0,%xmm2
3742 movdqu 0(%r9),%xmm4
3743 movdqa %xmm4,%xmm5
3744 movdqa %xmm4,%xmm6
3745 movdqu 16(%r9),%xmm8
3746 movdqa %xmm8,%xmm9
3747 movdqa %xmm8,%xmm10
3748 movdqu 32(%r9),%xmm14
3749 movdqa %xmm14,%xmm12
3750 paddd .sse_inc(%rip),%xmm12
3751 movdqa %xmm12,%xmm13
3752 paddd .sse_inc(%rip),%xmm13
/* Keep the input rows for the feed-forward add after the rounds. */
3753 movdqa %xmm4,%xmm7
3754 movdqa %xmm8,%xmm11
3755 movdqa %xmm12,%xmm15
3756 movq $10,%r10
37571:
/* First half of the double round on state 0. */
3758 paddd %xmm4,%xmm0
3759 pxor %xmm0,%xmm12
3760 pshufb .rol16(%rip),%xmm12
3761 paddd %xmm12,%xmm8
3762 pxor %xmm8,%xmm4
3763 movdqa %xmm4,%xmm3
3764 pslld $12,%xmm3
3765 psrld $20,%xmm4
3766 pxor %xmm3,%xmm4
3767 paddd %xmm4,%xmm0
3768 pxor %xmm0,%xmm12
3769 pshufb .rol8(%rip),%xmm12
3770 paddd %xmm12,%xmm8
3771 pxor %xmm8,%xmm4
3772 movdqa %xmm4,%xmm3
3773 pslld $7,%xmm3
3774 psrld $25,%xmm4
3775 pxor %xmm3,%xmm4
/* Hand-encoded palignr $4,%xmm4,%xmm4 / $8,%xmm8,%xmm8 / $12,%xmm12,%xmm12. */
3776.byte 102,15,58,15,228,4
3777.byte 102,69,15,58,15,192,8
3778.byte 102,69,15,58,15,228,12
/* First half of the double round on state 1. */
3779 paddd %xmm5,%xmm1
3780 pxor %xmm1,%xmm13
3781 pshufb .rol16(%rip),%xmm13
3782 paddd %xmm13,%xmm9
3783 pxor %xmm9,%xmm5
3784 movdqa %xmm5,%xmm3
3785 pslld $12,%xmm3
3786 psrld $20,%xmm5
3787 pxor %xmm3,%xmm5
3788 paddd %xmm5,%xmm1
3789 pxor %xmm1,%xmm13
3790 pshufb .rol8(%rip),%xmm13
3791 paddd %xmm13,%xmm9
3792 pxor %xmm9,%xmm5
3793 movdqa %xmm5,%xmm3
3794 pslld $7,%xmm3
3795 psrld $25,%xmm5
3796 pxor %xmm3,%xmm5
/* Hand-encoded palignr $4,%xmm5,%xmm5 / $8,%xmm9,%xmm9 / $12,%xmm13,%xmm13. */
3797.byte 102,15,58,15,237,4
3798.byte 102,69,15,58,15,201,8
3799.byte 102,69,15,58,15,237,12
/* First half of the double round on state 2. */
3800 paddd %xmm6,%xmm2
3801 pxor %xmm2,%xmm14
3802 pshufb .rol16(%rip),%xmm14
3803 paddd %xmm14,%xmm10
3804 pxor %xmm10,%xmm6
3805 movdqa %xmm6,%xmm3
3806 pslld $12,%xmm3
3807 psrld $20,%xmm6
3808 pxor %xmm3,%xmm6
3809 paddd %xmm6,%xmm2
3810 pxor %xmm2,%xmm14
3811 pshufb .rol8(%rip),%xmm14
3812 paddd %xmm14,%xmm10
3813 pxor %xmm10,%xmm6
3814 movdqa %xmm6,%xmm3
3815 pslld $7,%xmm3
3816 psrld $25,%xmm6
3817 pxor %xmm3,%xmm6
/* Hand-encoded palignr $4,%xmm6,%xmm6 / $8,%xmm10,%xmm10 / $12,%xmm14,%xmm14. */
3818.byte 102,15,58,15,246,4
3819.byte 102,69,15,58,15,210,8
3820.byte 102,69,15,58,15,246,12
/* Second half of the double round on state 0 (rows are in rotated position). */
3821 paddd %xmm4,%xmm0
3822 pxor %xmm0,%xmm12
3823 pshufb .rol16(%rip),%xmm12
3824 paddd %xmm12,%xmm8
3825 pxor %xmm8,%xmm4
3826 movdqa %xmm4,%xmm3
3827 pslld $12,%xmm3
3828 psrld $20,%xmm4
3829 pxor %xmm3,%xmm4
3830 paddd %xmm4,%xmm0
3831 pxor %xmm0,%xmm12
3832 pshufb .rol8(%rip),%xmm12
3833 paddd %xmm12,%xmm8
3834 pxor %xmm8,%xmm4
3835 movdqa %xmm4,%xmm3
3836 pslld $7,%xmm3
3837 psrld $25,%xmm4
3838 pxor %xmm3,%xmm4
/* Hand-encoded palignr $12 / $8 / $4 on xmm4 / xmm8 / xmm12: undo the row rotation. */
3839.byte 102,15,58,15,228,12
3840.byte 102,69,15,58,15,192,8
3841.byte 102,69,15,58,15,228,4
/* Second half of the double round on state 1. */
3842 paddd %xmm5,%xmm1
3843 pxor %xmm1,%xmm13
3844 pshufb .rol16(%rip),%xmm13
3845 paddd %xmm13,%xmm9
3846 pxor %xmm9,%xmm5
3847 movdqa %xmm5,%xmm3
3848 pslld $12,%xmm3
3849 psrld $20,%xmm5
3850 pxor %xmm3,%xmm5
3851 paddd %xmm5,%xmm1
3852 pxor %xmm1,%xmm13
3853 pshufb .rol8(%rip),%xmm13
3854 paddd %xmm13,%xmm9
3855 pxor %xmm9,%xmm5
3856 movdqa %xmm5,%xmm3
3857 pslld $7,%xmm3
3858 psrld $25,%xmm5
3859 pxor %xmm3,%xmm5
/* Hand-encoded palignr $12 / $8 / $4 on xmm5 / xmm9 / xmm13. */
3860.byte 102,15,58,15,237,12
3861.byte 102,69,15,58,15,201,8
3862.byte 102,69,15,58,15,237,4
/* Second half of the double round on state 2. */
3863 paddd %xmm6,%xmm2
3864 pxor %xmm2,%xmm14
3865 pshufb .rol16(%rip),%xmm14
3866 paddd %xmm14,%xmm10
3867 pxor %xmm10,%xmm6
3868 movdqa %xmm6,%xmm3
3869 pslld $12,%xmm3
3870 psrld $20,%xmm6
3871 pxor %xmm3,%xmm6
3872 paddd %xmm6,%xmm2
3873 pxor %xmm2,%xmm14
3874 pshufb .rol8(%rip),%xmm14
3875 paddd %xmm14,%xmm10
3876 pxor %xmm10,%xmm6
3877 movdqa %xmm6,%xmm3
3878 pslld $7,%xmm3
3879 psrld $25,%xmm6
3880 pxor %xmm3,%xmm6
/* Hand-encoded palignr $12 / $8 / $4 on xmm6 / xmm10 / xmm14. */
3881.byte 102,15,58,15,246,12
3882.byte 102,69,15,58,15,210,8
3883.byte 102,69,15,58,15,246,4
3884
3885 decq %r10
3886 jnz 1b
/* Feed-forward: add the input rows back. xmm10 and xmm14 are intentionally left out (not used below). */
3887 paddd .chacha20_consts(%rip),%xmm0
3888 paddd .chacha20_consts(%rip),%xmm1
3889 paddd .chacha20_consts(%rip),%xmm2
3890 paddd %xmm7,%xmm4
3891 paddd %xmm7,%xmm5
3892 paddd %xmm7,%xmm6
3893 paddd %xmm11,%xmm8
3894 paddd %xmm11,%xmm9
/* xmm15 holds state 0's last row; bump it once more to obtain state 1's. */
3895 paddd %xmm15,%xmm12
3896 paddd .sse_inc(%rip),%xmm15
3897 paddd %xmm15,%xmm13
3898
/* Poly1305 key from state 2: clamped first row to 0(%rbp), second row to 16(%rbp). */
3899 pand .clamp(%rip),%xmm2
3900 movdqa %xmm2,0(%rbp)
3901 movdqa %xmm6,16(%rbp)
3902
/* The move below copies r8 onto itself and has no architectural effect. */
3903 movq %r8,%r8
3904 call poly_hash_ad_internal
3905 jmp seal_sse_128_seal
3906
3907
3908
3909
3910.p2align 6
3911chacha20_poly1305_open_avx2:
3912 vzeroupper
3913 vmovdqa .chacha20_consts(%rip),%ymm0
3914 vbroadcasti128 0(%r9),%ymm4
3915 vbroadcasti128 16(%r9),%ymm8
3916 vbroadcasti128 32(%r9),%ymm12
3917 vpaddd .avx2_init(%rip),%ymm12,%ymm12
3918 cmpq $192,%rbx
3919 jbe open_avx2_192
3920 cmpq $320,%rbx
3921 jbe open_avx2_320
3922
3923 vmovdqa %ymm4,64(%rbp)
3924 vmovdqa %ymm8,96(%rbp)
3925 vmovdqa %ymm12,160(%rbp)
3926 movq $10,%r10
39271:
3928 vpaddd %ymm4,%ymm0,%ymm0
3929 vpxor %ymm0,%ymm12,%ymm12
3930 vpshufb .rol16(%rip),%ymm12,%ymm12
3931 vpaddd %ymm12,%ymm8,%ymm8
3932 vpxor %ymm8,%ymm4,%ymm4
3933 vpsrld $20,%ymm4,%ymm3
3934 vpslld $12,%ymm4,%ymm4
3935 vpxor %ymm3,%ymm4,%ymm4
3936 vpaddd %ymm4,%ymm0,%ymm0
3937 vpxor %ymm0,%ymm12,%ymm12
3938 vpshufb .rol8(%rip),%ymm12,%ymm12
3939 vpaddd %ymm12,%ymm8,%ymm8
3940 vpxor %ymm8,%ymm4,%ymm4
3941 vpslld $7,%ymm4,%ymm3
3942 vpsrld $25,%ymm4,%ymm4
3943 vpxor %ymm3,%ymm4,%ymm4
3944 vpalignr $12,%ymm12,%ymm12,%ymm12
3945 vpalignr $8,%ymm8,%ymm8,%ymm8
3946 vpalignr $4,%ymm4,%ymm4,%ymm4
3947 vpaddd %ymm4,%ymm0,%ymm0
3948 vpxor %ymm0,%ymm12,%ymm12
3949 vpshufb .rol16(%rip),%ymm12,%ymm12
3950 vpaddd %ymm12,%ymm8,%ymm8
3951 vpxor %ymm8,%ymm4,%ymm4
3952 vpsrld $20,%ymm4,%ymm3
3953 vpslld $12,%ymm4,%ymm4
3954 vpxor %ymm3,%ymm4,%ymm4
3955 vpaddd %ymm4,%ymm0,%ymm0
3956 vpxor %ymm0,%ymm12,%ymm12
3957 vpshufb .rol8(%rip),%ymm12,%ymm12
3958 vpaddd %ymm12,%ymm8,%ymm8
3959 vpxor %ymm8,%ymm4,%ymm4
3960 vpslld $7,%ymm4,%ymm3
3961 vpsrld $25,%ymm4,%ymm4
3962 vpxor %ymm3,%ymm4,%ymm4
3963 vpalignr $4,%ymm12,%ymm12,%ymm12
3964 vpalignr $8,%ymm8,%ymm8,%ymm8
3965 vpalignr $12,%ymm4,%ymm4,%ymm4
3966
3967 decq %r10
3968 jne 1b
3969 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
3970 vpaddd 64(%rbp),%ymm4,%ymm4
3971 vpaddd 96(%rbp),%ymm8,%ymm8
3972 vpaddd 160(%rbp),%ymm12,%ymm12
3973
3974 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
3975
3976 vpand .clamp(%rip),%ymm3,%ymm3
3977 vmovdqa %ymm3,0(%rbp)
3978
3979 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
3980 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
3981
3982 movq %r8,%r8
3983 call poly_hash_ad_internal
3984 xorq %rcx,%rcx
3985
39861:
3987 addq 0(%rsi,%rcx), %r10
3988 adcq 8+0(%rsi,%rcx), %r11
3989 adcq $1,%r12
3990 movq 0+0(%rbp),%rax
3991 movq %rax,%r15
3992 mulq %r10
3993 movq %rax,%r13
3994 movq %rdx,%r14
3995 movq 0+0(%rbp),%rax
3996 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003997 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003998 addq %rax,%r14
3999 adcq %rdx,%r15
4000 movq 8+0(%rbp),%rax
4001 movq %rax,%r9
4002 mulq %r10
4003 addq %rax,%r14
4004 adcq $0,%rdx
4005 movq %rdx,%r10
4006 movq 8+0(%rbp),%rax
4007 mulq %r11
4008 addq %rax,%r15
4009 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004010 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004011 addq %r10,%r15
4012 adcq %rdx,%r9
4013 movq %r13,%r10
4014 movq %r14,%r11
4015 movq %r15,%r12
4016 andq $3,%r12
4017 movq %r15,%r13
4018 andq $-4,%r13
4019 movq %r9,%r14
4020 shrdq $2,%r9,%r15
4021 shrq $2,%r9
4022 addq %r13,%r10
4023 adcq %r14,%r11
4024 adcq $0,%r12
4025 addq %r15,%r10
4026 adcq %r9,%r11
4027 adcq $0,%r12
4028
4029 addq $16,%rcx
4030 cmpq $64,%rcx
4031 jne 1b
4032
4033 vpxor 0(%rsi),%ymm0,%ymm0
4034 vpxor 32(%rsi),%ymm4,%ymm4
4035 vmovdqu %ymm0,0(%rdi)
4036 vmovdqu %ymm4,32(%rdi)
4037 leaq 64(%rsi),%rsi
4038 leaq 64(%rdi),%rdi
4039 subq $64,%rbx
40401:
4041
4042 cmpq $512,%rbx
4043 jb 3f
4044 vmovdqa .chacha20_consts(%rip),%ymm0
4045 vmovdqa 64(%rbp),%ymm4
4046 vmovdqa 96(%rbp),%ymm8
4047 vmovdqa %ymm0,%ymm1
4048 vmovdqa %ymm4,%ymm5
4049 vmovdqa %ymm8,%ymm9
4050 vmovdqa %ymm0,%ymm2
4051 vmovdqa %ymm4,%ymm6
4052 vmovdqa %ymm8,%ymm10
4053 vmovdqa %ymm0,%ymm3
4054 vmovdqa %ymm4,%ymm7
4055 vmovdqa %ymm8,%ymm11
4056 vmovdqa .avx2_inc(%rip),%ymm12
4057 vpaddd 160(%rbp),%ymm12,%ymm15
4058 vpaddd %ymm15,%ymm12,%ymm14
4059 vpaddd %ymm14,%ymm12,%ymm13
4060 vpaddd %ymm13,%ymm12,%ymm12
4061 vmovdqa %ymm15,256(%rbp)
4062 vmovdqa %ymm14,224(%rbp)
4063 vmovdqa %ymm13,192(%rbp)
4064 vmovdqa %ymm12,160(%rbp)
4065
4066 xorq %rcx,%rcx
40672:
4068 addq 0*8(%rsi,%rcx), %r10
4069 adcq 8+0*8(%rsi,%rcx), %r11
4070 adcq $1,%r12
4071 vmovdqa %ymm8,128(%rbp)
4072 vmovdqa .rol16(%rip),%ymm8
4073 vpaddd %ymm7,%ymm3,%ymm3
4074 vpaddd %ymm6,%ymm2,%ymm2
4075 vpaddd %ymm5,%ymm1,%ymm1
4076 vpaddd %ymm4,%ymm0,%ymm0
4077 vpxor %ymm3,%ymm15,%ymm15
4078 vpxor %ymm2,%ymm14,%ymm14
4079 vpxor %ymm1,%ymm13,%ymm13
4080 vpxor %ymm0,%ymm12,%ymm12
4081 movq 0+0(%rbp),%rdx
4082 movq %rdx,%r15
4083 mulxq %r10,%r13,%r14
4084 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004085 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004086 addq %rax,%r14
4087 adcq %rdx,%r15
4088 vpshufb %ymm8,%ymm15,%ymm15
4089 vpshufb %ymm8,%ymm14,%ymm14
4090 vpshufb %ymm8,%ymm13,%ymm13
4091 vpshufb %ymm8,%ymm12,%ymm12
4092 vmovdqa 128(%rbp),%ymm8
4093 vpaddd %ymm15,%ymm11,%ymm11
4094 vpaddd %ymm14,%ymm10,%ymm10
4095 vpaddd %ymm13,%ymm9,%ymm9
4096 vpaddd %ymm12,%ymm8,%ymm8
4097 movq 8+0(%rbp),%rdx
4098 mulxq %r10,%r10,%rax
4099 addq %r10,%r14
4100 mulxq %r11,%r11,%r9
4101 adcq %r11,%r15
4102 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004103 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004104 vpxor %ymm11,%ymm7,%ymm7
4105 vpxor %ymm10,%ymm6,%ymm6
4106 vpxor %ymm9,%ymm5,%ymm5
4107 vpxor %ymm8,%ymm4,%ymm4
4108 vmovdqa %ymm8,128(%rbp)
4109 vpsrld $20,%ymm7,%ymm8
4110 vpslld $32-20,%ymm7,%ymm7
4111 vpxor %ymm8,%ymm7,%ymm7
4112 vpsrld $20,%ymm6,%ymm8
4113 vpslld $32-20,%ymm6,%ymm6
4114 vpxor %ymm8,%ymm6,%ymm6
4115 vpsrld $20,%ymm5,%ymm8
4116 addq %rax,%r15
4117 adcq %rdx,%r9
4118 vpslld $32-20,%ymm5,%ymm5
4119 vpxor %ymm8,%ymm5,%ymm5
4120 vpsrld $20,%ymm4,%ymm8
4121 vpslld $32-20,%ymm4,%ymm4
4122 vpxor %ymm8,%ymm4,%ymm4
4123 vmovdqa .rol8(%rip),%ymm8
4124 vpaddd %ymm7,%ymm3,%ymm3
4125 vpaddd %ymm6,%ymm2,%ymm2
4126 vpaddd %ymm5,%ymm1,%ymm1
4127 vpaddd %ymm4,%ymm0,%ymm0
4128 movq %r13,%r10
4129 movq %r14,%r11
4130 movq %r15,%r12
4131 andq $3,%r12
4132 movq %r15,%r13
4133 andq $-4,%r13
4134 movq %r9,%r14
4135 shrdq $2,%r9,%r15
4136 shrq $2,%r9
4137 addq %r13,%r10
4138 adcq %r14,%r11
4139 adcq $0,%r12
4140 addq %r15,%r10
4141 adcq %r9,%r11
4142 adcq $0,%r12
4143 vpxor %ymm3,%ymm15,%ymm15
4144 vpxor %ymm2,%ymm14,%ymm14
4145 vpxor %ymm1,%ymm13,%ymm13
4146 vpxor %ymm0,%ymm12,%ymm12
4147 vpshufb %ymm8,%ymm15,%ymm15
4148 vpshufb %ymm8,%ymm14,%ymm14
4149 vpshufb %ymm8,%ymm13,%ymm13
4150 vpshufb %ymm8,%ymm12,%ymm12
4151 vmovdqa 128(%rbp),%ymm8
4152 addq 2*8(%rsi,%rcx), %r10
4153 adcq 8+2*8(%rsi,%rcx), %r11
4154 adcq $1,%r12
4155 vpaddd %ymm15,%ymm11,%ymm11
4156 vpaddd %ymm14,%ymm10,%ymm10
4157 vpaddd %ymm13,%ymm9,%ymm9
4158 vpaddd %ymm12,%ymm8,%ymm8
4159 vpxor %ymm11,%ymm7,%ymm7
4160 vpxor %ymm10,%ymm6,%ymm6
4161 vpxor %ymm9,%ymm5,%ymm5
4162 vpxor %ymm8,%ymm4,%ymm4
4163 movq 0+0(%rbp),%rdx
4164 movq %rdx,%r15
4165 mulxq %r10,%r13,%r14
4166 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004167 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004168 addq %rax,%r14
4169 adcq %rdx,%r15
4170 vmovdqa %ymm8,128(%rbp)
4171 vpsrld $25,%ymm7,%ymm8
4172 vpslld $32-25,%ymm7,%ymm7
4173 vpxor %ymm8,%ymm7,%ymm7
4174 vpsrld $25,%ymm6,%ymm8
4175 vpslld $32-25,%ymm6,%ymm6
4176 vpxor %ymm8,%ymm6,%ymm6
4177 vpsrld $25,%ymm5,%ymm8
4178 vpslld $32-25,%ymm5,%ymm5
4179 vpxor %ymm8,%ymm5,%ymm5
4180 vpsrld $25,%ymm4,%ymm8
4181 vpslld $32-25,%ymm4,%ymm4
4182 vpxor %ymm8,%ymm4,%ymm4
4183 vmovdqa 128(%rbp),%ymm8
4184 vpalignr $4,%ymm7,%ymm7,%ymm7
4185 vpalignr $8,%ymm11,%ymm11,%ymm11
4186 vpalignr $12,%ymm15,%ymm15,%ymm15
4187 vpalignr $4,%ymm6,%ymm6,%ymm6
4188 movq 8+0(%rbp),%rdx
4189 mulxq %r10,%r10,%rax
4190 addq %r10,%r14
4191 mulxq %r11,%r11,%r9
4192 adcq %r11,%r15
4193 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004194 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004195 vpalignr $8,%ymm10,%ymm10,%ymm10
4196 vpalignr $12,%ymm14,%ymm14,%ymm14
4197 vpalignr $4,%ymm5,%ymm5,%ymm5
4198 vpalignr $8,%ymm9,%ymm9,%ymm9
4199 vpalignr $12,%ymm13,%ymm13,%ymm13
4200 vpalignr $4,%ymm4,%ymm4,%ymm4
4201 vpalignr $8,%ymm8,%ymm8,%ymm8
4202 vpalignr $12,%ymm12,%ymm12,%ymm12
4203 vmovdqa %ymm8,128(%rbp)
4204 vmovdqa .rol16(%rip),%ymm8
4205 vpaddd %ymm7,%ymm3,%ymm3
4206 vpaddd %ymm6,%ymm2,%ymm2
4207 vpaddd %ymm5,%ymm1,%ymm1
4208 vpaddd %ymm4,%ymm0,%ymm0
4209 vpxor %ymm3,%ymm15,%ymm15
4210 vpxor %ymm2,%ymm14,%ymm14
4211 vpxor %ymm1,%ymm13,%ymm13
4212 vpxor %ymm0,%ymm12,%ymm12
4213 addq %rax,%r15
4214 adcq %rdx,%r9
4215 vpshufb %ymm8,%ymm15,%ymm15
4216 vpshufb %ymm8,%ymm14,%ymm14
4217 vpshufb %ymm8,%ymm13,%ymm13
4218 vpshufb %ymm8,%ymm12,%ymm12
4219 vmovdqa 128(%rbp),%ymm8
4220 vpaddd %ymm15,%ymm11,%ymm11
4221 vpaddd %ymm14,%ymm10,%ymm10
4222 vpaddd %ymm13,%ymm9,%ymm9
4223 vpaddd %ymm12,%ymm8,%ymm8
4224 movq %r13,%r10
4225 movq %r14,%r11
4226 movq %r15,%r12
4227 andq $3,%r12
4228 movq %r15,%r13
4229 andq $-4,%r13
4230 movq %r9,%r14
4231 shrdq $2,%r9,%r15
4232 shrq $2,%r9
4233 addq %r13,%r10
4234 adcq %r14,%r11
4235 adcq $0,%r12
4236 addq %r15,%r10
4237 adcq %r9,%r11
4238 adcq $0,%r12
4239 vpxor %ymm11,%ymm7,%ymm7
4240 vpxor %ymm10,%ymm6,%ymm6
4241 vpxor %ymm9,%ymm5,%ymm5
4242 vpxor %ymm8,%ymm4,%ymm4
4243 vmovdqa %ymm8,128(%rbp)
4244 vpsrld $20,%ymm7,%ymm8
4245 vpslld $32-20,%ymm7,%ymm7
4246 vpxor %ymm8,%ymm7,%ymm7
4247 addq 4*8(%rsi,%rcx), %r10
4248 adcq 8+4*8(%rsi,%rcx), %r11
4249 adcq $1,%r12
4250
4251 leaq 48(%rcx),%rcx
4252 vpsrld $20,%ymm6,%ymm8
4253 vpslld $32-20,%ymm6,%ymm6
4254 vpxor %ymm8,%ymm6,%ymm6
4255 vpsrld $20,%ymm5,%ymm8
4256 vpslld $32-20,%ymm5,%ymm5
4257 vpxor %ymm8,%ymm5,%ymm5
4258 vpsrld $20,%ymm4,%ymm8
4259 vpslld $32-20,%ymm4,%ymm4
4260 vpxor %ymm8,%ymm4,%ymm4
4261 vmovdqa .rol8(%rip),%ymm8
4262 vpaddd %ymm7,%ymm3,%ymm3
4263 vpaddd %ymm6,%ymm2,%ymm2
4264 vpaddd %ymm5,%ymm1,%ymm1
4265 vpaddd %ymm4,%ymm0,%ymm0
4266 vpxor %ymm3,%ymm15,%ymm15
4267 vpxor %ymm2,%ymm14,%ymm14
4268 vpxor %ymm1,%ymm13,%ymm13
4269 vpxor %ymm0,%ymm12,%ymm12
4270 movq 0+0(%rbp),%rdx
4271 movq %rdx,%r15
4272 mulxq %r10,%r13,%r14
4273 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004274 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004275 addq %rax,%r14
4276 adcq %rdx,%r15
4277 vpshufb %ymm8,%ymm15,%ymm15
4278 vpshufb %ymm8,%ymm14,%ymm14
4279 vpshufb %ymm8,%ymm13,%ymm13
4280 vpshufb %ymm8,%ymm12,%ymm12
4281 vmovdqa 128(%rbp),%ymm8
4282 vpaddd %ymm15,%ymm11,%ymm11
4283 vpaddd %ymm14,%ymm10,%ymm10
4284 vpaddd %ymm13,%ymm9,%ymm9
4285 movq 8+0(%rbp),%rdx
4286 mulxq %r10,%r10,%rax
4287 addq %r10,%r14
4288 mulxq %r11,%r11,%r9
4289 adcq %r11,%r15
4290 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004291 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004292 vpaddd %ymm12,%ymm8,%ymm8
4293 vpxor %ymm11,%ymm7,%ymm7
4294 vpxor %ymm10,%ymm6,%ymm6
4295 vpxor %ymm9,%ymm5,%ymm5
4296 vpxor %ymm8,%ymm4,%ymm4
4297 vmovdqa %ymm8,128(%rbp)
4298 vpsrld $25,%ymm7,%ymm8
4299 vpslld $32-25,%ymm7,%ymm7
4300 addq %rax,%r15
4301 adcq %rdx,%r9
4302 vpxor %ymm8,%ymm7,%ymm7
4303 vpsrld $25,%ymm6,%ymm8
4304 vpslld $32-25,%ymm6,%ymm6
4305 vpxor %ymm8,%ymm6,%ymm6
4306 vpsrld $25,%ymm5,%ymm8
4307 vpslld $32-25,%ymm5,%ymm5
4308 vpxor %ymm8,%ymm5,%ymm5
4309 vpsrld $25,%ymm4,%ymm8
4310 vpslld $32-25,%ymm4,%ymm4
4311 vpxor %ymm8,%ymm4,%ymm4
4312 vmovdqa 128(%rbp),%ymm8
4313 vpalignr $12,%ymm7,%ymm7,%ymm7
4314 vpalignr $8,%ymm11,%ymm11,%ymm11
4315 vpalignr $4,%ymm15,%ymm15,%ymm15
4316 vpalignr $12,%ymm6,%ymm6,%ymm6
4317 vpalignr $8,%ymm10,%ymm10,%ymm10
4318 vpalignr $4,%ymm14,%ymm14,%ymm14
4319 vpalignr $12,%ymm5,%ymm5,%ymm5
4320 movq %r13,%r10
4321 movq %r14,%r11
4322 movq %r15,%r12
4323 andq $3,%r12
4324 movq %r15,%r13
4325 andq $-4,%r13
4326 movq %r9,%r14
4327 shrdq $2,%r9,%r15
4328 shrq $2,%r9
4329 addq %r13,%r10
4330 adcq %r14,%r11
4331 adcq $0,%r12
4332 addq %r15,%r10
4333 adcq %r9,%r11
4334 adcq $0,%r12
4335 vpalignr $8,%ymm9,%ymm9,%ymm9
4336 vpalignr $4,%ymm13,%ymm13,%ymm13
4337 vpalignr $12,%ymm4,%ymm4,%ymm4
4338 vpalignr $8,%ymm8,%ymm8,%ymm8
4339 vpalignr $4,%ymm12,%ymm12,%ymm12
4340
4341 cmpq $60*8,%rcx
4342 jne 2b
4343 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
4344 vpaddd 64(%rbp),%ymm7,%ymm7
4345 vpaddd 96(%rbp),%ymm11,%ymm11
4346 vpaddd 256(%rbp),%ymm15,%ymm15
4347 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
4348 vpaddd 64(%rbp),%ymm6,%ymm6
4349 vpaddd 96(%rbp),%ymm10,%ymm10
4350 vpaddd 224(%rbp),%ymm14,%ymm14
4351 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
4352 vpaddd 64(%rbp),%ymm5,%ymm5
4353 vpaddd 96(%rbp),%ymm9,%ymm9
4354 vpaddd 192(%rbp),%ymm13,%ymm13
4355 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4356 vpaddd 64(%rbp),%ymm4,%ymm4
4357 vpaddd 96(%rbp),%ymm8,%ymm8
4358 vpaddd 160(%rbp),%ymm12,%ymm12
4359
4360 vmovdqa %ymm0,128(%rbp)
4361 addq 60*8(%rsi),%r10
4362 adcq 8+60*8(%rsi),%r11
4363 adcq $1,%r12
4364 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
4365 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
4366 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
4367 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
4368 vpxor 0+0(%rsi),%ymm0,%ymm0
4369 vpxor 32+0(%rsi),%ymm3,%ymm3
4370 vpxor 64+0(%rsi),%ymm7,%ymm7
4371 vpxor 96+0(%rsi),%ymm11,%ymm11
4372 vmovdqu %ymm0,0+0(%rdi)
4373 vmovdqu %ymm3,32+0(%rdi)
4374 vmovdqu %ymm7,64+0(%rdi)
4375 vmovdqu %ymm11,96+0(%rdi)
4376
4377 vmovdqa 128(%rbp),%ymm0
4378 movq 0+0(%rbp),%rax
4379 movq %rax,%r15
4380 mulq %r10
4381 movq %rax,%r13
4382 movq %rdx,%r14
4383 movq 0+0(%rbp),%rax
4384 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004385 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004386 addq %rax,%r14
4387 adcq %rdx,%r15
4388 movq 8+0(%rbp),%rax
4389 movq %rax,%r9
4390 mulq %r10
4391 addq %rax,%r14
4392 adcq $0,%rdx
4393 movq %rdx,%r10
4394 movq 8+0(%rbp),%rax
4395 mulq %r11
4396 addq %rax,%r15
4397 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004398 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004399 addq %r10,%r15
4400 adcq %rdx,%r9
4401 movq %r13,%r10
4402 movq %r14,%r11
4403 movq %r15,%r12
4404 andq $3,%r12
4405 movq %r15,%r13
4406 andq $-4,%r13
4407 movq %r9,%r14
4408 shrdq $2,%r9,%r15
4409 shrq $2,%r9
4410 addq %r13,%r10
4411 adcq %r14,%r11
4412 adcq $0,%r12
4413 addq %r15,%r10
4414 adcq %r9,%r11
4415 adcq $0,%r12
4416 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
4417 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
4418 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
4419 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
4420 vpxor 0+128(%rsi),%ymm3,%ymm3
4421 vpxor 32+128(%rsi),%ymm2,%ymm2
4422 vpxor 64+128(%rsi),%ymm6,%ymm6
4423 vpxor 96+128(%rsi),%ymm10,%ymm10
4424 vmovdqu %ymm3,0+128(%rdi)
4425 vmovdqu %ymm2,32+128(%rdi)
4426 vmovdqu %ymm6,64+128(%rdi)
4427 vmovdqu %ymm10,96+128(%rdi)
4428 addq 60*8+16(%rsi),%r10
4429 adcq 8+60*8+16(%rsi),%r11
4430 adcq $1,%r12
4431 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
4432 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
4433 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
4434 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
4435 vpxor 0+256(%rsi),%ymm3,%ymm3
4436 vpxor 32+256(%rsi),%ymm1,%ymm1
4437 vpxor 64+256(%rsi),%ymm5,%ymm5
4438 vpxor 96+256(%rsi),%ymm9,%ymm9
4439 vmovdqu %ymm3,0+256(%rdi)
4440 vmovdqu %ymm1,32+256(%rdi)
4441 vmovdqu %ymm5,64+256(%rdi)
4442 vmovdqu %ymm9,96+256(%rdi)
4443 movq 0+0(%rbp),%rax
4444 movq %rax,%r15
4445 mulq %r10
4446 movq %rax,%r13
4447 movq %rdx,%r14
4448 movq 0+0(%rbp),%rax
4449 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004450 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004451 addq %rax,%r14
4452 adcq %rdx,%r15
4453 movq 8+0(%rbp),%rax
4454 movq %rax,%r9
4455 mulq %r10
4456 addq %rax,%r14
4457 adcq $0,%rdx
4458 movq %rdx,%r10
4459 movq 8+0(%rbp),%rax
4460 mulq %r11
4461 addq %rax,%r15
4462 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004463 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004464 addq %r10,%r15
4465 adcq %rdx,%r9
4466 movq %r13,%r10
4467 movq %r14,%r11
4468 movq %r15,%r12
4469 andq $3,%r12
4470 movq %r15,%r13
4471 andq $-4,%r13
4472 movq %r9,%r14
4473 shrdq $2,%r9,%r15
4474 shrq $2,%r9
4475 addq %r13,%r10
4476 adcq %r14,%r11
4477 adcq $0,%r12
4478 addq %r15,%r10
4479 adcq %r9,%r11
4480 adcq $0,%r12
4481 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
4482 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
4483 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
4484 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
4485 vpxor 0+384(%rsi),%ymm3,%ymm3
4486 vpxor 32+384(%rsi),%ymm0,%ymm0
4487 vpxor 64+384(%rsi),%ymm4,%ymm4
4488 vpxor 96+384(%rsi),%ymm8,%ymm8
4489 vmovdqu %ymm3,0+384(%rdi)
4490 vmovdqu %ymm0,32+384(%rdi)
4491 vmovdqu %ymm4,64+384(%rdi)
4492 vmovdqu %ymm8,96+384(%rdi)
4493
4494 leaq 512(%rsi),%rsi
4495 leaq 512(%rdi),%rdi
4496 subq $512,%rbx
4497 jmp 1b
44983:
4499 testq %rbx,%rbx
4500 vzeroupper
4501 je open_sse_finalize
45023:
4503 cmpq $128,%rbx
4504 ja 3f
4505 vmovdqa .chacha20_consts(%rip),%ymm0
4506 vmovdqa 64(%rbp),%ymm4
4507 vmovdqa 96(%rbp),%ymm8
4508 vmovdqa .avx2_inc(%rip),%ymm12
4509 vpaddd 160(%rbp),%ymm12,%ymm12
4510 vmovdqa %ymm12,160(%rbp)
4511
4512 xorq %r8,%r8
4513 movq %rbx,%rcx
4514 andq $-16,%rcx
4515 testq %rcx,%rcx
4516 je 2f
45171:
4518 addq 0*8(%rsi,%r8), %r10
4519 adcq 8+0*8(%rsi,%r8), %r11
4520 adcq $1,%r12
4521 movq 0+0(%rbp),%rax
4522 movq %rax,%r15
4523 mulq %r10
4524 movq %rax,%r13
4525 movq %rdx,%r14
4526 movq 0+0(%rbp),%rax
4527 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004528 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004529 addq %rax,%r14
4530 adcq %rdx,%r15
4531 movq 8+0(%rbp),%rax
4532 movq %rax,%r9
4533 mulq %r10
4534 addq %rax,%r14
4535 adcq $0,%rdx
4536 movq %rdx,%r10
4537 movq 8+0(%rbp),%rax
4538 mulq %r11
4539 addq %rax,%r15
4540 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004541 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004542 addq %r10,%r15
4543 adcq %rdx,%r9
4544 movq %r13,%r10
4545 movq %r14,%r11
4546 movq %r15,%r12
4547 andq $3,%r12
4548 movq %r15,%r13
4549 andq $-4,%r13
4550 movq %r9,%r14
4551 shrdq $2,%r9,%r15
4552 shrq $2,%r9
4553 addq %r13,%r10
4554 adcq %r14,%r11
4555 adcq $0,%r12
4556 addq %r15,%r10
4557 adcq %r9,%r11
4558 adcq $0,%r12
4559
45602:
4561 addq $16,%r8
4562 vpaddd %ymm4,%ymm0,%ymm0
4563 vpxor %ymm0,%ymm12,%ymm12
4564 vpshufb .rol16(%rip),%ymm12,%ymm12
4565 vpaddd %ymm12,%ymm8,%ymm8
4566 vpxor %ymm8,%ymm4,%ymm4
4567 vpsrld $20,%ymm4,%ymm3
4568 vpslld $12,%ymm4,%ymm4
4569 vpxor %ymm3,%ymm4,%ymm4
4570 vpaddd %ymm4,%ymm0,%ymm0
4571 vpxor %ymm0,%ymm12,%ymm12
4572 vpshufb .rol8(%rip),%ymm12,%ymm12
4573 vpaddd %ymm12,%ymm8,%ymm8
4574 vpxor %ymm8,%ymm4,%ymm4
4575 vpslld $7,%ymm4,%ymm3
4576 vpsrld $25,%ymm4,%ymm4
4577 vpxor %ymm3,%ymm4,%ymm4
4578 vpalignr $12,%ymm12,%ymm12,%ymm12
4579 vpalignr $8,%ymm8,%ymm8,%ymm8
4580 vpalignr $4,%ymm4,%ymm4,%ymm4
4581 vpaddd %ymm4,%ymm0,%ymm0
4582 vpxor %ymm0,%ymm12,%ymm12
4583 vpshufb .rol16(%rip),%ymm12,%ymm12
4584 vpaddd %ymm12,%ymm8,%ymm8
4585 vpxor %ymm8,%ymm4,%ymm4
4586 vpsrld $20,%ymm4,%ymm3
4587 vpslld $12,%ymm4,%ymm4
4588 vpxor %ymm3,%ymm4,%ymm4
4589 vpaddd %ymm4,%ymm0,%ymm0
4590 vpxor %ymm0,%ymm12,%ymm12
4591 vpshufb .rol8(%rip),%ymm12,%ymm12
4592 vpaddd %ymm12,%ymm8,%ymm8
4593 vpxor %ymm8,%ymm4,%ymm4
4594 vpslld $7,%ymm4,%ymm3
4595 vpsrld $25,%ymm4,%ymm4
4596 vpxor %ymm3,%ymm4,%ymm4
4597 vpalignr $4,%ymm12,%ymm12,%ymm12
4598 vpalignr $8,%ymm8,%ymm8,%ymm8
4599 vpalignr $12,%ymm4,%ymm4,%ymm4
4600
4601 cmpq %rcx,%r8
4602 jb 1b
4603 cmpq $160,%r8
4604 jne 2b
4605 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4606 vpaddd 64(%rbp),%ymm4,%ymm4
4607 vpaddd 96(%rbp),%ymm8,%ymm8
4608 vpaddd 160(%rbp),%ymm12,%ymm12
4609 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
4610 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
4611 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
4612 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
4613 vmovdqa %ymm3,%ymm8
4614
4615 jmp open_avx2_tail_loop
46163:
4617 cmpq $256,%rbx
4618 ja 3f
4619 vmovdqa .chacha20_consts(%rip),%ymm0
4620 vmovdqa 64(%rbp),%ymm4
4621 vmovdqa 96(%rbp),%ymm8
4622 vmovdqa %ymm0,%ymm1
4623 vmovdqa %ymm4,%ymm5
4624 vmovdqa %ymm8,%ymm9
4625 vmovdqa .avx2_inc(%rip),%ymm12
4626 vpaddd 160(%rbp),%ymm12,%ymm13
4627 vpaddd %ymm13,%ymm12,%ymm12
4628 vmovdqa %ymm12,160(%rbp)
4629 vmovdqa %ymm13,192(%rbp)
4630
4631 movq %rbx,128(%rbp)
4632 movq %rbx,%rcx
4633 subq $128,%rcx
4634 shrq $4,%rcx
4635 movq $10,%r8
4636 cmpq $10,%rcx
4637 cmovgq %r8,%rcx
4638 movq %rsi,%rbx
4639 xorq %r8,%r8
46401:
4641 addq 0(%rbx),%r10
4642 adcq 8+0(%rbx),%r11
4643 adcq $1,%r12
4644 movq 0+0(%rbp),%rdx
4645 movq %rdx,%r15
4646 mulxq %r10,%r13,%r14
4647 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004648 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004649 addq %rax,%r14
4650 adcq %rdx,%r15
4651 movq 8+0(%rbp),%rdx
4652 mulxq %r10,%r10,%rax
4653 addq %r10,%r14
4654 mulxq %r11,%r11,%r9
4655 adcq %r11,%r15
4656 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004657 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004658 addq %rax,%r15
4659 adcq %rdx,%r9
4660 movq %r13,%r10
4661 movq %r14,%r11
4662 movq %r15,%r12
4663 andq $3,%r12
4664 movq %r15,%r13
4665 andq $-4,%r13
4666 movq %r9,%r14
4667 shrdq $2,%r9,%r15
4668 shrq $2,%r9
4669 addq %r13,%r10
4670 adcq %r14,%r11
4671 adcq $0,%r12
4672 addq %r15,%r10
4673 adcq %r9,%r11
4674 adcq $0,%r12
4675
4676 leaq 16(%rbx),%rbx
46772:
4678 vpaddd %ymm4,%ymm0,%ymm0
4679 vpxor %ymm0,%ymm12,%ymm12
4680 vpshufb .rol16(%rip),%ymm12,%ymm12
4681 vpaddd %ymm12,%ymm8,%ymm8
4682 vpxor %ymm8,%ymm4,%ymm4
4683 vpsrld $20,%ymm4,%ymm3
4684 vpslld $12,%ymm4,%ymm4
4685 vpxor %ymm3,%ymm4,%ymm4
4686 vpaddd %ymm4,%ymm0,%ymm0
4687 vpxor %ymm0,%ymm12,%ymm12
4688 vpshufb .rol8(%rip),%ymm12,%ymm12
4689 vpaddd %ymm12,%ymm8,%ymm8
4690 vpxor %ymm8,%ymm4,%ymm4
4691 vpslld $7,%ymm4,%ymm3
4692 vpsrld $25,%ymm4,%ymm4
4693 vpxor %ymm3,%ymm4,%ymm4
4694 vpalignr $12,%ymm12,%ymm12,%ymm12
4695 vpalignr $8,%ymm8,%ymm8,%ymm8
4696 vpalignr $4,%ymm4,%ymm4,%ymm4
4697 vpaddd %ymm5,%ymm1,%ymm1
4698 vpxor %ymm1,%ymm13,%ymm13
4699 vpshufb .rol16(%rip),%ymm13,%ymm13
4700 vpaddd %ymm13,%ymm9,%ymm9
4701 vpxor %ymm9,%ymm5,%ymm5
4702 vpsrld $20,%ymm5,%ymm3
4703 vpslld $12,%ymm5,%ymm5
4704 vpxor %ymm3,%ymm5,%ymm5
4705 vpaddd %ymm5,%ymm1,%ymm1
4706 vpxor %ymm1,%ymm13,%ymm13
4707 vpshufb .rol8(%rip),%ymm13,%ymm13
4708 vpaddd %ymm13,%ymm9,%ymm9
4709 vpxor %ymm9,%ymm5,%ymm5
4710 vpslld $7,%ymm5,%ymm3
4711 vpsrld $25,%ymm5,%ymm5
4712 vpxor %ymm3,%ymm5,%ymm5
4713 vpalignr $12,%ymm13,%ymm13,%ymm13
4714 vpalignr $8,%ymm9,%ymm9,%ymm9
4715 vpalignr $4,%ymm5,%ymm5,%ymm5
4716
4717 incq %r8
4718 vpaddd %ymm4,%ymm0,%ymm0
4719 vpxor %ymm0,%ymm12,%ymm12
4720 vpshufb .rol16(%rip),%ymm12,%ymm12
4721 vpaddd %ymm12,%ymm8,%ymm8
4722 vpxor %ymm8,%ymm4,%ymm4
4723 vpsrld $20,%ymm4,%ymm3
4724 vpslld $12,%ymm4,%ymm4
4725 vpxor %ymm3,%ymm4,%ymm4
4726 vpaddd %ymm4,%ymm0,%ymm0
4727 vpxor %ymm0,%ymm12,%ymm12
4728 vpshufb .rol8(%rip),%ymm12,%ymm12
4729 vpaddd %ymm12,%ymm8,%ymm8
4730 vpxor %ymm8,%ymm4,%ymm4
4731 vpslld $7,%ymm4,%ymm3
4732 vpsrld $25,%ymm4,%ymm4
4733 vpxor %ymm3,%ymm4,%ymm4
4734 vpalignr $4,%ymm12,%ymm12,%ymm12
4735 vpalignr $8,%ymm8,%ymm8,%ymm8
4736 vpalignr $12,%ymm4,%ymm4,%ymm4
4737 vpaddd %ymm5,%ymm1,%ymm1
4738 vpxor %ymm1,%ymm13,%ymm13
4739 vpshufb .rol16(%rip),%ymm13,%ymm13
4740 vpaddd %ymm13,%ymm9,%ymm9
4741 vpxor %ymm9,%ymm5,%ymm5
4742 vpsrld $20,%ymm5,%ymm3
4743 vpslld $12,%ymm5,%ymm5
4744 vpxor %ymm3,%ymm5,%ymm5
4745 vpaddd %ymm5,%ymm1,%ymm1
4746 vpxor %ymm1,%ymm13,%ymm13
4747 vpshufb .rol8(%rip),%ymm13,%ymm13
4748 vpaddd %ymm13,%ymm9,%ymm9
4749 vpxor %ymm9,%ymm5,%ymm5
4750 vpslld $7,%ymm5,%ymm3
4751 vpsrld $25,%ymm5,%ymm5
4752 vpxor %ymm3,%ymm5,%ymm5
4753 vpalignr $4,%ymm13,%ymm13,%ymm13
4754 vpalignr $8,%ymm9,%ymm9,%ymm9
4755 vpalignr $12,%ymm5,%ymm5,%ymm5
4756 vpaddd %ymm6,%ymm2,%ymm2
4757 vpxor %ymm2,%ymm14,%ymm14
4758 vpshufb .rol16(%rip),%ymm14,%ymm14
4759 vpaddd %ymm14,%ymm10,%ymm10
4760 vpxor %ymm10,%ymm6,%ymm6
4761 vpsrld $20,%ymm6,%ymm3
4762 vpslld $12,%ymm6,%ymm6
4763 vpxor %ymm3,%ymm6,%ymm6
4764 vpaddd %ymm6,%ymm2,%ymm2
4765 vpxor %ymm2,%ymm14,%ymm14
4766 vpshufb .rol8(%rip),%ymm14,%ymm14
4767 vpaddd %ymm14,%ymm10,%ymm10
4768 vpxor %ymm10,%ymm6,%ymm6
4769 vpslld $7,%ymm6,%ymm3
4770 vpsrld $25,%ymm6,%ymm6
4771 vpxor %ymm3,%ymm6,%ymm6
4772 vpalignr $4,%ymm14,%ymm14,%ymm14
4773 vpalignr $8,%ymm10,%ymm10,%ymm10
4774 vpalignr $12,%ymm6,%ymm6,%ymm6
4775
4776 cmpq %rcx,%r8
4777 jb 1b
4778 cmpq $10,%r8
4779 jne 2b
4780 movq %rbx,%r8
4781 subq %rsi,%rbx
4782 movq %rbx,%rcx
4783 movq 128(%rbp),%rbx
47841:
4785 addq $16,%rcx
4786 cmpq %rbx,%rcx
4787 jg 1f
4788 addq 0(%r8),%r10
4789 adcq 8+0(%r8),%r11
4790 adcq $1,%r12
4791 movq 0+0(%rbp),%rdx
4792 movq %rdx,%r15
4793 mulxq %r10,%r13,%r14
4794 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004795 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004796 addq %rax,%r14
4797 adcq %rdx,%r15
4798 movq 8+0(%rbp),%rdx
4799 mulxq %r10,%r10,%rax
4800 addq %r10,%r14
4801 mulxq %r11,%r11,%r9
4802 adcq %r11,%r15
4803 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004804 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004805 addq %rax,%r15
4806 adcq %rdx,%r9
4807 movq %r13,%r10
4808 movq %r14,%r11
4809 movq %r15,%r12
4810 andq $3,%r12
4811 movq %r15,%r13
4812 andq $-4,%r13
4813 movq %r9,%r14
4814 shrdq $2,%r9,%r15
4815 shrq $2,%r9
4816 addq %r13,%r10
4817 adcq %r14,%r11
4818 adcq $0,%r12
4819 addq %r15,%r10
4820 adcq %r9,%r11
4821 adcq $0,%r12
4822
4823 leaq 16(%r8),%r8
4824 jmp 1b
48251:
4826 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
4827 vpaddd 64(%rbp),%ymm5,%ymm5
4828 vpaddd 96(%rbp),%ymm9,%ymm9
4829 vpaddd 192(%rbp),%ymm13,%ymm13
4830 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4831 vpaddd 64(%rbp),%ymm4,%ymm4
4832 vpaddd 96(%rbp),%ymm8,%ymm8
4833 vpaddd 160(%rbp),%ymm12,%ymm12
4834 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
4835 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
4836 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
4837 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
4838 vpxor 0+0(%rsi),%ymm3,%ymm3
4839 vpxor 32+0(%rsi),%ymm1,%ymm1
4840 vpxor 64+0(%rsi),%ymm5,%ymm5
4841 vpxor 96+0(%rsi),%ymm9,%ymm9
4842 vmovdqu %ymm3,0+0(%rdi)
4843 vmovdqu %ymm1,32+0(%rdi)
4844 vmovdqu %ymm5,64+0(%rdi)
4845 vmovdqu %ymm9,96+0(%rdi)
4846 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
4847 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
4848 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
4849 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
4850 vmovdqa %ymm3,%ymm8
4851
4852 leaq 128(%rsi),%rsi
4853 leaq 128(%rdi),%rdi
4854 subq $128,%rbx
4855 jmp open_avx2_tail_loop
48563:
4857 cmpq $384,%rbx
4858 ja 3f
4859 vmovdqa .chacha20_consts(%rip),%ymm0
4860 vmovdqa 64(%rbp),%ymm4
4861 vmovdqa 96(%rbp),%ymm8
4862 vmovdqa %ymm0,%ymm1
4863 vmovdqa %ymm4,%ymm5
4864 vmovdqa %ymm8,%ymm9
4865 vmovdqa %ymm0,%ymm2
4866 vmovdqa %ymm4,%ymm6
4867 vmovdqa %ymm8,%ymm10
4868 vmovdqa .avx2_inc(%rip),%ymm12
4869 vpaddd 160(%rbp),%ymm12,%ymm14
4870 vpaddd %ymm14,%ymm12,%ymm13
4871 vpaddd %ymm13,%ymm12,%ymm12
4872 vmovdqa %ymm12,160(%rbp)
4873 vmovdqa %ymm13,192(%rbp)
4874 vmovdqa %ymm14,224(%rbp)
4875
4876 movq %rbx,128(%rbp)
4877 movq %rbx,%rcx
4878 subq $256,%rcx
4879 shrq $4,%rcx
4880 addq $6,%rcx
4881 movq $10,%r8
4882 cmpq $10,%rcx
4883 cmovgq %r8,%rcx
4884 movq %rsi,%rbx
4885 xorq %r8,%r8
48861:
4887 addq 0(%rbx),%r10
4888 adcq 8+0(%rbx),%r11
4889 adcq $1,%r12
4890 movq 0+0(%rbp),%rdx
4891 movq %rdx,%r15
4892 mulxq %r10,%r13,%r14
4893 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004894 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004895 addq %rax,%r14
4896 adcq %rdx,%r15
4897 movq 8+0(%rbp),%rdx
4898 mulxq %r10,%r10,%rax
4899 addq %r10,%r14
4900 mulxq %r11,%r11,%r9
4901 adcq %r11,%r15
4902 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004903 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004904 addq %rax,%r15
4905 adcq %rdx,%r9
4906 movq %r13,%r10
4907 movq %r14,%r11
4908 movq %r15,%r12
4909 andq $3,%r12
4910 movq %r15,%r13
4911 andq $-4,%r13
4912 movq %r9,%r14
4913 shrdq $2,%r9,%r15
4914 shrq $2,%r9
4915 addq %r13,%r10
4916 adcq %r14,%r11
4917 adcq $0,%r12
4918 addq %r15,%r10
4919 adcq %r9,%r11
4920 adcq $0,%r12
4921
4922 leaq 16(%rbx),%rbx
49232:
4924 vpaddd %ymm6,%ymm2,%ymm2
4925 vpxor %ymm2,%ymm14,%ymm14
4926 vpshufb .rol16(%rip),%ymm14,%ymm14
4927 vpaddd %ymm14,%ymm10,%ymm10
4928 vpxor %ymm10,%ymm6,%ymm6
4929 vpsrld $20,%ymm6,%ymm3
4930 vpslld $12,%ymm6,%ymm6
4931 vpxor %ymm3,%ymm6,%ymm6
4932 vpaddd %ymm6,%ymm2,%ymm2
4933 vpxor %ymm2,%ymm14,%ymm14
4934 vpshufb .rol8(%rip),%ymm14,%ymm14
4935 vpaddd %ymm14,%ymm10,%ymm10
4936 vpxor %ymm10,%ymm6,%ymm6
4937 vpslld $7,%ymm6,%ymm3
4938 vpsrld $25,%ymm6,%ymm6
4939 vpxor %ymm3,%ymm6,%ymm6
4940 vpalignr $12,%ymm14,%ymm14,%ymm14
4941 vpalignr $8,%ymm10,%ymm10,%ymm10
4942 vpalignr $4,%ymm6,%ymm6,%ymm6
4943 vpaddd %ymm5,%ymm1,%ymm1
4944 vpxor %ymm1,%ymm13,%ymm13
4945 vpshufb .rol16(%rip),%ymm13,%ymm13
4946 vpaddd %ymm13,%ymm9,%ymm9
4947 vpxor %ymm9,%ymm5,%ymm5
4948 vpsrld $20,%ymm5,%ymm3
4949 vpslld $12,%ymm5,%ymm5
4950 vpxor %ymm3,%ymm5,%ymm5
4951 vpaddd %ymm5,%ymm1,%ymm1
4952 vpxor %ymm1,%ymm13,%ymm13
4953 vpshufb .rol8(%rip),%ymm13,%ymm13
4954 vpaddd %ymm13,%ymm9,%ymm9
4955 vpxor %ymm9,%ymm5,%ymm5
4956 vpslld $7,%ymm5,%ymm3
4957 vpsrld $25,%ymm5,%ymm5
4958 vpxor %ymm3,%ymm5,%ymm5
4959 vpalignr $12,%ymm13,%ymm13,%ymm13
4960 vpalignr $8,%ymm9,%ymm9,%ymm9
4961 vpalignr $4,%ymm5,%ymm5,%ymm5
4962 vpaddd %ymm4,%ymm0,%ymm0
4963 vpxor %ymm0,%ymm12,%ymm12
4964 vpshufb .rol16(%rip),%ymm12,%ymm12
4965 vpaddd %ymm12,%ymm8,%ymm8
4966 vpxor %ymm8,%ymm4,%ymm4
4967 vpsrld $20,%ymm4,%ymm3
4968 vpslld $12,%ymm4,%ymm4
4969 vpxor %ymm3,%ymm4,%ymm4
4970 vpaddd %ymm4,%ymm0,%ymm0
4971 vpxor %ymm0,%ymm12,%ymm12
4972 vpshufb .rol8(%rip),%ymm12,%ymm12
4973 vpaddd %ymm12,%ymm8,%ymm8
4974 vpxor %ymm8,%ymm4,%ymm4
4975 vpslld $7,%ymm4,%ymm3
4976 vpsrld $25,%ymm4,%ymm4
4977 vpxor %ymm3,%ymm4,%ymm4
4978 vpalignr $12,%ymm12,%ymm12,%ymm12
4979 vpalignr $8,%ymm8,%ymm8,%ymm8
4980 vpalignr $4,%ymm4,%ymm4,%ymm4
4981 addq 0(%rbx),%r10
4982 adcq 8+0(%rbx),%r11
4983 adcq $1,%r12
4984 movq 0+0(%rbp),%rax
4985 movq %rax,%r15
4986 mulq %r10
4987 movq %rax,%r13
4988 movq %rdx,%r14
4989 movq 0+0(%rbp),%rax
4990 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004991 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004992 addq %rax,%r14
4993 adcq %rdx,%r15
4994 movq 8+0(%rbp),%rax
4995 movq %rax,%r9
4996 mulq %r10
4997 addq %rax,%r14
4998 adcq $0,%rdx
4999 movq %rdx,%r10
5000 movq 8+0(%rbp),%rax
5001 mulq %r11
5002 addq %rax,%r15
5003 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005004 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005005 addq %r10,%r15
5006 adcq %rdx,%r9
5007 movq %r13,%r10
5008 movq %r14,%r11
5009 movq %r15,%r12
5010 andq $3,%r12
5011 movq %r15,%r13
5012 andq $-4,%r13
5013 movq %r9,%r14
5014 shrdq $2,%r9,%r15
5015 shrq $2,%r9
5016 addq %r13,%r10
5017 adcq %r14,%r11
5018 adcq $0,%r12
5019 addq %r15,%r10
5020 adcq %r9,%r11
5021 adcq $0,%r12
5022
5023 leaq 16(%rbx),%rbx
5024 incq %r8
5025 vpaddd %ymm6,%ymm2,%ymm2
5026 vpxor %ymm2,%ymm14,%ymm14
5027 vpshufb .rol16(%rip),%ymm14,%ymm14
5028 vpaddd %ymm14,%ymm10,%ymm10
5029 vpxor %ymm10,%ymm6,%ymm6
5030 vpsrld $20,%ymm6,%ymm3
5031 vpslld $12,%ymm6,%ymm6
5032 vpxor %ymm3,%ymm6,%ymm6
5033 vpaddd %ymm6,%ymm2,%ymm2
5034 vpxor %ymm2,%ymm14,%ymm14
5035 vpshufb .rol8(%rip),%ymm14,%ymm14
5036 vpaddd %ymm14,%ymm10,%ymm10
5037 vpxor %ymm10,%ymm6,%ymm6
5038 vpslld $7,%ymm6,%ymm3
5039 vpsrld $25,%ymm6,%ymm6
5040 vpxor %ymm3,%ymm6,%ymm6
5041 vpalignr $4,%ymm14,%ymm14,%ymm14
5042 vpalignr $8,%ymm10,%ymm10,%ymm10
5043 vpalignr $12,%ymm6,%ymm6,%ymm6
5044 vpaddd %ymm5,%ymm1,%ymm1
5045 vpxor %ymm1,%ymm13,%ymm13
5046 vpshufb .rol16(%rip),%ymm13,%ymm13
5047 vpaddd %ymm13,%ymm9,%ymm9
5048 vpxor %ymm9,%ymm5,%ymm5
5049 vpsrld $20,%ymm5,%ymm3
5050 vpslld $12,%ymm5,%ymm5
5051 vpxor %ymm3,%ymm5,%ymm5
5052 vpaddd %ymm5,%ymm1,%ymm1
5053 vpxor %ymm1,%ymm13,%ymm13
5054 vpshufb .rol8(%rip),%ymm13,%ymm13
5055 vpaddd %ymm13,%ymm9,%ymm9
5056 vpxor %ymm9,%ymm5,%ymm5
5057 vpslld $7,%ymm5,%ymm3
5058 vpsrld $25,%ymm5,%ymm5
5059 vpxor %ymm3,%ymm5,%ymm5
5060 vpalignr $4,%ymm13,%ymm13,%ymm13
5061 vpalignr $8,%ymm9,%ymm9,%ymm9
5062 vpalignr $12,%ymm5,%ymm5,%ymm5
5063 vpaddd %ymm4,%ymm0,%ymm0
5064 vpxor %ymm0,%ymm12,%ymm12
5065 vpshufb .rol16(%rip),%ymm12,%ymm12
5066 vpaddd %ymm12,%ymm8,%ymm8
5067 vpxor %ymm8,%ymm4,%ymm4
5068 vpsrld $20,%ymm4,%ymm3
5069 vpslld $12,%ymm4,%ymm4
5070 vpxor %ymm3,%ymm4,%ymm4
5071 vpaddd %ymm4,%ymm0,%ymm0
5072 vpxor %ymm0,%ymm12,%ymm12
5073 vpshufb .rol8(%rip),%ymm12,%ymm12
5074 vpaddd %ymm12,%ymm8,%ymm8
5075 vpxor %ymm8,%ymm4,%ymm4
5076 vpslld $7,%ymm4,%ymm3
5077 vpsrld $25,%ymm4,%ymm4
5078 vpxor %ymm3,%ymm4,%ymm4
5079 vpalignr $4,%ymm12,%ymm12,%ymm12
5080 vpalignr $8,%ymm8,%ymm8,%ymm8
5081 vpalignr $12,%ymm4,%ymm4,%ymm4
5082
5083 cmpq %rcx,%r8
5084 jb 1b
5085 cmpq $10,%r8
5086 jne 2b
5087 movq %rbx,%r8
5088 subq %rsi,%rbx
5089 movq %rbx,%rcx
5090 movq 128(%rbp),%rbx
50911:
5092 addq $16,%rcx
5093 cmpq %rbx,%rcx
5094 jg 1f
5095 addq 0(%r8),%r10
5096 adcq 8+0(%r8),%r11
5097 adcq $1,%r12
5098 movq 0+0(%rbp),%rdx
5099 movq %rdx,%r15
5100 mulxq %r10,%r13,%r14
5101 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005102 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005103 addq %rax,%r14
5104 adcq %rdx,%r15
5105 movq 8+0(%rbp),%rdx
5106 mulxq %r10,%r10,%rax
5107 addq %r10,%r14
5108 mulxq %r11,%r11,%r9
5109 adcq %r11,%r15
5110 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005111 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005112 addq %rax,%r15
5113 adcq %rdx,%r9
5114 movq %r13,%r10
5115 movq %r14,%r11
5116 movq %r15,%r12
5117 andq $3,%r12
5118 movq %r15,%r13
5119 andq $-4,%r13
5120 movq %r9,%r14
5121 shrdq $2,%r9,%r15
5122 shrq $2,%r9
5123 addq %r13,%r10
5124 adcq %r14,%r11
5125 adcq $0,%r12
5126 addq %r15,%r10
5127 adcq %r9,%r11
5128 adcq $0,%r12
5129
5130 leaq 16(%r8),%r8
5131 jmp 1b
51321:
5133 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
5134 vpaddd 64(%rbp),%ymm6,%ymm6
5135 vpaddd 96(%rbp),%ymm10,%ymm10
5136 vpaddd 224(%rbp),%ymm14,%ymm14
5137 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
5138 vpaddd 64(%rbp),%ymm5,%ymm5
5139 vpaddd 96(%rbp),%ymm9,%ymm9
5140 vpaddd 192(%rbp),%ymm13,%ymm13
5141 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
5142 vpaddd 64(%rbp),%ymm4,%ymm4
5143 vpaddd 96(%rbp),%ymm8,%ymm8
5144 vpaddd 160(%rbp),%ymm12,%ymm12
5145 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
5146 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
5147 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
5148 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
5149 vpxor 0+0(%rsi),%ymm3,%ymm3
5150 vpxor 32+0(%rsi),%ymm2,%ymm2
5151 vpxor 64+0(%rsi),%ymm6,%ymm6
5152 vpxor 96+0(%rsi),%ymm10,%ymm10
5153 vmovdqu %ymm3,0+0(%rdi)
5154 vmovdqu %ymm2,32+0(%rdi)
5155 vmovdqu %ymm6,64+0(%rdi)
5156 vmovdqu %ymm10,96+0(%rdi)
5157 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
5158 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
5159 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
5160 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
5161 vpxor 0+128(%rsi),%ymm3,%ymm3
5162 vpxor 32+128(%rsi),%ymm1,%ymm1
5163 vpxor 64+128(%rsi),%ymm5,%ymm5
5164 vpxor 96+128(%rsi),%ymm9,%ymm9
5165 vmovdqu %ymm3,0+128(%rdi)
5166 vmovdqu %ymm1,32+128(%rdi)
5167 vmovdqu %ymm5,64+128(%rdi)
5168 vmovdqu %ymm9,96+128(%rdi)
5169 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
5170 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
5171 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
5172 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
5173 vmovdqa %ymm3,%ymm8
5174
5175 leaq 256(%rsi),%rsi
5176 leaq 256(%rdi),%rdi
5177 subq $256,%rbx
5178 jmp open_avx2_tail_loop
51793:
5180 vmovdqa .chacha20_consts(%rip),%ymm0
5181 vmovdqa 64(%rbp),%ymm4
5182 vmovdqa 96(%rbp),%ymm8
5183 vmovdqa %ymm0,%ymm1
5184 vmovdqa %ymm4,%ymm5
5185 vmovdqa %ymm8,%ymm9
5186 vmovdqa %ymm0,%ymm2
5187 vmovdqa %ymm4,%ymm6
5188 vmovdqa %ymm8,%ymm10
5189 vmovdqa %ymm0,%ymm3
5190 vmovdqa %ymm4,%ymm7
5191 vmovdqa %ymm8,%ymm11
5192 vmovdqa .avx2_inc(%rip),%ymm12
5193 vpaddd 160(%rbp),%ymm12,%ymm15
5194 vpaddd %ymm15,%ymm12,%ymm14
5195 vpaddd %ymm14,%ymm12,%ymm13
5196 vpaddd %ymm13,%ymm12,%ymm12
5197 vmovdqa %ymm15,256(%rbp)
5198 vmovdqa %ymm14,224(%rbp)
5199 vmovdqa %ymm13,192(%rbp)
5200 vmovdqa %ymm12,160(%rbp)
5201
5202 xorq %rcx,%rcx
5203 movq %rsi,%r8
52041:
5205 addq 0(%r8),%r10
5206 adcq 8+0(%r8),%r11
5207 adcq $1,%r12
5208 movq 0+0(%rbp),%rax
5209 movq %rax,%r15
5210 mulq %r10
5211 movq %rax,%r13
5212 movq %rdx,%r14
5213 movq 0+0(%rbp),%rax
5214 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005215 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005216 addq %rax,%r14
5217 adcq %rdx,%r15
5218 movq 8+0(%rbp),%rax
5219 movq %rax,%r9
5220 mulq %r10
5221 addq %rax,%r14
5222 adcq $0,%rdx
5223 movq %rdx,%r10
5224 movq 8+0(%rbp),%rax
5225 mulq %r11
5226 addq %rax,%r15
5227 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005228 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005229 addq %r10,%r15
5230 adcq %rdx,%r9
5231 movq %r13,%r10
5232 movq %r14,%r11
5233 movq %r15,%r12
5234 andq $3,%r12
5235 movq %r15,%r13
5236 andq $-4,%r13
5237 movq %r9,%r14
5238 shrdq $2,%r9,%r15
5239 shrq $2,%r9
5240 addq %r13,%r10
5241 adcq %r14,%r11
5242 adcq $0,%r12
5243 addq %r15,%r10
5244 adcq %r9,%r11
5245 adcq $0,%r12
5246
5247 leaq 16(%r8),%r8
52482:
5249 vmovdqa %ymm8,128(%rbp)
5250 vmovdqa .rol16(%rip),%ymm8
5251 vpaddd %ymm7,%ymm3,%ymm3
5252 vpaddd %ymm6,%ymm2,%ymm2
5253 vpaddd %ymm5,%ymm1,%ymm1
5254 vpaddd %ymm4,%ymm0,%ymm0
5255 vpxor %ymm3,%ymm15,%ymm15
5256 vpxor %ymm2,%ymm14,%ymm14
5257 vpxor %ymm1,%ymm13,%ymm13
5258 vpxor %ymm0,%ymm12,%ymm12
5259 vpshufb %ymm8,%ymm15,%ymm15
5260 vpshufb %ymm8,%ymm14,%ymm14
5261 vpshufb %ymm8,%ymm13,%ymm13
5262 vpshufb %ymm8,%ymm12,%ymm12
5263 vmovdqa 128(%rbp),%ymm8
5264 vpaddd %ymm15,%ymm11,%ymm11
5265 vpaddd %ymm14,%ymm10,%ymm10
5266 vpaddd %ymm13,%ymm9,%ymm9
5267 vpaddd %ymm12,%ymm8,%ymm8
5268 vpxor %ymm11,%ymm7,%ymm7
5269 vpxor %ymm10,%ymm6,%ymm6
5270 vpxor %ymm9,%ymm5,%ymm5
5271 vpxor %ymm8,%ymm4,%ymm4
5272 vmovdqa %ymm8,128(%rbp)
5273 vpsrld $20,%ymm7,%ymm8
5274 vpslld $32-20,%ymm7,%ymm7
5275 vpxor %ymm8,%ymm7,%ymm7
5276 vpsrld $20,%ymm6,%ymm8
5277 vpslld $32-20,%ymm6,%ymm6
5278 vpxor %ymm8,%ymm6,%ymm6
5279 vpsrld $20,%ymm5,%ymm8
5280 vpslld $32-20,%ymm5,%ymm5
5281 vpxor %ymm8,%ymm5,%ymm5
5282 vpsrld $20,%ymm4,%ymm8
5283 vpslld $32-20,%ymm4,%ymm4
5284 vpxor %ymm8,%ymm4,%ymm4
5285 vmovdqa .rol8(%rip),%ymm8
5286 addq 0(%r8),%r10
5287 adcq 8+0(%r8),%r11
5288 adcq $1,%r12
5289 movq 0+0(%rbp),%rdx
5290 movq %rdx,%r15
5291 mulxq %r10,%r13,%r14
5292 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005293 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005294 addq %rax,%r14
5295 adcq %rdx,%r15
5296 movq 8+0(%rbp),%rdx
5297 mulxq %r10,%r10,%rax
5298 addq %r10,%r14
5299 mulxq %r11,%r11,%r9
5300 adcq %r11,%r15
5301 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005302 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005303 addq %rax,%r15
5304 adcq %rdx,%r9
5305 movq %r13,%r10
5306 movq %r14,%r11
5307 movq %r15,%r12
5308 andq $3,%r12
5309 movq %r15,%r13
5310 andq $-4,%r13
5311 movq %r9,%r14
5312 shrdq $2,%r9,%r15
5313 shrq $2,%r9
5314 addq %r13,%r10
5315 adcq %r14,%r11
5316 adcq $0,%r12
5317 addq %r15,%r10
5318 adcq %r9,%r11
5319 adcq $0,%r12
5320 vpaddd %ymm7,%ymm3,%ymm3
5321 vpaddd %ymm6,%ymm2,%ymm2
5322 vpaddd %ymm5,%ymm1,%ymm1
5323 vpaddd %ymm4,%ymm0,%ymm0
5324 vpxor %ymm3,%ymm15,%ymm15
5325 vpxor %ymm2,%ymm14,%ymm14
5326 vpxor %ymm1,%ymm13,%ymm13
5327 vpxor %ymm0,%ymm12,%ymm12
5328 vpshufb %ymm8,%ymm15,%ymm15
5329 vpshufb %ymm8,%ymm14,%ymm14
5330 vpshufb %ymm8,%ymm13,%ymm13
5331 vpshufb %ymm8,%ymm12,%ymm12
5332 vmovdqa 128(%rbp),%ymm8
5333 vpaddd %ymm15,%ymm11,%ymm11
5334 vpaddd %ymm14,%ymm10,%ymm10
5335 vpaddd %ymm13,%ymm9,%ymm9
5336 vpaddd %ymm12,%ymm8,%ymm8
5337 vpxor %ymm11,%ymm7,%ymm7
5338 vpxor %ymm10,%ymm6,%ymm6
5339 vpxor %ymm9,%ymm5,%ymm5
5340 vpxor %ymm8,%ymm4,%ymm4
5341 vmovdqa %ymm8,128(%rbp)
5342 vpsrld $25,%ymm7,%ymm8
5343 vpslld $32-25,%ymm7,%ymm7
5344 vpxor %ymm8,%ymm7,%ymm7
5345 vpsrld $25,%ymm6,%ymm8
5346 vpslld $32-25,%ymm6,%ymm6
5347 vpxor %ymm8,%ymm6,%ymm6
5348 vpsrld $25,%ymm5,%ymm8
5349 vpslld $32-25,%ymm5,%ymm5
5350 vpxor %ymm8,%ymm5,%ymm5
5351 vpsrld $25,%ymm4,%ymm8
5352 vpslld $32-25,%ymm4,%ymm4
5353 vpxor %ymm8,%ymm4,%ymm4
5354 vmovdqa 128(%rbp),%ymm8
5355 vpalignr $4,%ymm7,%ymm7,%ymm7
5356 vpalignr $8,%ymm11,%ymm11,%ymm11
5357 vpalignr $12,%ymm15,%ymm15,%ymm15
5358 vpalignr $4,%ymm6,%ymm6,%ymm6
5359 vpalignr $8,%ymm10,%ymm10,%ymm10
5360 vpalignr $12,%ymm14,%ymm14,%ymm14
5361 vpalignr $4,%ymm5,%ymm5,%ymm5
5362 vpalignr $8,%ymm9,%ymm9,%ymm9
5363 vpalignr $12,%ymm13,%ymm13,%ymm13
5364 vpalignr $4,%ymm4,%ymm4,%ymm4
5365 vpalignr $8,%ymm8,%ymm8,%ymm8
5366 vpalignr $12,%ymm12,%ymm12,%ymm12
5367 vmovdqa %ymm8,128(%rbp)
5368 addq 16(%r8),%r10
5369 adcq 8+16(%r8),%r11
5370 adcq $1,%r12
5371 movq 0+0(%rbp),%rdx
5372 movq %rdx,%r15
5373 mulxq %r10,%r13,%r14
5374 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005375 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005376 addq %rax,%r14
5377 adcq %rdx,%r15
5378 movq 8+0(%rbp),%rdx
5379 mulxq %r10,%r10,%rax
5380 addq %r10,%r14
5381 mulxq %r11,%r11,%r9
5382 adcq %r11,%r15
5383 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005384 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005385 addq %rax,%r15
5386 adcq %rdx,%r9
5387 movq %r13,%r10
5388 movq %r14,%r11
5389 movq %r15,%r12
5390 andq $3,%r12
5391 movq %r15,%r13
5392 andq $-4,%r13
5393 movq %r9,%r14
5394 shrdq $2,%r9,%r15
5395 shrq $2,%r9
5396 addq %r13,%r10
5397 adcq %r14,%r11
5398 adcq $0,%r12
5399 addq %r15,%r10
5400 adcq %r9,%r11
5401 adcq $0,%r12
5402
5403 leaq 32(%r8),%r8
5404 vmovdqa .rol16(%rip),%ymm8
5405 vpaddd %ymm7,%ymm3,%ymm3
5406 vpaddd %ymm6,%ymm2,%ymm2
5407 vpaddd %ymm5,%ymm1,%ymm1
5408 vpaddd %ymm4,%ymm0,%ymm0
5409 vpxor %ymm3,%ymm15,%ymm15
5410 vpxor %ymm2,%ymm14,%ymm14
5411 vpxor %ymm1,%ymm13,%ymm13
5412 vpxor %ymm0,%ymm12,%ymm12
5413 vpshufb %ymm8,%ymm15,%ymm15
5414 vpshufb %ymm8,%ymm14,%ymm14
5415 vpshufb %ymm8,%ymm13,%ymm13
5416 vpshufb %ymm8,%ymm12,%ymm12
5417 vmovdqa 128(%rbp),%ymm8
5418 vpaddd %ymm15,%ymm11,%ymm11
5419 vpaddd %ymm14,%ymm10,%ymm10
5420 vpaddd %ymm13,%ymm9,%ymm9
5421 vpaddd %ymm12,%ymm8,%ymm8
5422 vpxor %ymm11,%ymm7,%ymm7
5423 vpxor %ymm10,%ymm6,%ymm6
5424 vpxor %ymm9,%ymm5,%ymm5
5425 vpxor %ymm8,%ymm4,%ymm4
5426 vmovdqa %ymm8,128(%rbp)
5427 vpsrld $20,%ymm7,%ymm8
5428 vpslld $32-20,%ymm7,%ymm7
5429 vpxor %ymm8,%ymm7,%ymm7
5430 vpsrld $20,%ymm6,%ymm8
5431 vpslld $32-20,%ymm6,%ymm6
5432 vpxor %ymm8,%ymm6,%ymm6
5433 vpsrld $20,%ymm5,%ymm8
5434 vpslld $32-20,%ymm5,%ymm5
5435 vpxor %ymm8,%ymm5,%ymm5
5436 vpsrld $20,%ymm4,%ymm8
5437 vpslld $32-20,%ymm4,%ymm4
5438 vpxor %ymm8,%ymm4,%ymm4
5439 vmovdqa .rol8(%rip),%ymm8
5440 vpaddd %ymm7,%ymm3,%ymm3
5441 vpaddd %ymm6,%ymm2,%ymm2
5442 vpaddd %ymm5,%ymm1,%ymm1
5443 vpaddd %ymm4,%ymm0,%ymm0
5444 vpxor %ymm3,%ymm15,%ymm15
5445 vpxor %ymm2,%ymm14,%ymm14
5446 vpxor %ymm1,%ymm13,%ymm13
5447 vpxor %ymm0,%ymm12,%ymm12
5448 vpshufb %ymm8,%ymm15,%ymm15
5449 vpshufb %ymm8,%ymm14,%ymm14
5450 vpshufb %ymm8,%ymm13,%ymm13
5451 vpshufb %ymm8,%ymm12,%ymm12
5452 vmovdqa 128(%rbp),%ymm8
5453 vpaddd %ymm15,%ymm11,%ymm11
5454 vpaddd %ymm14,%ymm10,%ymm10
5455 vpaddd %ymm13,%ymm9,%ymm9
5456 vpaddd %ymm12,%ymm8,%ymm8
5457 vpxor %ymm11,%ymm7,%ymm7
5458 vpxor %ymm10,%ymm6,%ymm6
5459 vpxor %ymm9,%ymm5,%ymm5
5460 vpxor %ymm8,%ymm4,%ymm4
5461 vmovdqa %ymm8,128(%rbp)
5462 vpsrld $25,%ymm7,%ymm8
5463 vpslld $32-25,%ymm7,%ymm7
5464 vpxor %ymm8,%ymm7,%ymm7
5465 vpsrld $25,%ymm6,%ymm8
5466 vpslld $32-25,%ymm6,%ymm6
5467 vpxor %ymm8,%ymm6,%ymm6
5468 vpsrld $25,%ymm5,%ymm8
5469 vpslld $32-25,%ymm5,%ymm5
5470 vpxor %ymm8,%ymm5,%ymm5
5471 vpsrld $25,%ymm4,%ymm8
5472 vpslld $32-25,%ymm4,%ymm4
5473 vpxor %ymm8,%ymm4,%ymm4
5474 vmovdqa 128(%rbp),%ymm8
5475 vpalignr $12,%ymm7,%ymm7,%ymm7
5476 vpalignr $8,%ymm11,%ymm11,%ymm11
5477 vpalignr $4,%ymm15,%ymm15,%ymm15
5478 vpalignr $12,%ymm6,%ymm6,%ymm6
5479 vpalignr $8,%ymm10,%ymm10,%ymm10
5480 vpalignr $4,%ymm14,%ymm14,%ymm14
5481 vpalignr $12,%ymm5,%ymm5,%ymm5
5482 vpalignr $8,%ymm9,%ymm9,%ymm9
5483 vpalignr $4,%ymm13,%ymm13,%ymm13
5484 vpalignr $12,%ymm4,%ymm4,%ymm4
5485 vpalignr $8,%ymm8,%ymm8,%ymm8
5486 vpalignr $4,%ymm12,%ymm12,%ymm12
5487
5488 incq %rcx
5489 cmpq $4,%rcx
5490 jl 1b
5491 cmpq $10,%rcx
5492 jne 2b
5493 movq %rbx,%rcx
5494 subq $384,%rcx
5495 andq $-16,%rcx
54961:
5497 testq %rcx,%rcx
5498 je 1f
5499 addq 0(%r8),%r10
5500 adcq 8+0(%r8),%r11
5501 adcq $1,%r12
5502 movq 0+0(%rbp),%rdx
5503 movq %rdx,%r15
5504 mulxq %r10,%r13,%r14
5505 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005506 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005507 addq %rax,%r14
5508 adcq %rdx,%r15
5509 movq 8+0(%rbp),%rdx
5510 mulxq %r10,%r10,%rax
5511 addq %r10,%r14
5512 mulxq %r11,%r11,%r9
5513 adcq %r11,%r15
5514 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005515 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005516 addq %rax,%r15
5517 adcq %rdx,%r9
5518 movq %r13,%r10
5519 movq %r14,%r11
5520 movq %r15,%r12
5521 andq $3,%r12
5522 movq %r15,%r13
5523 andq $-4,%r13
5524 movq %r9,%r14
5525 shrdq $2,%r9,%r15
5526 shrq $2,%r9
5527 addq %r13,%r10
5528 adcq %r14,%r11
5529 adcq $0,%r12
5530 addq %r15,%r10
5531 adcq %r9,%r11
5532 adcq $0,%r12
5533
5534 leaq 16(%r8),%r8
5535 subq $16,%rcx
5536 jmp 1b
55371:
5538 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
5539 vpaddd 64(%rbp),%ymm7,%ymm7
5540 vpaddd 96(%rbp),%ymm11,%ymm11
5541 vpaddd 256(%rbp),%ymm15,%ymm15
5542 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
5543 vpaddd 64(%rbp),%ymm6,%ymm6
5544 vpaddd 96(%rbp),%ymm10,%ymm10
5545 vpaddd 224(%rbp),%ymm14,%ymm14
5546 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
5547 vpaddd 64(%rbp),%ymm5,%ymm5
5548 vpaddd 96(%rbp),%ymm9,%ymm9
5549 vpaddd 192(%rbp),%ymm13,%ymm13
5550 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
5551 vpaddd 64(%rbp),%ymm4,%ymm4
5552 vpaddd 96(%rbp),%ymm8,%ymm8
5553 vpaddd 160(%rbp),%ymm12,%ymm12
5554
5555 vmovdqa %ymm0,128(%rbp)
5556 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
5557 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
5558 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
5559 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
5560 vpxor 0+0(%rsi),%ymm0,%ymm0
5561 vpxor 32+0(%rsi),%ymm3,%ymm3
5562 vpxor 64+0(%rsi),%ymm7,%ymm7
5563 vpxor 96+0(%rsi),%ymm11,%ymm11
5564 vmovdqu %ymm0,0+0(%rdi)
5565 vmovdqu %ymm3,32+0(%rdi)
5566 vmovdqu %ymm7,64+0(%rdi)
5567 vmovdqu %ymm11,96+0(%rdi)
5568
5569 vmovdqa 128(%rbp),%ymm0
5570 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
5571 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
5572 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
5573 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
5574 vpxor 0+128(%rsi),%ymm3,%ymm3
5575 vpxor 32+128(%rsi),%ymm2,%ymm2
5576 vpxor 64+128(%rsi),%ymm6,%ymm6
5577 vpxor 96+128(%rsi),%ymm10,%ymm10
5578 vmovdqu %ymm3,0+128(%rdi)
5579 vmovdqu %ymm2,32+128(%rdi)
5580 vmovdqu %ymm6,64+128(%rdi)
5581 vmovdqu %ymm10,96+128(%rdi)
5582 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
5583 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
5584 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
5585 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
5586 vpxor 0+256(%rsi),%ymm3,%ymm3
5587 vpxor 32+256(%rsi),%ymm1,%ymm1
5588 vpxor 64+256(%rsi),%ymm5,%ymm5
5589 vpxor 96+256(%rsi),%ymm9,%ymm9
5590 vmovdqu %ymm3,0+256(%rdi)
5591 vmovdqu %ymm1,32+256(%rdi)
5592 vmovdqu %ymm5,64+256(%rdi)
5593 vmovdqu %ymm9,96+256(%rdi)
5594 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
5595 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
5596 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
5597 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
5598 vmovdqa %ymm3,%ymm8
5599
5600 leaq 384(%rsi),%rsi
5601 leaq 384(%rdi),%rdi
5602 subq $384,%rbx
/*
 * open_avx2_tail_loop: XOR the input with queued 32-byte vectors.
 *   rsi = source pointer, rdi = destination pointer,
 *   rbx = number of bytes still to process.
 *   ymm0, ymm4, ymm8, ymm12 form a queue of 32-byte vectors that are
 *   XORed into the data (presumably ChaCha20 keystream produced by the
 *   code that jumps or falls through to this label).
 * While rbx >= 32, one vector is consumed per iteration; when fewer
 * than 32 bytes remain, control transfers to open_avx2_tail.
 */
5603open_avx2_tail_loop:
5604 cmpq $32,%rbx
5605 jb open_avx2_tail
5606 subq $32,%rbx
/* dst[0..31] = src[0..31] ^ ymm0 */
5607 vpxor (%rsi),%ymm0,%ymm0
5608 vmovdqu %ymm0,(%rdi)
5609 leaq 32(%rsi),%rsi
5610 leaq 32(%rdi),%rdi
/* Advance the queue: ymm4 -> ymm0, ymm8 -> ymm4, ymm12 -> ymm8. */
5611 vmovdqa %ymm4,%ymm0
5612 vmovdqa %ymm8,%ymm4
5613 vmovdqa %ymm12,%ymm8
5614 jmp open_avx2_tail_loop
/*
 * open_avx2_tail: handle the last (fewer than 32) bytes on the AVX2 path.
 * If at least 16 bytes remain, XOR one 16-byte chunk with the low lane of
 * ymm0 and leave the high lane of ymm0 in xmm1; otherwise xmm1 is simply
 * the low lane of ymm0. Control then leaves AVX2 code via vzeroupper and
 * jumps to open_sse_tail_16 (defined outside this excerpt; presumably it
 * consumes xmm1 for the final partial block - confirm against that label).
 */
5615open_avx2_tail:
5616 cmpq $16,%rbx
/* vmovdqa does not modify flags, so the jb below still tests the cmpq. */
5617 vmovdqa %xmm0,%xmm1
5618 jb 1f
5619 subq $16,%rbx
5620
/* dst[0..15] = src[0..15] ^ low lane of ymm0 */
5621 vpxor (%rsi),%xmm0,%xmm1
5622 vmovdqu %xmm1,(%rdi)
5623 leaq 16(%rsi),%rsi
5624 leaq 16(%rdi),%rdi
/* imm 0x11 selects the high 128-bit lane of ymm0 for both result lanes,
   so xmm0 (and then xmm1) now holds the former high half of ymm0. */
5625 vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
5626 vmovdqa %xmm0,%xmm1
56271:
5628 vzeroupper
5629 jmp open_sse_tail_16
5630
/*
 * open_avx2_192: short-input path that runs two 256-bit ChaCha-style
 * states in parallel and then falls through into open_avx2_short.
 *
 * On entry ymm0, ymm4, ymm8, ymm12 hold the four state rows (set up by
 * code outside this excerpt). State A uses ymm0/ymm4/ymm8/ymm12, state B
 * uses ymm1/ymm5/ymm9/ymm13; ymm3 is scratch.
 * Saved copies for the final feed-forward addition:
 *   ymm2 = row 0, ymm6 = row 1, ymm10 = row 2,
 *   ymm11 = row 3 of state A, ymm15 = row 3 of state B.
 */
5631open_avx2_192:
5632 vmovdqa %ymm0,%ymm1
5633 vmovdqa %ymm0,%ymm2
5634 vmovdqa %ymm4,%ymm5
5635 vmovdqa %ymm4,%ymm6
5636 vmovdqa %ymm8,%ymm9
5637 vmovdqa %ymm8,%ymm10
/* State B's row 3 is state A's row 3 plus .avx2_inc (adds 2 to the first
   dword of each 128-bit lane, per the table at the top of the file). */
5638 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
5639 vmovdqa %ymm12,%ymm11
5640 vmovdqa %ymm13,%ymm15
/* r10 = loop counter: 10 iterations of the double round below. */
5641 movq $10,%r10
56421:
/* State A, first half: add / xor / rotate-left by 16, 12, 8, 7
   (16 and 8 via byte shuffles, 12 and 7 via shift pairs and xor). */
5643 vpaddd %ymm4,%ymm0,%ymm0
5644 vpxor %ymm0,%ymm12,%ymm12
5645 vpshufb .rol16(%rip),%ymm12,%ymm12
5646 vpaddd %ymm12,%ymm8,%ymm8
5647 vpxor %ymm8,%ymm4,%ymm4
5648 vpsrld $20,%ymm4,%ymm3
5649 vpslld $12,%ymm4,%ymm4
5650 vpxor %ymm3,%ymm4,%ymm4
5651 vpaddd %ymm4,%ymm0,%ymm0
5652 vpxor %ymm0,%ymm12,%ymm12
5653 vpshufb .rol8(%rip),%ymm12,%ymm12
5654 vpaddd %ymm12,%ymm8,%ymm8
5655 vpxor %ymm8,%ymm4,%ymm4
5656 vpslld $7,%ymm4,%ymm3
5657 vpsrld $25,%ymm4,%ymm4
5658 vpxor %ymm3,%ymm4,%ymm4
/* Rotate rows 3, 2, 1 within each 128-bit lane by 12, 8, 4 bytes so the
   second half of the iteration mixes a different grouping of words. */
5659 vpalignr $12,%ymm12,%ymm12,%ymm12
5660 vpalignr $8,%ymm8,%ymm8,%ymm8
5661 vpalignr $4,%ymm4,%ymm4,%ymm4
/* State B, first half: same sequence on ymm1/ymm5/ymm9/ymm13. */
5662 vpaddd %ymm5,%ymm1,%ymm1
5663 vpxor %ymm1,%ymm13,%ymm13
5664 vpshufb .rol16(%rip),%ymm13,%ymm13
5665 vpaddd %ymm13,%ymm9,%ymm9
5666 vpxor %ymm9,%ymm5,%ymm5
5667 vpsrld $20,%ymm5,%ymm3
5668 vpslld $12,%ymm5,%ymm5
5669 vpxor %ymm3,%ymm5,%ymm5
5670 vpaddd %ymm5,%ymm1,%ymm1
5671 vpxor %ymm1,%ymm13,%ymm13
5672 vpshufb .rol8(%rip),%ymm13,%ymm13
5673 vpaddd %ymm13,%ymm9,%ymm9
5674 vpxor %ymm9,%ymm5,%ymm5
5675 vpslld $7,%ymm5,%ymm3
5676 vpsrld $25,%ymm5,%ymm5
5677 vpxor %ymm3,%ymm5,%ymm5
5678 vpalignr $12,%ymm13,%ymm13,%ymm13
5679 vpalignr $8,%ymm9,%ymm9,%ymm9
5680 vpalignr $4,%ymm5,%ymm5,%ymm5
/* State A, second half: same mixing on the rotated rows. */
5681 vpaddd %ymm4,%ymm0,%ymm0
5682 vpxor %ymm0,%ymm12,%ymm12
5683 vpshufb .rol16(%rip),%ymm12,%ymm12
5684 vpaddd %ymm12,%ymm8,%ymm8
5685 vpxor %ymm8,%ymm4,%ymm4
5686 vpsrld $20,%ymm4,%ymm3
5687 vpslld $12,%ymm4,%ymm4
5688 vpxor %ymm3,%ymm4,%ymm4
5689 vpaddd %ymm4,%ymm0,%ymm0
5690 vpxor %ymm0,%ymm12,%ymm12
5691 vpshufb .rol8(%rip),%ymm12,%ymm12
5692 vpaddd %ymm12,%ymm8,%ymm8
5693 vpxor %ymm8,%ymm4,%ymm4
5694 vpslld $7,%ymm4,%ymm3
5695 vpsrld $25,%ymm4,%ymm4
5696 vpxor %ymm3,%ymm4,%ymm4
/* Undo the lane rotation (4, 8, 12 bytes for rows 3, 2, 1). */
5697 vpalignr $4,%ymm12,%ymm12,%ymm12
5698 vpalignr $8,%ymm8,%ymm8,%ymm8
5699 vpalignr $12,%ymm4,%ymm4,%ymm4
/* State B, second half. */
5700 vpaddd %ymm5,%ymm1,%ymm1
5701 vpxor %ymm1,%ymm13,%ymm13
5702 vpshufb .rol16(%rip),%ymm13,%ymm13
5703 vpaddd %ymm13,%ymm9,%ymm9
5704 vpxor %ymm9,%ymm5,%ymm5
5705 vpsrld $20,%ymm5,%ymm3
5706 vpslld $12,%ymm5,%ymm5
5707 vpxor %ymm3,%ymm5,%ymm5
5708 vpaddd %ymm5,%ymm1,%ymm1
5709 vpxor %ymm1,%ymm13,%ymm13
5710 vpshufb .rol8(%rip),%ymm13,%ymm13
5711 vpaddd %ymm13,%ymm9,%ymm9
5712 vpxor %ymm9,%ymm5,%ymm5
5713 vpslld $7,%ymm5,%ymm3
5714 vpsrld $25,%ymm5,%ymm5
5715 vpxor %ymm3,%ymm5,%ymm5
5716 vpalignr $4,%ymm13,%ymm13,%ymm13
5717 vpalignr $8,%ymm9,%ymm9,%ymm9
5718 vpalignr $12,%ymm5,%ymm5,%ymm5
5719
5720 decq %r10
5721 jne 1b
/* Feed-forward: add the saved input rows back into both states. */
5722 vpaddd %ymm2,%ymm0,%ymm0
5723 vpaddd %ymm2,%ymm1,%ymm1
5724 vpaddd %ymm6,%ymm4,%ymm4
5725 vpaddd %ymm6,%ymm5,%ymm5
5726 vpaddd %ymm10,%ymm8,%ymm8
5727 vpaddd %ymm10,%ymm9,%ymm9
5728 vpaddd %ymm11,%ymm12,%ymm12
5729 vpaddd %ymm15,%ymm13,%ymm13
/* ymm3 = { low lane of ymm0, low lane of ymm4 }: the first 32 output
   bytes of the lowest-numbered block. */
5730 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
5731
/* Mask with .clamp (low 16 bytes clamped, high 16 bytes kept as-is) and
   store at 0(%rbp); the hashing code at open_avx2_hash_and_xor_loop
   reads its multiplier from 0(%rbp) and 8(%rbp). */
5732 vpand .clamp(%rip),%ymm3,%ymm3
5733 vmovdqa %ymm3,0(%rbp)
5734
/* Regroup the remaining lanes into the 32-byte output queue consumed by
   open_avx2_hash_and_xor_loop, in order: ymm0, ymm4, ymm8, ymm12, ymm1,
   ymm5. The low lanes of state A rows 2 and 3 are not carried forward. */
5735 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
5736 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
5737 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
5738 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
5739 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
5740 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
/*
 * open_avx2_short: common continuation for open_avx2_192 (falls through)
 * and open_avx2_320 (jumps here). Calls poly_hash_ad_internal, which is
 * defined outside this excerpt, and then falls through into
 * open_avx2_hash_and_xor_loop.
 */
5741open_avx2_short:
/* Register-to-itself move: no architectural effect. NOTE(review): looks
   like a generator artifact where r8 already holds the helper's argument
   - confirm against poly_hash_ad_internal. */
5742 movq %r8,%r8
5743 call poly_hash_ad_internal
/*
 * open_avx2_hash_and_xor_loop: process 32 input bytes per iteration.
 *   rsi = source, rdi = destination, rbx = bytes remaining.
 *   r10, r11, r12 = three-limb hash accumulator (low to high).
 *   0(%rbp), 8(%rbp) = the two 64-bit multiplier words stored by
 *   open_avx2_192 / open_avx2_320 after masking with .clamp.
 *   r9, r13, r14, r15, rax, rdx are scratch.
 * Each iteration absorbs two 16-byte blocks of the *input* into the
 * accumulator, then XORs the same 32 bytes with ymm0 and shifts the
 * vector queue. Exits to open_avx2_short_tail_32 when rbx < 32.
 */
5744open_avx2_hash_and_xor_loop:
5745 cmpq $32,%rbx
5746 jb open_avx2_short_tail_32
5747 subq $32,%rbx
/* Block 1: acc += src[0..15], plus 1 in the top limb (i.e. + 2^128). */
5748 addq 0(%rsi),%r10
5749 adcq 8+0(%rsi),%r11
5750 adcq $1,%r12
/* acc *= multiplier. Partial products are gathered into four limbs
   r13 (lowest), r14, r15, r9 (highest). */
5751 movq 0+0(%rbp),%rax
5752 movq %rax,%r15
5753 mulq %r10
5754 movq %rax,%r13
5755 movq %rdx,%r14
5756 movq 0+0(%rbp),%rax
5757 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005758 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005759 addq %rax,%r14
5760 adcq %rdx,%r15
5761 movq 8+0(%rbp),%rax
5762 movq %rax,%r9
5763 mulq %r10
5764 addq %rax,%r14
5765 adcq $0,%rdx
5766 movq %rdx,%r10
5767 movq 8+0(%rbp),%rax
5768 mulq %r11
5769 addq %rax,%r15
5770 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005771 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005772 addq %r10,%r15
5773 adcq %rdx,%r9
/* Reduce: keep the low 130 bits in r10, r11, r12 and fold the bits above
   them (hi) back in as 4*hi (r13, r14) plus hi (r15, r9 after the 2-bit
   shift), i.e. 5*hi - reduction modulo 2^130 - 5. */
5774 movq %r13,%r10
5775 movq %r14,%r11
5776 movq %r15,%r12
5777 andq $3,%r12
5778 movq %r15,%r13
5779 andq $-4,%r13
5780 movq %r9,%r14
5781 shrdq $2,%r9,%r15
5782 shrq $2,%r9
5783 addq %r13,%r10
5784 adcq %r14,%r11
5785 adcq $0,%r12
5786 addq %r15,%r10
5787 adcq %r9,%r11
5788 adcq $0,%r12
/* Block 2: same absorb / multiply / reduce on src[16..31]. */
5789 addq 16(%rsi),%r10
5790 adcq 8+16(%rsi),%r11
5791 adcq $1,%r12
5792 movq 0+0(%rbp),%rax
5793 movq %rax,%r15
5794 mulq %r10
5795 movq %rax,%r13
5796 movq %rdx,%r14
5797 movq 0+0(%rbp),%rax
5798 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005799 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005800 addq %rax,%r14
5801 adcq %rdx,%r15
5802 movq 8+0(%rbp),%rax
5803 movq %rax,%r9
5804 mulq %r10
5805 addq %rax,%r14
5806 adcq $0,%rdx
5807 movq %rdx,%r10
5808 movq 8+0(%rbp),%rax
5809 mulq %r11
5810 addq %rax,%r15
5811 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005812 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005813 addq %r10,%r15
5814 adcq %rdx,%r9
5815 movq %r13,%r10
5816 movq %r14,%r11
5817 movq %r15,%r12
5818 andq $3,%r12
5819 movq %r15,%r13
5820 andq $-4,%r13
5821 movq %r9,%r14
5822 shrdq $2,%r9,%r15
5823 shrq $2,%r9
5824 addq %r13,%r10
5825 adcq %r14,%r11
5826 adcq $0,%r12
5827 addq %r15,%r10
5828 adcq %r9,%r11
5829 adcq $0,%r12
5830
5831
/* dst[0..31] = src[0..31] ^ ymm0 (done after hashing, so the hash covers
   the bytes as read from the source). */
5832 vpxor (%rsi),%ymm0,%ymm0
5833 vmovdqu %ymm0,(%rdi)
5834 leaq 32(%rsi),%rsi
5835 leaq 32(%rdi),%rdi
5836
/* Shift the vector queue by one slot:
   ymm0 <- ymm4 <- ymm8 <- ymm12 <- ymm1 <- ymm5 <- ymm9 <- ymm13 <- ymm2 <- ymm6. */
5837 vmovdqa %ymm4,%ymm0
5838 vmovdqa %ymm8,%ymm4
5839 vmovdqa %ymm12,%ymm8
5840 vmovdqa %ymm1,%ymm12
5841 vmovdqa %ymm5,%ymm1
5842 vmovdqa %ymm9,%ymm5
5843 vmovdqa %ymm13,%ymm9
5844 vmovdqa %ymm2,%ymm13
5845 vmovdqa %ymm6,%ymm2
5846 jmp open_avx2_hash_and_xor_loop
/*
 * open_avx2_short_tail_32: fewer than 32 bytes remain on the short path.
 * If at least 16 bytes remain, absorb one 16-byte input block into the
 * hash accumulator (r10, r11, r12), XOR it with the low lane of ymm0 and
 * leave the high lane of ymm0 in xmm1. Otherwise xmm1 is the low lane of
 * ymm0. Then vzeroupper and jump to open_sse_tail_16 (defined outside
 * this excerpt; presumably it uses xmm1 for the final partial block -
 * confirm against that label).
 */
5847open_avx2_short_tail_32:
5848 cmpq $16,%rbx
/* vmovdqa does not modify flags, so the jb below still tests the cmpq. */
5849 vmovdqa %xmm0,%xmm1
5850 jb 1f
5851 subq $16,%rbx
/* acc += src[0..15], plus 1 in the top limb (i.e. + 2^128). */
5852 addq 0(%rsi),%r10
5853 adcq 8+0(%rsi),%r11
5854 adcq $1,%r12
/* acc *= the two 64-bit words at 0(%rbp) and 8(%rbp); the product limbs
   end up in r13 (lowest), r14, r15, r9 (highest). */
5855 movq 0+0(%rbp),%rax
5856 movq %rax,%r15
5857 mulq %r10
5858 movq %rax,%r13
5859 movq %rdx,%r14
5860 movq 0+0(%rbp),%rax
5861 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005862 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005863 addq %rax,%r14
5864 adcq %rdx,%r15
5865 movq 8+0(%rbp),%rax
5866 movq %rax,%r9
5867 mulq %r10
5868 addq %rax,%r14
5869 adcq $0,%rdx
5870 movq %rdx,%r10
5871 movq 8+0(%rbp),%rax
5872 mulq %r11
5873 addq %rax,%r15
5874 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005875 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005876 addq %r10,%r15
5877 adcq %rdx,%r9
/* Reduce: keep the low 130 bits and add back 4*hi + hi for the bits
   above them (reduction modulo 2^130 - 5). */
5878 movq %r13,%r10
5879 movq %r14,%r11
5880 movq %r15,%r12
5881 andq $3,%r12
5882 movq %r15,%r13
5883 andq $-4,%r13
5884 movq %r9,%r14
5885 shrdq $2,%r9,%r15
5886 shrq $2,%r9
5887 addq %r13,%r10
5888 adcq %r14,%r11
5889 adcq $0,%r12
5890 addq %r15,%r10
5891 adcq %r9,%r11
5892 adcq $0,%r12
5893
/* dst[0..15] = src[0..15] ^ low lane of ymm0 (xmm3 is scratch). */
5894 vpxor (%rsi),%xmm0,%xmm3
5895 vmovdqu %xmm3,(%rdi)
5896 leaq 16(%rsi),%rsi
5897 leaq 16(%rdi),%rdi
/* xmm1 = high 128-bit lane of ymm0, for whatever follows the jump. */
5898 vextracti128 $1,%ymm0,%xmm1
58991:
5900 vzeroupper
5901 jmp open_sse_tail_16
5902
/*
 * open_avx2_320: short-input path that runs three 256-bit ChaCha-style
 * states in parallel and then jumps to open_avx2_short.
 *
 * On entry ymm0, ymm4, ymm8, ymm12 hold the four state rows (set up by
 * code outside this excerpt). State A uses ymm0/ymm4/ymm8/ymm12, state B
 * ymm1/ymm5/ymm9/ymm13, state C ymm2/ymm6/ymm10/ymm14; ymm3 is scratch.
 * Saved for the final feed-forward: ymm7 = row 1, ymm11 = row 2, and the
 * three row-3 values at 160(%rbp), 192(%rbp), 224(%rbp). Row 0 is
 * re-added from .chacha20_consts (assumes ymm0 held that constant on
 * entry - confirm against the caller).
 */
5903open_avx2_320:
5904 vmovdqa %ymm0,%ymm1
5905 vmovdqa %ymm0,%ymm2
5906 vmovdqa %ymm4,%ymm5
5907 vmovdqa %ymm4,%ymm6
5908 vmovdqa %ymm8,%ymm9
5909 vmovdqa %ymm8,%ymm10
/* Row 3 of states B and C: each is the previous one plus .avx2_inc
   (adds 2 to the first dword of each 128-bit lane). */
5910 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
5911 vpaddd .avx2_inc(%rip),%ymm13,%ymm14
5912 vmovdqa %ymm4,%ymm7
5913 vmovdqa %ymm8,%ymm11
5914 vmovdqa %ymm12,160(%rbp)
5915 vmovdqa %ymm13,192(%rbp)
5916 vmovdqa %ymm14,224(%rbp)
/* r10 = loop counter: 10 iterations of the double round below. */
5917 movq $10,%r10
59181:
/* State A, first half: add / xor / rotate-left by 16, 12, 8, 7. */
5919 vpaddd %ymm4,%ymm0,%ymm0
5920 vpxor %ymm0,%ymm12,%ymm12
5921 vpshufb .rol16(%rip),%ymm12,%ymm12
5922 vpaddd %ymm12,%ymm8,%ymm8
5923 vpxor %ymm8,%ymm4,%ymm4
5924 vpsrld $20,%ymm4,%ymm3
5925 vpslld $12,%ymm4,%ymm4
5926 vpxor %ymm3,%ymm4,%ymm4
5927 vpaddd %ymm4,%ymm0,%ymm0
5928 vpxor %ymm0,%ymm12,%ymm12
5929 vpshufb .rol8(%rip),%ymm12,%ymm12
5930 vpaddd %ymm12,%ymm8,%ymm8
5931 vpxor %ymm8,%ymm4,%ymm4
5932 vpslld $7,%ymm4,%ymm3
5933 vpsrld $25,%ymm4,%ymm4
5934 vpxor %ymm3,%ymm4,%ymm4
/* Rotate rows 3, 2, 1 within each 128-bit lane by 12, 8, 4 bytes. */
5935 vpalignr $12,%ymm12,%ymm12,%ymm12
5936 vpalignr $8,%ymm8,%ymm8,%ymm8
5937 vpalignr $4,%ymm4,%ymm4,%ymm4
/* State B, first half. */
5938 vpaddd %ymm5,%ymm1,%ymm1
5939 vpxor %ymm1,%ymm13,%ymm13
5940 vpshufb .rol16(%rip),%ymm13,%ymm13
5941 vpaddd %ymm13,%ymm9,%ymm9
5942 vpxor %ymm9,%ymm5,%ymm5
5943 vpsrld $20,%ymm5,%ymm3
5944 vpslld $12,%ymm5,%ymm5
5945 vpxor %ymm3,%ymm5,%ymm5
5946 vpaddd %ymm5,%ymm1,%ymm1
5947 vpxor %ymm1,%ymm13,%ymm13
5948 vpshufb .rol8(%rip),%ymm13,%ymm13
5949 vpaddd %ymm13,%ymm9,%ymm9
5950 vpxor %ymm9,%ymm5,%ymm5
5951 vpslld $7,%ymm5,%ymm3
5952 vpsrld $25,%ymm5,%ymm5
5953 vpxor %ymm3,%ymm5,%ymm5
5954 vpalignr $12,%ymm13,%ymm13,%ymm13
5955 vpalignr $8,%ymm9,%ymm9,%ymm9
5956 vpalignr $4,%ymm5,%ymm5,%ymm5
/* State C, first half. */
5957 vpaddd %ymm6,%ymm2,%ymm2
5958 vpxor %ymm2,%ymm14,%ymm14
5959 vpshufb .rol16(%rip),%ymm14,%ymm14
5960 vpaddd %ymm14,%ymm10,%ymm10
5961 vpxor %ymm10,%ymm6,%ymm6
5962 vpsrld $20,%ymm6,%ymm3
5963 vpslld $12,%ymm6,%ymm6
5964 vpxor %ymm3,%ymm6,%ymm6
5965 vpaddd %ymm6,%ymm2,%ymm2
5966 vpxor %ymm2,%ymm14,%ymm14
5967 vpshufb .rol8(%rip),%ymm14,%ymm14
5968 vpaddd %ymm14,%ymm10,%ymm10
5969 vpxor %ymm10,%ymm6,%ymm6
5970 vpslld $7,%ymm6,%ymm3
5971 vpsrld $25,%ymm6,%ymm6
5972 vpxor %ymm3,%ymm6,%ymm6
5973 vpalignr $12,%ymm14,%ymm14,%ymm14
5974 vpalignr $8,%ymm10,%ymm10,%ymm10
5975 vpalignr $4,%ymm6,%ymm6,%ymm6
/* State A, second half: same mixing on the rotated rows, then the lane
   rotation is undone (4, 8, 12 bytes for rows 3, 2, 1). */
5976 vpaddd %ymm4,%ymm0,%ymm0
5977 vpxor %ymm0,%ymm12,%ymm12
5978 vpshufb .rol16(%rip),%ymm12,%ymm12
5979 vpaddd %ymm12,%ymm8,%ymm8
5980 vpxor %ymm8,%ymm4,%ymm4
5981 vpsrld $20,%ymm4,%ymm3
5982 vpslld $12,%ymm4,%ymm4
5983 vpxor %ymm3,%ymm4,%ymm4
5984 vpaddd %ymm4,%ymm0,%ymm0
5985 vpxor %ymm0,%ymm12,%ymm12
5986 vpshufb .rol8(%rip),%ymm12,%ymm12
5987 vpaddd %ymm12,%ymm8,%ymm8
5988 vpxor %ymm8,%ymm4,%ymm4
5989 vpslld $7,%ymm4,%ymm3
5990 vpsrld $25,%ymm4,%ymm4
5991 vpxor %ymm3,%ymm4,%ymm4
5992 vpalignr $4,%ymm12,%ymm12,%ymm12
5993 vpalignr $8,%ymm8,%ymm8,%ymm8
5994 vpalignr $12,%ymm4,%ymm4,%ymm4
/* State B, second half. */
5995 vpaddd %ymm5,%ymm1,%ymm1
5996 vpxor %ymm1,%ymm13,%ymm13
5997 vpshufb .rol16(%rip),%ymm13,%ymm13
5998 vpaddd %ymm13,%ymm9,%ymm9
5999 vpxor %ymm9,%ymm5,%ymm5
6000 vpsrld $20,%ymm5,%ymm3
6001 vpslld $12,%ymm5,%ymm5
6002 vpxor %ymm3,%ymm5,%ymm5
6003 vpaddd %ymm5,%ymm1,%ymm1
6004 vpxor %ymm1,%ymm13,%ymm13
6005 vpshufb .rol8(%rip),%ymm13,%ymm13
6006 vpaddd %ymm13,%ymm9,%ymm9
6007 vpxor %ymm9,%ymm5,%ymm5
6008 vpslld $7,%ymm5,%ymm3
6009 vpsrld $25,%ymm5,%ymm5
6010 vpxor %ymm3,%ymm5,%ymm5
6011 vpalignr $4,%ymm13,%ymm13,%ymm13
6012 vpalignr $8,%ymm9,%ymm9,%ymm9
6013 vpalignr $12,%ymm5,%ymm5,%ymm5
/* State C, second half. */
6014 vpaddd %ymm6,%ymm2,%ymm2
6015 vpxor %ymm2,%ymm14,%ymm14
6016 vpshufb .rol16(%rip),%ymm14,%ymm14
6017 vpaddd %ymm14,%ymm10,%ymm10
6018 vpxor %ymm10,%ymm6,%ymm6
6019 vpsrld $20,%ymm6,%ymm3
6020 vpslld $12,%ymm6,%ymm6
6021 vpxor %ymm3,%ymm6,%ymm6
6022 vpaddd %ymm6,%ymm2,%ymm2
6023 vpxor %ymm2,%ymm14,%ymm14
6024 vpshufb .rol8(%rip),%ymm14,%ymm14
6025 vpaddd %ymm14,%ymm10,%ymm10
6026 vpxor %ymm10,%ymm6,%ymm6
6027 vpslld $7,%ymm6,%ymm3
6028 vpsrld $25,%ymm6,%ymm6
6029 vpxor %ymm3,%ymm6,%ymm6
6030 vpalignr $4,%ymm14,%ymm14,%ymm14
6031 vpalignr $8,%ymm10,%ymm10,%ymm10
6032 vpalignr $12,%ymm6,%ymm6,%ymm6
6033
6034 decq %r10
6035 jne 1b
/* Feed-forward: add the input rows back into all three states. */
6036 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6037 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6038 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6039 vpaddd %ymm7,%ymm4,%ymm4
6040 vpaddd %ymm7,%ymm5,%ymm5
6041 vpaddd %ymm7,%ymm6,%ymm6
6042 vpaddd %ymm11,%ymm8,%ymm8
6043 vpaddd %ymm11,%ymm9,%ymm9
6044 vpaddd %ymm11,%ymm10,%ymm10
6045 vpaddd 160(%rbp),%ymm12,%ymm12
6046 vpaddd 192(%rbp),%ymm13,%ymm13
6047 vpaddd 224(%rbp),%ymm14,%ymm14
/* ymm3 = { low lane of ymm0, low lane of ymm4 }: the first 32 output
   bytes of the lowest-numbered block. */
6048 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
6049
/* Mask with .clamp (low 16 bytes clamped, high 16 bytes kept as-is) and
   store at 0(%rbp), where the hashing code reads its multiplier. */
6050 vpand .clamp(%rip),%ymm3,%ymm3
6051 vmovdqa %ymm3,0(%rbp)
6052
/* Regroup the remaining lanes into the 32-byte output queue, in order:
   ymm0, ymm4, ymm8, ymm12, ymm1, ymm5, ymm9, ymm13, ymm2, ymm6.
   The low lanes of state A rows 2 and 3 are not carried forward. */
6053 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
6054 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
6055 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
6056 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
6057 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
6058 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
6059 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
6060 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
6061 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
6062 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
6063 jmp open_avx2_short
6064
6065
6066
6067
6068.p2align 6
6069chacha20_poly1305_seal_avx2:
6070 vzeroupper
6071 vmovdqa .chacha20_consts(%rip),%ymm0
6072 vbroadcasti128 0(%r9),%ymm4
6073 vbroadcasti128 16(%r9),%ymm8
6074 vbroadcasti128 32(%r9),%ymm12
6075 vpaddd .avx2_init(%rip),%ymm12,%ymm12
6076 cmpq $192,%rbx
6077 jbe seal_avx2_192
6078 cmpq $320,%rbx
6079 jbe seal_avx2_320
6080 vmovdqa %ymm0,%ymm1
6081 vmovdqa %ymm0,%ymm2
6082 vmovdqa %ymm0,%ymm3
6083 vmovdqa %ymm4,%ymm5
6084 vmovdqa %ymm4,%ymm6
6085 vmovdqa %ymm4,%ymm7
6086 vmovdqa %ymm4,64(%rbp)
6087 vmovdqa %ymm8,%ymm9
6088 vmovdqa %ymm8,%ymm10
6089 vmovdqa %ymm8,%ymm11
6090 vmovdqa %ymm8,96(%rbp)
6091 vmovdqa %ymm12,%ymm15
6092 vpaddd .avx2_inc(%rip),%ymm15,%ymm14
6093 vpaddd .avx2_inc(%rip),%ymm14,%ymm13
6094 vpaddd .avx2_inc(%rip),%ymm13,%ymm12
6095 vmovdqa %ymm12,160(%rbp)
6096 vmovdqa %ymm13,192(%rbp)
6097 vmovdqa %ymm14,224(%rbp)
6098 vmovdqa %ymm15,256(%rbp)
6099 movq $10,%r10
61001:
6101 vmovdqa %ymm8,128(%rbp)
6102 vmovdqa .rol16(%rip),%ymm8
6103 vpaddd %ymm7,%ymm3,%ymm3
6104 vpaddd %ymm6,%ymm2,%ymm2
6105 vpaddd %ymm5,%ymm1,%ymm1
6106 vpaddd %ymm4,%ymm0,%ymm0
6107 vpxor %ymm3,%ymm15,%ymm15
6108 vpxor %ymm2,%ymm14,%ymm14
6109 vpxor %ymm1,%ymm13,%ymm13
6110 vpxor %ymm0,%ymm12,%ymm12
6111 vpshufb %ymm8,%ymm15,%ymm15
6112 vpshufb %ymm8,%ymm14,%ymm14
6113 vpshufb %ymm8,%ymm13,%ymm13
6114 vpshufb %ymm8,%ymm12,%ymm12
6115 vmovdqa 128(%rbp),%ymm8
6116 vpaddd %ymm15,%ymm11,%ymm11
6117 vpaddd %ymm14,%ymm10,%ymm10
6118 vpaddd %ymm13,%ymm9,%ymm9
6119 vpaddd %ymm12,%ymm8,%ymm8
6120 vpxor %ymm11,%ymm7,%ymm7
6121 vpxor %ymm10,%ymm6,%ymm6
6122 vpxor %ymm9,%ymm5,%ymm5
6123 vpxor %ymm8,%ymm4,%ymm4
6124 vmovdqa %ymm8,128(%rbp)
6125 vpsrld $20,%ymm7,%ymm8
6126 vpslld $32-20,%ymm7,%ymm7
6127 vpxor %ymm8,%ymm7,%ymm7
6128 vpsrld $20,%ymm6,%ymm8
6129 vpslld $32-20,%ymm6,%ymm6
6130 vpxor %ymm8,%ymm6,%ymm6
6131 vpsrld $20,%ymm5,%ymm8
6132 vpslld $32-20,%ymm5,%ymm5
6133 vpxor %ymm8,%ymm5,%ymm5
6134 vpsrld $20,%ymm4,%ymm8
6135 vpslld $32-20,%ymm4,%ymm4
6136 vpxor %ymm8,%ymm4,%ymm4
6137 vmovdqa .rol8(%rip),%ymm8
6138 vpaddd %ymm7,%ymm3,%ymm3
6139 vpaddd %ymm6,%ymm2,%ymm2
6140 vpaddd %ymm5,%ymm1,%ymm1
6141 vpaddd %ymm4,%ymm0,%ymm0
6142 vpxor %ymm3,%ymm15,%ymm15
6143 vpxor %ymm2,%ymm14,%ymm14
6144 vpxor %ymm1,%ymm13,%ymm13
6145 vpxor %ymm0,%ymm12,%ymm12
6146 vpshufb %ymm8,%ymm15,%ymm15
6147 vpshufb %ymm8,%ymm14,%ymm14
6148 vpshufb %ymm8,%ymm13,%ymm13
6149 vpshufb %ymm8,%ymm12,%ymm12
6150 vmovdqa 128(%rbp),%ymm8
6151 vpaddd %ymm15,%ymm11,%ymm11
6152 vpaddd %ymm14,%ymm10,%ymm10
6153 vpaddd %ymm13,%ymm9,%ymm9
6154 vpaddd %ymm12,%ymm8,%ymm8
6155 vpxor %ymm11,%ymm7,%ymm7
6156 vpxor %ymm10,%ymm6,%ymm6
6157 vpxor %ymm9,%ymm5,%ymm5
6158 vpxor %ymm8,%ymm4,%ymm4
6159 vmovdqa %ymm8,128(%rbp)
6160 vpsrld $25,%ymm7,%ymm8
6161 vpslld $32-25,%ymm7,%ymm7
6162 vpxor %ymm8,%ymm7,%ymm7
6163 vpsrld $25,%ymm6,%ymm8
6164 vpslld $32-25,%ymm6,%ymm6
6165 vpxor %ymm8,%ymm6,%ymm6
6166 vpsrld $25,%ymm5,%ymm8
6167 vpslld $32-25,%ymm5,%ymm5
6168 vpxor %ymm8,%ymm5,%ymm5
6169 vpsrld $25,%ymm4,%ymm8
6170 vpslld $32-25,%ymm4,%ymm4
6171 vpxor %ymm8,%ymm4,%ymm4
6172 vmovdqa 128(%rbp),%ymm8
6173 vpalignr $4,%ymm7,%ymm7,%ymm7
6174 vpalignr $8,%ymm11,%ymm11,%ymm11
6175 vpalignr $12,%ymm15,%ymm15,%ymm15
6176 vpalignr $4,%ymm6,%ymm6,%ymm6
6177 vpalignr $8,%ymm10,%ymm10,%ymm10
6178 vpalignr $12,%ymm14,%ymm14,%ymm14
6179 vpalignr $4,%ymm5,%ymm5,%ymm5
6180 vpalignr $8,%ymm9,%ymm9,%ymm9
6181 vpalignr $12,%ymm13,%ymm13,%ymm13
6182 vpalignr $4,%ymm4,%ymm4,%ymm4
6183 vpalignr $8,%ymm8,%ymm8,%ymm8
6184 vpalignr $12,%ymm12,%ymm12,%ymm12
6185 vmovdqa %ymm8,128(%rbp)
6186 vmovdqa .rol16(%rip),%ymm8
6187 vpaddd %ymm7,%ymm3,%ymm3
6188 vpaddd %ymm6,%ymm2,%ymm2
6189 vpaddd %ymm5,%ymm1,%ymm1
6190 vpaddd %ymm4,%ymm0,%ymm0
6191 vpxor %ymm3,%ymm15,%ymm15
6192 vpxor %ymm2,%ymm14,%ymm14
6193 vpxor %ymm1,%ymm13,%ymm13
6194 vpxor %ymm0,%ymm12,%ymm12
6195 vpshufb %ymm8,%ymm15,%ymm15
6196 vpshufb %ymm8,%ymm14,%ymm14
6197 vpshufb %ymm8,%ymm13,%ymm13
6198 vpshufb %ymm8,%ymm12,%ymm12
6199 vmovdqa 128(%rbp),%ymm8
6200 vpaddd %ymm15,%ymm11,%ymm11
6201 vpaddd %ymm14,%ymm10,%ymm10
6202 vpaddd %ymm13,%ymm9,%ymm9
6203 vpaddd %ymm12,%ymm8,%ymm8
6204 vpxor %ymm11,%ymm7,%ymm7
6205 vpxor %ymm10,%ymm6,%ymm6
6206 vpxor %ymm9,%ymm5,%ymm5
6207 vpxor %ymm8,%ymm4,%ymm4
6208 vmovdqa %ymm8,128(%rbp)
6209 vpsrld $20,%ymm7,%ymm8
6210 vpslld $32-20,%ymm7,%ymm7
6211 vpxor %ymm8,%ymm7,%ymm7
6212 vpsrld $20,%ymm6,%ymm8
6213 vpslld $32-20,%ymm6,%ymm6
6214 vpxor %ymm8,%ymm6,%ymm6
6215 vpsrld $20,%ymm5,%ymm8
6216 vpslld $32-20,%ymm5,%ymm5
6217 vpxor %ymm8,%ymm5,%ymm5
6218 vpsrld $20,%ymm4,%ymm8
6219 vpslld $32-20,%ymm4,%ymm4
6220 vpxor %ymm8,%ymm4,%ymm4
6221 vmovdqa .rol8(%rip),%ymm8
6222 vpaddd %ymm7,%ymm3,%ymm3
6223 vpaddd %ymm6,%ymm2,%ymm2
6224 vpaddd %ymm5,%ymm1,%ymm1
6225 vpaddd %ymm4,%ymm0,%ymm0
6226 vpxor %ymm3,%ymm15,%ymm15
6227 vpxor %ymm2,%ymm14,%ymm14
6228 vpxor %ymm1,%ymm13,%ymm13
6229 vpxor %ymm0,%ymm12,%ymm12
6230 vpshufb %ymm8,%ymm15,%ymm15
6231 vpshufb %ymm8,%ymm14,%ymm14
6232 vpshufb %ymm8,%ymm13,%ymm13
6233 vpshufb %ymm8,%ymm12,%ymm12
6234 vmovdqa 128(%rbp),%ymm8
6235 vpaddd %ymm15,%ymm11,%ymm11
6236 vpaddd %ymm14,%ymm10,%ymm10
6237 vpaddd %ymm13,%ymm9,%ymm9
6238 vpaddd %ymm12,%ymm8,%ymm8
6239 vpxor %ymm11,%ymm7,%ymm7
6240 vpxor %ymm10,%ymm6,%ymm6
6241 vpxor %ymm9,%ymm5,%ymm5
6242 vpxor %ymm8,%ymm4,%ymm4
6243 vmovdqa %ymm8,128(%rbp)
6244 vpsrld $25,%ymm7,%ymm8
6245 vpslld $32-25,%ymm7,%ymm7
6246 vpxor %ymm8,%ymm7,%ymm7
6247 vpsrld $25,%ymm6,%ymm8
6248 vpslld $32-25,%ymm6,%ymm6
6249 vpxor %ymm8,%ymm6,%ymm6
6250 vpsrld $25,%ymm5,%ymm8
6251 vpslld $32-25,%ymm5,%ymm5
6252 vpxor %ymm8,%ymm5,%ymm5
6253 vpsrld $25,%ymm4,%ymm8
6254 vpslld $32-25,%ymm4,%ymm4
6255 vpxor %ymm8,%ymm4,%ymm4
6256 vmovdqa 128(%rbp),%ymm8
6257 vpalignr $12,%ymm7,%ymm7,%ymm7
6258 vpalignr $8,%ymm11,%ymm11,%ymm11
6259 vpalignr $4,%ymm15,%ymm15,%ymm15
6260 vpalignr $12,%ymm6,%ymm6,%ymm6
6261 vpalignr $8,%ymm10,%ymm10,%ymm10
6262 vpalignr $4,%ymm14,%ymm14,%ymm14
6263 vpalignr $12,%ymm5,%ymm5,%ymm5
6264 vpalignr $8,%ymm9,%ymm9,%ymm9
6265 vpalignr $4,%ymm13,%ymm13,%ymm13
6266 vpalignr $12,%ymm4,%ymm4,%ymm4
6267 vpalignr $8,%ymm8,%ymm8,%ymm8
6268 vpalignr $4,%ymm12,%ymm12,%ymm12
6269
6270 decq %r10
6271 jnz 1b
# Finish the four interleaved ChaCha20 states. State i is held in ymm(i),
# ymm(4+i), ymm(8+i) and ymm(12+i) (rows a, b, c, d). Add back the saved input
# rows: .chacha20_consts, 64(%rbp), 96(%rbp), and the per-state row d kept at
# 256/224/192/160(%rbp) for states 3/2/1/0.
# NOTE(review): the leading decimal numbers (and the author/date text on some
# lines) in this file are not assembler syntax; they look like residue from a
# blame-view export and are left untouched here.
6272 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
6273 vpaddd 64(%rbp),%ymm7,%ymm7
6274 vpaddd 96(%rbp),%ymm11,%ymm11
6275 vpaddd 256(%rbp),%ymm15,%ymm15
6276 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6277 vpaddd 64(%rbp),%ymm6,%ymm6
6278 vpaddd 96(%rbp),%ymm10,%ymm10
6279 vpaddd 224(%rbp),%ymm14,%ymm14
6280 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6281 vpaddd 64(%rbp),%ymm5,%ymm5
6282 vpaddd 96(%rbp),%ymm9,%ymm9
6283 vpaddd 192(%rbp),%ymm13,%ymm13
6284 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6285 vpaddd 64(%rbp),%ymm4,%ymm4
6286 vpaddd 96(%rbp),%ymm8,%ymm8
6287 vpaddd 160(%rbp),%ymm12,%ymm12
6288
# Split state 3 by 128-bit lane. ymm15 receives the low lanes of rows a and b
# (ymm3, ymm7); ymm3 and ymm11 receive the high lanes of rows a,b and c,d.
# The .clamp mask only clears bits in the low 16 bytes of ymm15 (its upper 16
# bytes are all ones). The 32-byte result is stored at 0(%rbp); the scalar
# multiply steps in this file read their multiplier from 0(%rbp) and 8(%rbp).
# NOTE(review): presumably this is the one-time Poly1305 key - confirm.
6289 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
6290 vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
6291 vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
6292 vpand .clamp(%rip),%ymm15,%ymm15
6293 vmovdqa %ymm15,0(%rbp)
# Register moved onto itself: no architectural effect.
# NOTE(review): looks like a code-generator artifact - confirm.
6294 movq %r8,%r8
# poly_hash_ad_internal is defined outside this chunk. By its name it hashes
# the additional data - TODO confirm its inputs and clobbers.
6295 call poly_hash_ad_internal
6296
# XOR input bytes 0..63 at (%rsi) with the remaining (high-lane) half of
# state 3 and store them at (%rdi).
6297 vpxor 0(%rsi),%ymm3,%ymm3
6298 vpxor 32(%rsi),%ymm11,%ymm11
6299 vmovdqu %ymm3,0(%rdi)
6300 vmovdqu %ymm11,32(%rdi)
# State 2: regroup the rows lane by lane into four 32-byte keystream pieces
# (ymm15, ymm2, ymm6, ymm10 in stream order), XOR with input bytes 64..191 and
# store at the same offsets of the output.
6301 vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
6302 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
6303 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
6304 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
6305 vpxor 0+64(%rsi),%ymm15,%ymm15
6306 vpxor 32+64(%rsi),%ymm2,%ymm2
6307 vpxor 64+64(%rsi),%ymm6,%ymm6
6308 vpxor 96+64(%rsi),%ymm10,%ymm10
6309 vmovdqu %ymm15,0+64(%rdi)
6310 vmovdqu %ymm2,32+64(%rdi)
6311 vmovdqu %ymm6,64+64(%rdi)
6312 vmovdqu %ymm10,96+64(%rdi)
# State 1: same regrouping, covering bytes 192..319.
6313 vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
6314 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
6315 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
6316 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
6317 vpxor 0+192(%rsi),%ymm15,%ymm15
6318 vpxor 32+192(%rsi),%ymm1,%ymm1
6319 vpxor 64+192(%rsi),%ymm5,%ymm5
6320 vpxor 96+192(%rsi),%ymm9,%ymm9
6321 vmovdqu %ymm15,0+192(%rdi)
6322 vmovdqu %ymm1,32+192(%rdi)
6323 vmovdqu %ymm5,64+192(%rdi)
6324 vmovdqu %ymm9,96+192(%rdi)
# State 0: regroup only. The 128 keystream bytes stay in registers in stream
# order ymm0, ymm4, ymm8, ymm12 (ymm15 is a temporary for the third piece).
6325 vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
6326 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
6327 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
6328 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
6329 vmovdqa %ymm15,%ymm8
6330
# Account for the 320 bytes just produced: advance the input pointer, reduce
# the remaining length in rbx, and set rcx = 320.
6331 leaq 320(%rsi),%rsi
6332 subq $320,%rbx
6333 movq $320,%rcx
# At most 128 bytes left (unsigned compare): continue at seal_avx2_hash, which
# is defined outside this chunk.
6334 cmpq $128,%rbx
6335 jbe seal_avx2_hash
# More than 128 bytes left: spend the 128 keystream bytes of state 0 now.
# rsi has already been advanced by 320 while rdi has not, hence input offsets
# 0..127 against output offsets 320..447.
6336 vpxor 0(%rsi),%ymm0,%ymm0
6337 vpxor 32(%rsi),%ymm4,%ymm4
6338 vpxor 64(%rsi),%ymm8,%ymm8
6339 vpxor 96(%rsi),%ymm12,%ymm12
6340 vmovdqu %ymm0,320(%rdi)
6341 vmovdqu %ymm4,352(%rdi)
6342 vmovdqu %ymm8,384(%rdi)
6343 vmovdqu %ymm12,416(%rdi)
6344 leaq 128(%rsi),%rsi
6345 subq $128,%rbx
# Preset rcx = 8 and r8 = 2; seal_avx2_tail_128 uses these two registers as
# its loop counters. Then dispatch on the remaining length (unsigned):
# <=128, <=256, <=384 and <=512 go to the matching tail label, anything
# larger falls through to the code that follows.
6346 movq $8,%rcx
6347 movq $2,%r8
6348 cmpq $128,%rbx
6349 jbe seal_avx2_tail_128
6350 cmpq $256,%rbx
6351 jbe seal_avx2_tail_256
6352 cmpq $384,%rbx
6353 jbe seal_avx2_tail_384
6354 cmpq $512,%rbx
6355 jbe seal_avx2_tail_512
# Reached by fall-through when more than 512 bytes remain. Builds four fresh
# ChaCha20 states and runs one double round plus the first part of the next
# column round with no hashing interleaved, then jumps into the middle of the
# main loop body (local label 4) with rcx = 9 and rdi lowered by 16.
#
# State setup: rows a, b, c of every state are .chacha20_consts, 64(%rbp) and
# 96(%rbp). Row d is derived from 160(%rbp) by repeatedly adding .avx2_inc
# (which adds 2 to the first dword of each 128-bit lane): ymm15 gets one
# increment, ymm14 two, ymm13 three, ymm12 four. The four row-d values are
# saved at 256/224/192/160(%rbp) so they can be added back after the rounds.
6356 vmovdqa .chacha20_consts(%rip),%ymm0
6357 vmovdqa 64(%rbp),%ymm4
6358 vmovdqa 96(%rbp),%ymm8
6359 vmovdqa %ymm0,%ymm1
6360 vmovdqa %ymm4,%ymm5
6361 vmovdqa %ymm8,%ymm9
6362 vmovdqa %ymm0,%ymm2
6363 vmovdqa %ymm4,%ymm6
6364 vmovdqa %ymm8,%ymm10
6365 vmovdqa %ymm0,%ymm3
6366 vmovdqa %ymm4,%ymm7
6367 vmovdqa %ymm8,%ymm11
6368 vmovdqa .avx2_inc(%rip),%ymm12
6369 vpaddd 160(%rbp),%ymm12,%ymm15
6370 vpaddd %ymm15,%ymm12,%ymm14
6371 vpaddd %ymm14,%ymm12,%ymm13
6372 vpaddd %ymm13,%ymm12,%ymm12
6373 vmovdqa %ymm15,256(%rbp)
6374 vmovdqa %ymm14,224(%rbp)
6375 vmovdqa %ymm13,192(%rbp)
6376 vmovdqa %ymm12,160(%rbp)
# Column round on all four states. ymm8 (row c of state 0) is parked at
# 128(%rbp) whenever ymm8 is needed for a shuffle mask or as shift scratch.
# Step 1: a += b; d ^= a; d rotated left 16 via byte shuffle with .rol16.
6377 vmovdqa %ymm8,128(%rbp)
6378 vmovdqa .rol16(%rip),%ymm8
6379 vpaddd %ymm7,%ymm3,%ymm3
6380 vpaddd %ymm6,%ymm2,%ymm2
6381 vpaddd %ymm5,%ymm1,%ymm1
6382 vpaddd %ymm4,%ymm0,%ymm0
6383 vpxor %ymm3,%ymm15,%ymm15
6384 vpxor %ymm2,%ymm14,%ymm14
6385 vpxor %ymm1,%ymm13,%ymm13
6386 vpxor %ymm0,%ymm12,%ymm12
6387 vpshufb %ymm8,%ymm15,%ymm15
6388 vpshufb %ymm8,%ymm14,%ymm14
6389 vpshufb %ymm8,%ymm13,%ymm13
6390 vpshufb %ymm8,%ymm12,%ymm12
# Step 2: c += d; b ^= c; b rotated left 12 (shift right 20, shift left 12, xor).
6391 vmovdqa 128(%rbp),%ymm8
6392 vpaddd %ymm15,%ymm11,%ymm11
6393 vpaddd %ymm14,%ymm10,%ymm10
6394 vpaddd %ymm13,%ymm9,%ymm9
6395 vpaddd %ymm12,%ymm8,%ymm8
6396 vpxor %ymm11,%ymm7,%ymm7
6397 vpxor %ymm10,%ymm6,%ymm6
6398 vpxor %ymm9,%ymm5,%ymm5
6399 vpxor %ymm8,%ymm4,%ymm4
6400 vmovdqa %ymm8,128(%rbp)
6401 vpsrld $20,%ymm7,%ymm8
6402 vpslld $32-20,%ymm7,%ymm7
6403 vpxor %ymm8,%ymm7,%ymm7
6404 vpsrld $20,%ymm6,%ymm8
6405 vpslld $32-20,%ymm6,%ymm6
6406 vpxor %ymm8,%ymm6,%ymm6
6407 vpsrld $20,%ymm5,%ymm8
6408 vpslld $32-20,%ymm5,%ymm5
6409 vpxor %ymm8,%ymm5,%ymm5
6410 vpsrld $20,%ymm4,%ymm8
6411 vpslld $32-20,%ymm4,%ymm4
6412 vpxor %ymm8,%ymm4,%ymm4
# Step 3: a += b; d ^= a; d rotated left 8 via byte shuffle with .rol8.
6413 vmovdqa .rol8(%rip),%ymm8
6414 vpaddd %ymm7,%ymm3,%ymm3
6415 vpaddd %ymm6,%ymm2,%ymm2
6416 vpaddd %ymm5,%ymm1,%ymm1
6417 vpaddd %ymm4,%ymm0,%ymm0
6418 vpxor %ymm3,%ymm15,%ymm15
6419 vpxor %ymm2,%ymm14,%ymm14
6420 vpxor %ymm1,%ymm13,%ymm13
6421 vpxor %ymm0,%ymm12,%ymm12
6422 vpshufb %ymm8,%ymm15,%ymm15
6423 vpshufb %ymm8,%ymm14,%ymm14
6424 vpshufb %ymm8,%ymm13,%ymm13
6425 vpshufb %ymm8,%ymm12,%ymm12
# Step 4: c += d; b ^= c; b rotated left 7 (shift right 25, shift left 7, xor).
6426 vmovdqa 128(%rbp),%ymm8
6427 vpaddd %ymm15,%ymm11,%ymm11
6428 vpaddd %ymm14,%ymm10,%ymm10
6429 vpaddd %ymm13,%ymm9,%ymm9
6430 vpaddd %ymm12,%ymm8,%ymm8
6431 vpxor %ymm11,%ymm7,%ymm7
6432 vpxor %ymm10,%ymm6,%ymm6
6433 vpxor %ymm9,%ymm5,%ymm5
6434 vpxor %ymm8,%ymm4,%ymm4
6435 vmovdqa %ymm8,128(%rbp)
6436 vpsrld $25,%ymm7,%ymm8
6437 vpslld $32-25,%ymm7,%ymm7
6438 vpxor %ymm8,%ymm7,%ymm7
6439 vpsrld $25,%ymm6,%ymm8
6440 vpslld $32-25,%ymm6,%ymm6
6441 vpxor %ymm8,%ymm6,%ymm6
6442 vpsrld $25,%ymm5,%ymm8
6443 vpslld $32-25,%ymm5,%ymm5
6444 vpxor %ymm8,%ymm5,%ymm5
6445 vpsrld $25,%ymm4,%ymm8
6446 vpslld $32-25,%ymm4,%ymm4
6447 vpxor %ymm8,%ymm4,%ymm4
# Rotate rows b, c, d within each 128-bit lane (vpalignr by 4, 8, 12 bytes) so
# that the next quarter-round pass works on the diagonals.
6448 vmovdqa 128(%rbp),%ymm8
6449 vpalignr $4,%ymm7,%ymm7,%ymm7
6450 vpalignr $8,%ymm11,%ymm11,%ymm11
6451 vpalignr $12,%ymm15,%ymm15,%ymm15
6452 vpalignr $4,%ymm6,%ymm6,%ymm6
6453 vpalignr $8,%ymm10,%ymm10,%ymm10
6454 vpalignr $12,%ymm14,%ymm14,%ymm14
6455 vpalignr $4,%ymm5,%ymm5,%ymm5
6456 vpalignr $8,%ymm9,%ymm9,%ymm9
6457 vpalignr $12,%ymm13,%ymm13,%ymm13
6458 vpalignr $4,%ymm4,%ymm4,%ymm4
6459 vpalignr $8,%ymm8,%ymm8,%ymm8
6460 vpalignr $12,%ymm12,%ymm12,%ymm12
# Diagonal round: the same four steps (rotations 16, 12, 8, 7) as above.
6461 vmovdqa %ymm8,128(%rbp)
6462 vmovdqa .rol16(%rip),%ymm8
6463 vpaddd %ymm7,%ymm3,%ymm3
6464 vpaddd %ymm6,%ymm2,%ymm2
6465 vpaddd %ymm5,%ymm1,%ymm1
6466 vpaddd %ymm4,%ymm0,%ymm0
6467 vpxor %ymm3,%ymm15,%ymm15
6468 vpxor %ymm2,%ymm14,%ymm14
6469 vpxor %ymm1,%ymm13,%ymm13
6470 vpxor %ymm0,%ymm12,%ymm12
6471 vpshufb %ymm8,%ymm15,%ymm15
6472 vpshufb %ymm8,%ymm14,%ymm14
6473 vpshufb %ymm8,%ymm13,%ymm13
6474 vpshufb %ymm8,%ymm12,%ymm12
6475 vmovdqa 128(%rbp),%ymm8
6476 vpaddd %ymm15,%ymm11,%ymm11
6477 vpaddd %ymm14,%ymm10,%ymm10
6478 vpaddd %ymm13,%ymm9,%ymm9
6479 vpaddd %ymm12,%ymm8,%ymm8
6480 vpxor %ymm11,%ymm7,%ymm7
6481 vpxor %ymm10,%ymm6,%ymm6
6482 vpxor %ymm9,%ymm5,%ymm5
6483 vpxor %ymm8,%ymm4,%ymm4
6484 vmovdqa %ymm8,128(%rbp)
6485 vpsrld $20,%ymm7,%ymm8
6486 vpslld $32-20,%ymm7,%ymm7
6487 vpxor %ymm8,%ymm7,%ymm7
6488 vpsrld $20,%ymm6,%ymm8
6489 vpslld $32-20,%ymm6,%ymm6
6490 vpxor %ymm8,%ymm6,%ymm6
6491 vpsrld $20,%ymm5,%ymm8
6492 vpslld $32-20,%ymm5,%ymm5
6493 vpxor %ymm8,%ymm5,%ymm5
6494 vpsrld $20,%ymm4,%ymm8
6495 vpslld $32-20,%ymm4,%ymm4
6496 vpxor %ymm8,%ymm4,%ymm4
6497 vmovdqa .rol8(%rip),%ymm8
6498 vpaddd %ymm7,%ymm3,%ymm3
6499 vpaddd %ymm6,%ymm2,%ymm2
6500 vpaddd %ymm5,%ymm1,%ymm1
6501 vpaddd %ymm4,%ymm0,%ymm0
6502 vpxor %ymm3,%ymm15,%ymm15
6503 vpxor %ymm2,%ymm14,%ymm14
6504 vpxor %ymm1,%ymm13,%ymm13
6505 vpxor %ymm0,%ymm12,%ymm12
6506 vpshufb %ymm8,%ymm15,%ymm15
6507 vpshufb %ymm8,%ymm14,%ymm14
6508 vpshufb %ymm8,%ymm13,%ymm13
6509 vpshufb %ymm8,%ymm12,%ymm12
6510 vmovdqa 128(%rbp),%ymm8
6511 vpaddd %ymm15,%ymm11,%ymm11
6512 vpaddd %ymm14,%ymm10,%ymm10
6513 vpaddd %ymm13,%ymm9,%ymm9
6514 vpaddd %ymm12,%ymm8,%ymm8
6515 vpxor %ymm11,%ymm7,%ymm7
6516 vpxor %ymm10,%ymm6,%ymm6
6517 vpxor %ymm9,%ymm5,%ymm5
6518 vpxor %ymm8,%ymm4,%ymm4
6519 vmovdqa %ymm8,128(%rbp)
6520 vpsrld $25,%ymm7,%ymm8
6521 vpslld $32-25,%ymm7,%ymm7
6522 vpxor %ymm8,%ymm7,%ymm7
6523 vpsrld $25,%ymm6,%ymm8
6524 vpslld $32-25,%ymm6,%ymm6
6525 vpxor %ymm8,%ymm6,%ymm6
6526 vpsrld $25,%ymm5,%ymm8
6527 vpslld $32-25,%ymm5,%ymm5
6528 vpxor %ymm8,%ymm5,%ymm5
6529 vpsrld $25,%ymm4,%ymm8
6530 vpslld $32-25,%ymm4,%ymm4
6531 vpxor %ymm8,%ymm4,%ymm4
# Undo the row rotation (vpalignr by 12, 8, 4 bytes for rows b, c, d).
6532 vmovdqa 128(%rbp),%ymm8
6533 vpalignr $12,%ymm7,%ymm7,%ymm7
6534 vpalignr $8,%ymm11,%ymm11,%ymm11
6535 vpalignr $4,%ymm15,%ymm15,%ymm15
6536 vpalignr $12,%ymm6,%ymm6,%ymm6
6537 vpalignr $8,%ymm10,%ymm10,%ymm10
6538 vpalignr $4,%ymm14,%ymm14,%ymm14
6539 vpalignr $12,%ymm5,%ymm5,%ymm5
6540 vpalignr $8,%ymm9,%ymm9,%ymm9
6541 vpalignr $4,%ymm13,%ymm13,%ymm13
6542 vpalignr $12,%ymm4,%ymm4,%ymm4
6543 vpalignr $8,%ymm8,%ymm8,%ymm8
6544 vpalignr $4,%ymm12,%ymm12,%ymm12
# Start of the next column round, carried only up to the third a += b. On exit
# ymm8 holds the .rol8 mask and row c of state 0 is parked at 128(%rbp), which
# is the state local label 4 expects (it continues with d ^= a and the .rol8
# shuffle, then reloads ymm8 from 128(%rbp)).
6545 vmovdqa %ymm8,128(%rbp)
6546 vmovdqa .rol16(%rip),%ymm8
6547 vpaddd %ymm7,%ymm3,%ymm3
6548 vpaddd %ymm6,%ymm2,%ymm2
6549 vpaddd %ymm5,%ymm1,%ymm1
6550 vpaddd %ymm4,%ymm0,%ymm0
6551 vpxor %ymm3,%ymm15,%ymm15
6552 vpxor %ymm2,%ymm14,%ymm14
6553 vpxor %ymm1,%ymm13,%ymm13
6554 vpxor %ymm0,%ymm12,%ymm12
6555 vpshufb %ymm8,%ymm15,%ymm15
6556 vpshufb %ymm8,%ymm14,%ymm14
6557 vpshufb %ymm8,%ymm13,%ymm13
6558 vpshufb %ymm8,%ymm12,%ymm12
6559 vmovdqa 128(%rbp),%ymm8
6560 vpaddd %ymm15,%ymm11,%ymm11
6561 vpaddd %ymm14,%ymm10,%ymm10
6562 vpaddd %ymm13,%ymm9,%ymm9
6563 vpaddd %ymm12,%ymm8,%ymm8
6564 vpxor %ymm11,%ymm7,%ymm7
6565 vpxor %ymm10,%ymm6,%ymm6
6566 vpxor %ymm9,%ymm5,%ymm5
6567 vpxor %ymm8,%ymm4,%ymm4
6568 vmovdqa %ymm8,128(%rbp)
6569 vpsrld $20,%ymm7,%ymm8
6570 vpslld $32-20,%ymm7,%ymm7
6571 vpxor %ymm8,%ymm7,%ymm7
6572 vpsrld $20,%ymm6,%ymm8
6573 vpslld $32-20,%ymm6,%ymm6
6574 vpxor %ymm8,%ymm6,%ymm6
6575 vpsrld $20,%ymm5,%ymm8
6576 vpslld $32-20,%ymm5,%ymm5
6577 vpxor %ymm8,%ymm5,%ymm5
6578 vpsrld $20,%ymm4,%ymm8
6579 vpslld $32-20,%ymm4,%ymm4
6580 vpxor %ymm8,%ymm4,%ymm4
6581 vmovdqa .rol8(%rip),%ymm8
6582 vpaddd %ymm7,%ymm3,%ymm3
6583 vpaddd %ymm6,%ymm2,%ymm2
6584 vpaddd %ymm5,%ymm1,%ymm1
6585 vpaddd %ymm4,%ymm0,%ymm0
6586
# Enter the main loop mid-iteration. Label 4 sits after the hash step that
# reads 0(%rdi), and its own reads use 16(%rdi) and 32(%rdi); lowering rdi by
# 16 makes those reads start at the current rdi. rcx = 9 is the loop counter
# value used for this first, shortened pass.
6587 subq $16,%rdi
6588 movq $9,%rcx
6589 jmp 4f
# Local label 1: top of the main loop (the loop-back jump at the bottom
# targets it while more than 512 bytes remain). Builds four fresh ChaCha20
# states: rows a, b, c of every state are .chacha20_consts, 64(%rbp) and
# 96(%rbp); state i lives in ymm(i), ymm(4+i), ymm(8+i), ymm(12+i).
65901:
6591 vmovdqa .chacha20_consts(%rip),%ymm0
6592 vmovdqa 64(%rbp),%ymm4
6593 vmovdqa 96(%rbp),%ymm8
6594 vmovdqa %ymm0,%ymm1
6595 vmovdqa %ymm4,%ymm5
6596 vmovdqa %ymm8,%ymm9
6597 vmovdqa %ymm0,%ymm2
6598 vmovdqa %ymm4,%ymm6
6599 vmovdqa %ymm8,%ymm10
6600 vmovdqa %ymm0,%ymm3
6601 vmovdqa %ymm4,%ymm7
6602 vmovdqa %ymm8,%ymm11
# Row d: start from the value saved at 160(%rbp) and add .avx2_inc (2 in the
# first dword of each 128-bit lane) once for ymm15, twice for ymm14, three
# times for ymm13 and four times for ymm12. The results are saved at
# 256/224/192/160(%rbp) so they can be added back after the rounds; 160(%rbp)
# therefore ends up holding the most advanced value.
6603 vmovdqa .avx2_inc(%rip),%ymm12
6604 vpaddd 160(%rbp),%ymm12,%ymm15
6605 vpaddd %ymm15,%ymm12,%ymm14
6606 vpaddd %ymm14,%ymm12,%ymm13
6607 vpaddd %ymm13,%ymm12,%ymm12
6608 vmovdqa %ymm15,256(%rbp)
6609 vmovdqa %ymm14,224(%rbp)
6610 vmovdqa %ymm13,192(%rbp)
6611 vmovdqa %ymm12,160(%rbp)
6612
# Inner-loop iteration count: rcx = 10 (one double round per iteration).
6613 movq $10,%rcx
# Local label 2: inner loop of the main loop. Each iteration performs one
# ChaCha20 double round (column round, then diagonal round) on the four states
# in ymm0-ymm15 and, interleaved with it, three scalar hash steps over the
# 16-byte blocks at 0(%rdi), 16(%rdi) and 32(%rdi); rdi advances by 48 per
# iteration. The loop repeats until rcx is decremented to zero.
#
# Hash step: the accumulator r10:r11:r12 gets the 16-byte block added with an
# extra 1 carried into r12, is multiplied by the 128-bit value stored at
# 0(%rbp)/8(%rbp) (mulx/imul, partial products collected in r13:r14:r15:r9),
# and is then folded: the low 130 bits are kept and the bits above (H) are
# added back as 4*H plus H.
# NOTE(review): this matches Poly1305 arithmetic modulo 2^130 - 5 - confirm.
#
# ymm8 doubles as shuffle mask and shift scratch, so row c of state 0 is
# parked at 128(%rbp) whenever ymm8 is borrowed.
66142:
6615 addq 0(%rdi),%r10
6616 adcq 8+0(%rdi),%r11
6617 adcq $1,%r12
# Column round step 1: a += b; d ^= a (rotation by 16 follows below).
6618 vmovdqa %ymm8,128(%rbp)
6619 vmovdqa .rol16(%rip),%ymm8
6620 vpaddd %ymm7,%ymm3,%ymm3
6621 vpaddd %ymm6,%ymm2,%ymm2
6622 vpaddd %ymm5,%ymm1,%ymm1
6623 vpaddd %ymm4,%ymm0,%ymm0
6624 vpxor %ymm3,%ymm15,%ymm15
6625 vpxor %ymm2,%ymm14,%ymm14
6626 vpxor %ymm1,%ymm13,%ymm13
6627 vpxor %ymm0,%ymm12,%ymm12
# Hash multiply, part 1: accumulator times the low multiplier word 0(%rbp).
6628 movq 0+0(%rbp),%rdx
6629 movq %rdx,%r15
6630 mulxq %r10,%r13,%r14
6631 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006632 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006633 addq %rax,%r14
6634 adcq %rdx,%r15
6635 vpshufb %ymm8,%ymm15,%ymm15
6636 vpshufb %ymm8,%ymm14,%ymm14
6637 vpshufb %ymm8,%ymm13,%ymm13
6638 vpshufb %ymm8,%ymm12,%ymm12
6639 vmovdqa 128(%rbp),%ymm8
6640 vpaddd %ymm15,%ymm11,%ymm11
6641 vpaddd %ymm14,%ymm10,%ymm10
6642 vpaddd %ymm13,%ymm9,%ymm9
6643 vpaddd %ymm12,%ymm8,%ymm8
# Hash multiply, part 2: accumulator times the high multiplier word 8(%rbp).
6644 movq 8+0(%rbp),%rdx
6645 mulxq %r10,%r10,%rax
6646 addq %r10,%r14
6647 mulxq %r11,%r11,%r9
6648 adcq %r11,%r15
6649 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006650 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05006651 vpxor %ymm11,%ymm7,%ymm7
6652 vpxor %ymm10,%ymm6,%ymm6
6653 vpxor %ymm9,%ymm5,%ymm5
6654 vpxor %ymm8,%ymm4,%ymm4
# Rotate row b left by 12 (shift right 20, shift left 12, xor).
6655 vmovdqa %ymm8,128(%rbp)
6656 vpsrld $20,%ymm7,%ymm8
6657 vpslld $32-20,%ymm7,%ymm7
6658 vpxor %ymm8,%ymm7,%ymm7
6659 vpsrld $20,%ymm6,%ymm8
6660 vpslld $32-20,%ymm6,%ymm6
6661 vpxor %ymm8,%ymm6,%ymm6
6662 vpsrld $20,%ymm5,%ymm8
6663 addq %rax,%r15
6664 adcq %rdx,%r9
6665 vpslld $32-20,%ymm5,%ymm5
6666 vpxor %ymm8,%ymm5,%ymm5
6667 vpsrld $20,%ymm4,%ymm8
6668 vpslld $32-20,%ymm4,%ymm4
6669 vpxor %ymm8,%ymm4,%ymm4
6670 vmovdqa .rol8(%rip),%ymm8
6671 vpaddd %ymm7,%ymm3,%ymm3
6672 vpaddd %ymm6,%ymm2,%ymm2
6673 vpaddd %ymm5,%ymm1,%ymm1
6674 vpaddd %ymm4,%ymm0,%ymm0
# Hash fold: keep the low 130 bits in r10:r11:r12, add back the rest as
# (r15 with its low two bits cleared, r9) plus (r15:r9 shifted right by 2).
6675 movq %r13,%r10
6676 movq %r14,%r11
6677 movq %r15,%r12
6678 andq $3,%r12
6679 movq %r15,%r13
6680 andq $-4,%r13
6681 movq %r9,%r14
6682 shrdq $2,%r9,%r15
6683 shrq $2,%r9
6684 addq %r13,%r10
6685 adcq %r14,%r11
6686 adcq $0,%r12
6687 addq %r15,%r10
6688 adcq %r9,%r11
6689 adcq $0,%r12
6690
# Local label 4: alternate entry point (a jmp 4f earlier in this file lands
# here). It expects ymm8 to hold the .rol8 mask and row c of state 0 to be at
# 128(%rbp); execution continues with d ^= a and the rotation by 8.
66914:
6692 vpxor %ymm3,%ymm15,%ymm15
6693 vpxor %ymm2,%ymm14,%ymm14
6694 vpxor %ymm1,%ymm13,%ymm13
6695 vpxor %ymm0,%ymm12,%ymm12
6696 vpshufb %ymm8,%ymm15,%ymm15
6697 vpshufb %ymm8,%ymm14,%ymm14
6698 vpshufb %ymm8,%ymm13,%ymm13
6699 vpshufb %ymm8,%ymm12,%ymm12
6700 vmovdqa 128(%rbp),%ymm8
# Second hash step of the iteration: add the block at 16(%rdi).
6701 addq 16(%rdi),%r10
6702 adcq 8+16(%rdi),%r11
6703 adcq $1,%r12
6704 vpaddd %ymm15,%ymm11,%ymm11
6705 vpaddd %ymm14,%ymm10,%ymm10
6706 vpaddd %ymm13,%ymm9,%ymm9
6707 vpaddd %ymm12,%ymm8,%ymm8
6708 vpxor %ymm11,%ymm7,%ymm7
6709 vpxor %ymm10,%ymm6,%ymm6
6710 vpxor %ymm9,%ymm5,%ymm5
6711 vpxor %ymm8,%ymm4,%ymm4
6712 movq 0+0(%rbp),%rdx
6713 movq %rdx,%r15
6714 mulxq %r10,%r13,%r14
6715 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006716 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006717 addq %rax,%r14
6718 adcq %rdx,%r15
# Rotate row b left by 7 (shift right 25, shift left 7, xor).
6719 vmovdqa %ymm8,128(%rbp)
6720 vpsrld $25,%ymm7,%ymm8
6721 vpslld $32-25,%ymm7,%ymm7
6722 vpxor %ymm8,%ymm7,%ymm7
6723 vpsrld $25,%ymm6,%ymm8
6724 vpslld $32-25,%ymm6,%ymm6
6725 vpxor %ymm8,%ymm6,%ymm6
6726 vpsrld $25,%ymm5,%ymm8
6727 vpslld $32-25,%ymm5,%ymm5
6728 vpxor %ymm8,%ymm5,%ymm5
6729 vpsrld $25,%ymm4,%ymm8
6730 vpslld $32-25,%ymm4,%ymm4
6731 vpxor %ymm8,%ymm4,%ymm4
# Rotate rows b, c, d within each 128-bit lane (vpalignr by 4, 8, 12 bytes) so
# the following pass works on the diagonals.
6732 vmovdqa 128(%rbp),%ymm8
6733 vpalignr $4,%ymm7,%ymm7,%ymm7
6734 vpalignr $8,%ymm11,%ymm11,%ymm11
6735 vpalignr $12,%ymm15,%ymm15,%ymm15
6736 vpalignr $4,%ymm6,%ymm6,%ymm6
6737 movq 8+0(%rbp),%rdx
6738 mulxq %r10,%r10,%rax
6739 addq %r10,%r14
6740 mulxq %r11,%r11,%r9
6741 adcq %r11,%r15
6742 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006743 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05006744 vpalignr $8,%ymm10,%ymm10,%ymm10
6745 vpalignr $12,%ymm14,%ymm14,%ymm14
6746 vpalignr $4,%ymm5,%ymm5,%ymm5
6747 vpalignr $8,%ymm9,%ymm9,%ymm9
6748 vpalignr $12,%ymm13,%ymm13,%ymm13
6749 vpalignr $4,%ymm4,%ymm4,%ymm4
6750 vpalignr $8,%ymm8,%ymm8,%ymm8
6751 vpalignr $12,%ymm12,%ymm12,%ymm12
# Diagonal round: same step sequence (rotations 16, 12, 8, 7).
6752 vmovdqa %ymm8,128(%rbp)
6753 vmovdqa .rol16(%rip),%ymm8
6754 vpaddd %ymm7,%ymm3,%ymm3
6755 vpaddd %ymm6,%ymm2,%ymm2
6756 vpaddd %ymm5,%ymm1,%ymm1
6757 vpaddd %ymm4,%ymm0,%ymm0
6758 vpxor %ymm3,%ymm15,%ymm15
6759 vpxor %ymm2,%ymm14,%ymm14
6760 vpxor %ymm1,%ymm13,%ymm13
6761 vpxor %ymm0,%ymm12,%ymm12
6762 addq %rax,%r15
6763 adcq %rdx,%r9
6764 vpshufb %ymm8,%ymm15,%ymm15
6765 vpshufb %ymm8,%ymm14,%ymm14
6766 vpshufb %ymm8,%ymm13,%ymm13
6767 vpshufb %ymm8,%ymm12,%ymm12
6768 vmovdqa 128(%rbp),%ymm8
6769 vpaddd %ymm15,%ymm11,%ymm11
6770 vpaddd %ymm14,%ymm10,%ymm10
6771 vpaddd %ymm13,%ymm9,%ymm9
6772 vpaddd %ymm12,%ymm8,%ymm8
# Hash fold for the second block.
6773 movq %r13,%r10
6774 movq %r14,%r11
6775 movq %r15,%r12
6776 andq $3,%r12
6777 movq %r15,%r13
6778 andq $-4,%r13
6779 movq %r9,%r14
6780 shrdq $2,%r9,%r15
6781 shrq $2,%r9
6782 addq %r13,%r10
6783 adcq %r14,%r11
6784 adcq $0,%r12
6785 addq %r15,%r10
6786 adcq %r9,%r11
6787 adcq $0,%r12
6788 vpxor %ymm11,%ymm7,%ymm7
6789 vpxor %ymm10,%ymm6,%ymm6
6790 vpxor %ymm9,%ymm5,%ymm5
6791 vpxor %ymm8,%ymm4,%ymm4
6792 vmovdqa %ymm8,128(%rbp)
6793 vpsrld $20,%ymm7,%ymm8
6794 vpslld $32-20,%ymm7,%ymm7
6795 vpxor %ymm8,%ymm7,%ymm7
# Third hash step of the iteration: add the block at 32(%rdi), then move rdi
# past the 48 bytes consumed by this iteration.
6796 addq 32(%rdi),%r10
6797 adcq 8+32(%rdi),%r11
6798 adcq $1,%r12
6799
6800 leaq 48(%rdi),%rdi
6801 vpsrld $20,%ymm6,%ymm8
6802 vpslld $32-20,%ymm6,%ymm6
6803 vpxor %ymm8,%ymm6,%ymm6
6804 vpsrld $20,%ymm5,%ymm8
6805 vpslld $32-20,%ymm5,%ymm5
6806 vpxor %ymm8,%ymm5,%ymm5
6807 vpsrld $20,%ymm4,%ymm8
6808 vpslld $32-20,%ymm4,%ymm4
6809 vpxor %ymm8,%ymm4,%ymm4
6810 vmovdqa .rol8(%rip),%ymm8
6811 vpaddd %ymm7,%ymm3,%ymm3
6812 vpaddd %ymm6,%ymm2,%ymm2
6813 vpaddd %ymm5,%ymm1,%ymm1
6814 vpaddd %ymm4,%ymm0,%ymm0
6815 vpxor %ymm3,%ymm15,%ymm15
6816 vpxor %ymm2,%ymm14,%ymm14
6817 vpxor %ymm1,%ymm13,%ymm13
6818 vpxor %ymm0,%ymm12,%ymm12
6819 movq 0+0(%rbp),%rdx
6820 movq %rdx,%r15
6821 mulxq %r10,%r13,%r14
6822 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006823 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006824 addq %rax,%r14
6825 adcq %rdx,%r15
6826 vpshufb %ymm8,%ymm15,%ymm15
6827 vpshufb %ymm8,%ymm14,%ymm14
6828 vpshufb %ymm8,%ymm13,%ymm13
6829 vpshufb %ymm8,%ymm12,%ymm12
6830 vmovdqa 128(%rbp),%ymm8
6831 vpaddd %ymm15,%ymm11,%ymm11
6832 vpaddd %ymm14,%ymm10,%ymm10
6833 vpaddd %ymm13,%ymm9,%ymm9
6834 movq 8+0(%rbp),%rdx
6835 mulxq %r10,%r10,%rax
6836 addq %r10,%r14
6837 mulxq %r11,%r11,%r9
6838 adcq %r11,%r15
6839 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006840 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05006841 vpaddd %ymm12,%ymm8,%ymm8
6842 vpxor %ymm11,%ymm7,%ymm7
6843 vpxor %ymm10,%ymm6,%ymm6
6844 vpxor %ymm9,%ymm5,%ymm5
6845 vpxor %ymm8,%ymm4,%ymm4
6846 vmovdqa %ymm8,128(%rbp)
6847 vpsrld $25,%ymm7,%ymm8
6848 vpslld $32-25,%ymm7,%ymm7
6849 addq %rax,%r15
6850 adcq %rdx,%r9
6851 vpxor %ymm8,%ymm7,%ymm7
6852 vpsrld $25,%ymm6,%ymm8
6853 vpslld $32-25,%ymm6,%ymm6
6854 vpxor %ymm8,%ymm6,%ymm6
6855 vpsrld $25,%ymm5,%ymm8
6856 vpslld $32-25,%ymm5,%ymm5
6857 vpxor %ymm8,%ymm5,%ymm5
6858 vpsrld $25,%ymm4,%ymm8
6859 vpslld $32-25,%ymm4,%ymm4
6860 vpxor %ymm8,%ymm4,%ymm4
# Undo the row rotation (vpalignr by 12, 8, 4 bytes for rows b, c, d), with
# the fold of the third hash step interleaved.
6861 vmovdqa 128(%rbp),%ymm8
6862 vpalignr $12,%ymm7,%ymm7,%ymm7
6863 vpalignr $8,%ymm11,%ymm11,%ymm11
6864 vpalignr $4,%ymm15,%ymm15,%ymm15
6865 vpalignr $12,%ymm6,%ymm6,%ymm6
6866 vpalignr $8,%ymm10,%ymm10,%ymm10
6867 vpalignr $4,%ymm14,%ymm14,%ymm14
6868 vpalignr $12,%ymm5,%ymm5,%ymm5
6869 movq %r13,%r10
6870 movq %r14,%r11
6871 movq %r15,%r12
6872 andq $3,%r12
6873 movq %r15,%r13
6874 andq $-4,%r13
6875 movq %r9,%r14
6876 shrdq $2,%r9,%r15
6877 shrq $2,%r9
6878 addq %r13,%r10
6879 adcq %r14,%r11
6880 adcq $0,%r12
6881 addq %r15,%r10
6882 adcq %r9,%r11
6883 adcq $0,%r12
6884 vpalignr $8,%ymm9,%ymm9,%ymm9
6885 vpalignr $4,%ymm13,%ymm13,%ymm13
6886 vpalignr $12,%ymm4,%ymm4,%ymm4
6887 vpalignr $8,%ymm8,%ymm8,%ymm8
6888 vpalignr $4,%ymm12,%ymm12,%ymm12
6889
# Next double round while rcx is non-zero after the decrement.
6890 decq %rcx
6891 jne 2b
# Bottom half of the main loop, run after the inner round loop has finished.
# It adds the saved input rows back into the four states, XORs 512 bytes of
# input at (%rsi) with the resulting keystream and stores them at 0..511(%rdi)
# (measured after rdi has been advanced by 32), performs two more scalar hash
# steps over the 16-byte blocks at -32(%rdi) and -16(%rdi), then loops back to
# local label 1 while more than 512 bytes remain.
#
# Hash step: r10:r11:r12 += block (with 1 added into r12); multiply by the
# 128-bit value at 0(%rbp)/8(%rbp) using mulq/imulq; fold by keeping the low
# 130 bits and adding the bits above (H) back as 4*H plus H.
# NOTE(review): this matches Poly1305 arithmetic modulo 2^130 - 5 - confirm.
6892 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
6893 vpaddd 64(%rbp),%ymm7,%ymm7
6894 vpaddd 96(%rbp),%ymm11,%ymm11
6895 vpaddd 256(%rbp),%ymm15,%ymm15
6896 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6897 vpaddd 64(%rbp),%ymm6,%ymm6
6898 vpaddd 96(%rbp),%ymm10,%ymm10
6899 vpaddd 224(%rbp),%ymm14,%ymm14
6900 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6901 vpaddd 64(%rbp),%ymm5,%ymm5
6902 vpaddd 96(%rbp),%ymm9,%ymm9
6903 vpaddd 192(%rbp),%ymm13,%ymm13
6904 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6905 vpaddd 64(%rbp),%ymm4,%ymm4
6906 vpaddd 96(%rbp),%ymm8,%ymm8
6907 vpaddd 160(%rbp),%ymm12,%ymm12
6908
# Advance rdi by 32, park ymm0 at 128(%rbp) so ymm0 can serve as a temporary,
# and add the block at -32(%rdi) into the hash accumulator.
6909 leaq 32(%rdi),%rdi
6910 vmovdqa %ymm0,128(%rbp)
6911 addq -32(%rdi),%r10
6912 adcq 8+-32(%rdi),%r11
6913 adcq $1,%r12
# State 3: regroup the rows lane by lane into stream order (ymm0, ymm3, ymm7,
# ymm11), XOR with input bytes 0..127 and store to output bytes 0..127.
6914 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
6915 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
6916 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
6917 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
6918 vpxor 0+0(%rsi),%ymm0,%ymm0
6919 vpxor 32+0(%rsi),%ymm3,%ymm3
6920 vpxor 64+0(%rsi),%ymm7,%ymm7
6921 vpxor 96+0(%rsi),%ymm11,%ymm11
6922 vmovdqu %ymm0,0+0(%rdi)
6923 vmovdqu %ymm3,32+0(%rdi)
6924 vmovdqu %ymm7,64+0(%rdi)
6925 vmovdqu %ymm11,96+0(%rdi)
6926
# Restore row a of state 0, then multiply and fold the hash accumulator.
6927 vmovdqa 128(%rbp),%ymm0
6928 movq 0+0(%rbp),%rax
6929 movq %rax,%r15
6930 mulq %r10
6931 movq %rax,%r13
6932 movq %rdx,%r14
6933 movq 0+0(%rbp),%rax
6934 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08006935 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006936 addq %rax,%r14
6937 adcq %rdx,%r15
6938 movq 8+0(%rbp),%rax
6939 movq %rax,%r9
6940 mulq %r10
6941 addq %rax,%r14
6942 adcq $0,%rdx
6943 movq %rdx,%r10
6944 movq 8+0(%rbp),%rax
6945 mulq %r11
6946 addq %rax,%r15
6947 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006948 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05006949 addq %r10,%r15
6950 adcq %rdx,%r9
6951 movq %r13,%r10
6952 movq %r14,%r11
6953 movq %r15,%r12
6954 andq $3,%r12
6955 movq %r15,%r13
6956 andq $-4,%r13
6957 movq %r9,%r14
6958 shrdq $2,%r9,%r15
6959 shrq $2,%r9
6960 addq %r13,%r10
6961 adcq %r14,%r11
6962 adcq $0,%r12
6963 addq %r15,%r10
6964 adcq %r9,%r11
6965 adcq $0,%r12
# State 2: regroup, XOR with input bytes 128..255, store to output 128..255.
6966 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
6967 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
6968 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
6969 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
6970 vpxor 0+128(%rsi),%ymm3,%ymm3
6971 vpxor 32+128(%rsi),%ymm2,%ymm2
6972 vpxor 64+128(%rsi),%ymm6,%ymm6
6973 vpxor 96+128(%rsi),%ymm10,%ymm10
6974 vmovdqu %ymm3,0+128(%rdi)
6975 vmovdqu %ymm2,32+128(%rdi)
6976 vmovdqu %ymm6,64+128(%rdi)
6977 vmovdqu %ymm10,96+128(%rdi)
# Add the block at -16(%rdi) into the hash accumulator.
6978 addq -16(%rdi),%r10
6979 adcq 8+-16(%rdi),%r11
6980 adcq $1,%r12
# State 1: regroup, XOR with input bytes 256..383, store to output 256..383.
6981 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
6982 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
6983 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
6984 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
6985 vpxor 0+256(%rsi),%ymm3,%ymm3
6986 vpxor 32+256(%rsi),%ymm1,%ymm1
6987 vpxor 64+256(%rsi),%ymm5,%ymm5
6988 vpxor 96+256(%rsi),%ymm9,%ymm9
6989 vmovdqu %ymm3,0+256(%rdi)
6990 vmovdqu %ymm1,32+256(%rdi)
6991 vmovdqu %ymm5,64+256(%rdi)
6992 vmovdqu %ymm9,96+256(%rdi)
# Multiply and fold for the second block.
6993 movq 0+0(%rbp),%rax
6994 movq %rax,%r15
6995 mulq %r10
6996 movq %rax,%r13
6997 movq %rdx,%r14
6998 movq 0+0(%rbp),%rax
6999 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007000 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007001 addq %rax,%r14
7002 adcq %rdx,%r15
7003 movq 8+0(%rbp),%rax
7004 movq %rax,%r9
7005 mulq %r10
7006 addq %rax,%r14
7007 adcq $0,%rdx
7008 movq %rdx,%r10
7009 movq 8+0(%rbp),%rax
7010 mulq %r11
7011 addq %rax,%r15
7012 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007013 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007014 addq %r10,%r15
7015 adcq %rdx,%r9
7016 movq %r13,%r10
7017 movq %r14,%r11
7018 movq %r15,%r12
7019 andq $3,%r12
7020 movq %r15,%r13
7021 andq $-4,%r13
7022 movq %r9,%r14
7023 shrdq $2,%r9,%r15
7024 shrq $2,%r9
7025 addq %r13,%r10
7026 adcq %r14,%r11
7027 adcq $0,%r12
7028 addq %r15,%r10
7029 adcq %r9,%r11
7030 adcq $0,%r12
# State 0: regroup, XOR with input bytes 384..511, store to output 384..511.
7031 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
7032 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
7033 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
7034 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
7035 vpxor 0+384(%rsi),%ymm3,%ymm3
7036 vpxor 32+384(%rsi),%ymm0,%ymm0
7037 vpxor 64+384(%rsi),%ymm4,%ymm4
7038 vpxor 96+384(%rsi),%ymm8,%ymm8
7039 vmovdqu %ymm3,0+384(%rdi)
7040 vmovdqu %ymm0,32+384(%rdi)
7041 vmovdqu %ymm4,64+384(%rdi)
7042 vmovdqu %ymm8,96+384(%rdi)
7043
# Consume 512 input bytes and repeat from local label 1 while more than 512
# bytes remain in rbx.
# NOTE(review): jg is a signed comparison; it only differs from the unsigned
# ja for lengths with the top bit set - confirm that cannot occur here.
7044 leaq 512(%rsi),%rsi
7045 subq $512,%rbx
7046 cmpq $512,%rbx
7047 jg 1b
# Runs once the main loop has exited (512 bytes or fewer remain in rbx).
# Performs two scalar hash steps over the 16-byte blocks at 0(%rdi) and
# 16(%rdi), advances rdi by 32, presets the tail loop counters (rcx = 10,
# r8 = 0) and selects a tail routine by remaining length.
#
# Hash step: r10:r11:r12 += block (with 1 added into r12); multiply by the
# 128-bit value at 0(%rbp)/8(%rbp) using mulq/imulq (partial products in
# r13:r14:r15:r9); fold by keeping the low 130 bits and adding the bits above
# (H) back as 4*H plus H.
# NOTE(review): this matches Poly1305 arithmetic modulo 2^130 - 5 - confirm.
7048 addq 0(%rdi),%r10
7049 adcq 8+0(%rdi),%r11
7050 adcq $1,%r12
7051 movq 0+0(%rbp),%rax
7052 movq %rax,%r15
7053 mulq %r10
7054 movq %rax,%r13
7055 movq %rdx,%r14
7056 movq 0+0(%rbp),%rax
7057 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007058 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007059 addq %rax,%r14
7060 adcq %rdx,%r15
7061 movq 8+0(%rbp),%rax
7062 movq %rax,%r9
7063 mulq %r10
7064 addq %rax,%r14
7065 adcq $0,%rdx
7066 movq %rdx,%r10
7067 movq 8+0(%rbp),%rax
7068 mulq %r11
7069 addq %rax,%r15
7070 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007071 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007072 addq %r10,%r15
7073 adcq %rdx,%r9
7074 movq %r13,%r10
7075 movq %r14,%r11
7076 movq %r15,%r12
7077 andq $3,%r12
7078 movq %r15,%r13
7079 andq $-4,%r13
7080 movq %r9,%r14
7081 shrdq $2,%r9,%r15
7082 shrq $2,%r9
7083 addq %r13,%r10
7084 adcq %r14,%r11
7085 adcq $0,%r12
7086 addq %r15,%r10
7087 adcq %r9,%r11
7088 adcq $0,%r12
# Second block, at 16(%rdi).
7089 addq 16(%rdi),%r10
7090 adcq 8+16(%rdi),%r11
7091 adcq $1,%r12
7092 movq 0+0(%rbp),%rax
7093 movq %rax,%r15
7094 mulq %r10
7095 movq %rax,%r13
7096 movq %rdx,%r14
7097 movq 0+0(%rbp),%rax
7098 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007099 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007100 addq %rax,%r14
7101 adcq %rdx,%r15
7102 movq 8+0(%rbp),%rax
7103 movq %rax,%r9
7104 mulq %r10
7105 addq %rax,%r14
7106 adcq $0,%rdx
7107 movq %rdx,%r10
7108 movq 8+0(%rbp),%rax
7109 mulq %r11
7110 addq %rax,%r15
7111 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007112 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007113 addq %r10,%r15
7114 adcq %rdx,%r9
7115 movq %r13,%r10
7116 movq %r14,%r11
7117 movq %r15,%r12
7118 andq $3,%r12
7119 movq %r15,%r13
7120 andq $-4,%r13
7121 movq %r9,%r14
7122 shrdq $2,%r9,%r15
7123 shrq $2,%r9
7124 addq %r13,%r10
7125 adcq %r14,%r11
7126 adcq $0,%r12
7127 addq %r15,%r10
7128 adcq %r9,%r11
7129 adcq $0,%r12
7130
# Step past the 32 bytes just hashed and preset the tail counters: rcx = 10,
# r8 = 0. With more than 128 bytes left (unsigned) go on to local label 3;
# otherwise fall through into seal_avx2_tail_128.
7131 leaq 32(%rdi),%rdi
7132 movq $10,%rcx
7133 xorq %r8,%r8
7134 cmpq $128,%rbx
7135 ja 3f
7136
# seal_avx2_tail_128: runs the ChaCha20 rounds for one more state (one ymm
# register per row: a=ymm0, b=ymm4, c=ymm8, d=ymm12, two 128-bit lanes each)
# while continuing the scalar hash over 16-byte blocks read through rdi.
#
# Inputs visible here: rcx and r8 are loop counters preset by the code that
# reaches this label (this file uses rcx=8 with r8=2, and rcx=10 with r8=0);
# r10:r11:r12 is the hash accumulator; 0(%rbp)/8(%rbp) hold the hash
# multiplier; 64(%rbp), 96(%rbp), 160(%rbp) hold the saved state rows.
# Result: keystream in stream order in ymm0, ymm4, ymm8, ymm12, then a jump to
# seal_avx2_short_loop (defined outside this chunk). ymm3 is used as scratch.
#
# Hash step: r10:r11:r12 += block (with 1 added into r12); multiply by the
# 128-bit value at 0(%rbp)/8(%rbp) using mulq/imulq; fold by keeping the low
# 130 bits and adding the bits above (H) back as 4*H plus H.
# NOTE(review): this matches Poly1305 arithmetic modulo 2^130 - 5 - confirm.
7137seal_avx2_tail_128:
7138 vmovdqa .chacha20_consts(%rip),%ymm0
7139 vmovdqa 64(%rbp),%ymm4
7140 vmovdqa 96(%rbp),%ymm8
# Row d = saved value at 160(%rbp) plus .avx2_inc (2 in the first dword of
# each 128-bit lane); the new value is written back to 160(%rbp).
7141 vmovdqa .avx2_inc(%rip),%ymm12
7142 vpaddd 160(%rbp),%ymm12,%ymm12
7143 vmovdqa %ymm12,160(%rbp)
7144
# Local label 1: extra hash step over the block at 0(%rdi), executed only on
# passes that re-enter here (while rcx is still positive); rdi += 16 after it.
71451:
7146 addq 0(%rdi),%r10
7147 adcq 8+0(%rdi),%r11
7148 adcq $1,%r12
7149 movq 0+0(%rbp),%rax
7150 movq %rax,%r15
7151 mulq %r10
7152 movq %rax,%r13
7153 movq %rdx,%r14
7154 movq 0+0(%rbp),%rax
7155 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007156 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007157 addq %rax,%r14
7158 adcq %rdx,%r15
7159 movq 8+0(%rbp),%rax
7160 movq %rax,%r9
7161 mulq %r10
7162 addq %rax,%r14
7163 adcq $0,%rdx
7164 movq %rdx,%r10
7165 movq 8+0(%rbp),%rax
7166 mulq %r11
7167 addq %rax,%r15
7168 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007169 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007170 addq %r10,%r15
7171 adcq %rdx,%r9
7172 movq %r13,%r10
7173 movq %r14,%r11
7174 movq %r15,%r12
7175 andq $3,%r12
7176 movq %r15,%r13
7177 andq $-4,%r13
7178 movq %r9,%r14
7179 shrdq $2,%r9,%r15
7180 shrq $2,%r9
7181 addq %r13,%r10
7182 adcq %r14,%r11
7183 adcq $0,%r12
7184 addq %r15,%r10
7185 adcq %r9,%r11
7186 adcq $0,%r12
7187
7188 leaq 16(%rdi),%rdi
# Local label 2: one double round plus two hash steps (blocks at 0(%rdi) and
# 16(%rdi)); rdi advances by 32 at the end of the pass.
# Column round: a += b; d ^= a; d rol 16; c += d; b ^= c; b rol 12;
# a += b; d ^= a; d rol 8; c += d; b ^= c; b rol 7.
71892:
7190 vpaddd %ymm4,%ymm0,%ymm0
7191 vpxor %ymm0,%ymm12,%ymm12
7192 vpshufb .rol16(%rip),%ymm12,%ymm12
7193 vpaddd %ymm12,%ymm8,%ymm8
7194 vpxor %ymm8,%ymm4,%ymm4
7195 vpsrld $20,%ymm4,%ymm3
7196 vpslld $12,%ymm4,%ymm4
7197 vpxor %ymm3,%ymm4,%ymm4
7198 vpaddd %ymm4,%ymm0,%ymm0
7199 vpxor %ymm0,%ymm12,%ymm12
7200 vpshufb .rol8(%rip),%ymm12,%ymm12
7201 vpaddd %ymm12,%ymm8,%ymm8
7202 vpxor %ymm8,%ymm4,%ymm4
7203 vpslld $7,%ymm4,%ymm3
7204 vpsrld $25,%ymm4,%ymm4
7205 vpxor %ymm3,%ymm4,%ymm4
# Rotate rows d, c, b within each 128-bit lane (vpalignr by 12, 8, 4 bytes) so
# the next pass works on the diagonals.
7206 vpalignr $12,%ymm12,%ymm12,%ymm12
7207 vpalignr $8,%ymm8,%ymm8,%ymm8
7208 vpalignr $4,%ymm4,%ymm4,%ymm4
# Hash step over the block at 0(%rdi).
7209 addq 0(%rdi),%r10
7210 adcq 8+0(%rdi),%r11
7211 adcq $1,%r12
7212 movq 0+0(%rbp),%rax
7213 movq %rax,%r15
7214 mulq %r10
7215 movq %rax,%r13
7216 movq %rdx,%r14
7217 movq 0+0(%rbp),%rax
7218 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007219 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007220 addq %rax,%r14
7221 adcq %rdx,%r15
7222 movq 8+0(%rbp),%rax
7223 movq %rax,%r9
7224 mulq %r10
7225 addq %rax,%r14
7226 adcq $0,%rdx
7227 movq %rdx,%r10
7228 movq 8+0(%rbp),%rax
7229 mulq %r11
7230 addq %rax,%r15
7231 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007232 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007233 addq %r10,%r15
7234 adcq %rdx,%r9
7235 movq %r13,%r10
7236 movq %r14,%r11
7237 movq %r15,%r12
7238 andq $3,%r12
7239 movq %r15,%r13
7240 andq $-4,%r13
7241 movq %r9,%r14
7242 shrdq $2,%r9,%r15
7243 shrq $2,%r9
7244 addq %r13,%r10
7245 adcq %r14,%r11
7246 adcq $0,%r12
7247 addq %r15,%r10
7248 adcq %r9,%r11
7249 adcq $0,%r12
# Diagonal round: same step sequence as the column round above.
7250 vpaddd %ymm4,%ymm0,%ymm0
7251 vpxor %ymm0,%ymm12,%ymm12
7252 vpshufb .rol16(%rip),%ymm12,%ymm12
7253 vpaddd %ymm12,%ymm8,%ymm8
7254 vpxor %ymm8,%ymm4,%ymm4
7255 vpsrld $20,%ymm4,%ymm3
7256 vpslld $12,%ymm4,%ymm4
7257 vpxor %ymm3,%ymm4,%ymm4
7258 vpaddd %ymm4,%ymm0,%ymm0
7259 vpxor %ymm0,%ymm12,%ymm12
7260 vpshufb .rol8(%rip),%ymm12,%ymm12
7261 vpaddd %ymm12,%ymm8,%ymm8
7262 vpxor %ymm8,%ymm4,%ymm4
7263 vpslld $7,%ymm4,%ymm3
7264 vpsrld $25,%ymm4,%ymm4
7265 vpxor %ymm3,%ymm4,%ymm4
# Undo the row rotation (vpalignr by 4, 8, 12 bytes for rows d, c, b).
7266 vpalignr $4,%ymm12,%ymm12,%ymm12
7267 vpalignr $8,%ymm8,%ymm8,%ymm8
7268 vpalignr $12,%ymm4,%ymm4,%ymm4
# Hash step over the block at 16(%rdi).
7269 addq 16(%rdi),%r10
7270 adcq 8+16(%rdi),%r11
7271 adcq $1,%r12
7272 movq 0+0(%rbp),%rax
7273 movq %rax,%r15
7274 mulq %r10
7275 movq %rax,%r13
7276 movq %rdx,%r14
7277 movq 0+0(%rbp),%rax
7278 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007279 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007280 addq %rax,%r14
7281 adcq %rdx,%r15
7282 movq 8+0(%rbp),%rax
7283 movq %rax,%r9
7284 mulq %r10
7285 addq %rax,%r14
7286 adcq $0,%rdx
7287 movq %rdx,%r10
7288 movq 8+0(%rbp),%rax
7289 mulq %r11
7290 addq %rax,%r15
7291 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007292 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007293 addq %r10,%r15
7294 adcq %rdx,%r9
7295 movq %r13,%r10
7296 movq %r14,%r11
7297 movq %r15,%r12
7298 andq $3,%r12
7299 movq %r15,%r13
7300 andq $-4,%r13
7301 movq %r9,%r14
7302 shrdq $2,%r9,%r15
7303 shrq $2,%r9
7304 addq %r13,%r10
7305 adcq %r14,%r11
7306 adcq $0,%r12
7307 addq %r15,%r10
7308 adcq %r9,%r11
7309 adcq $0,%r12
7310
# Loop control (both tests are signed on purpose: rcx keeps being decremented
# and goes negative once the r8 phase starts). While the decremented rcx is
# still positive, repeat from label 1 (three hash steps per pass); afterwards,
# while the decremented r8 is not negative, repeat from label 2 (two hash
# steps per pass).
7311 leaq 32(%rdi),%rdi
7312 decq %rcx
7313 jg 1b
7314 decq %r8
7315 jge 2b
# Add the saved input rows back in, then regroup by 128-bit lane so that the
# keystream is in stream order in ymm0, ymm4, ymm8, ymm12 (ymm3 carries the
# third piece until it is copied into ymm8).
7316 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7317 vpaddd 64(%rbp),%ymm4,%ymm4
7318 vpaddd 96(%rbp),%ymm8,%ymm8
7319 vpaddd 160(%rbp),%ymm12,%ymm12
7320 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7321 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7322 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7323 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7324 vmovdqa %ymm3,%ymm8
7325
7326 jmp seal_avx2_short_loop
# Local label 3: next stage of the tail dispatch. With more than 256 bytes
# left in rbx (unsigned compare) skip ahead to the following local label 3;
# otherwise fall through into seal_avx2_tail_256.
73273:
7328 cmpq $256,%rbx
7329 ja 3f
7330
7331seal_avx2_tail_256:
7332 vmovdqa .chacha20_consts(%rip),%ymm0
7333 vmovdqa 64(%rbp),%ymm4
7334 vmovdqa 96(%rbp),%ymm8
7335 vmovdqa %ymm0,%ymm1
7336 vmovdqa %ymm4,%ymm5
7337 vmovdqa %ymm8,%ymm9
7338 vmovdqa .avx2_inc(%rip),%ymm12
7339 vpaddd 160(%rbp),%ymm12,%ymm13
7340 vpaddd %ymm13,%ymm12,%ymm12
7341 vmovdqa %ymm12,160(%rbp)
7342 vmovdqa %ymm13,192(%rbp)
7343
73441:
7345 addq 0(%rdi),%r10
7346 adcq 8+0(%rdi),%r11
7347 adcq $1,%r12
7348 movq 0+0(%rbp),%rax
7349 movq %rax,%r15
7350 mulq %r10
7351 movq %rax,%r13
7352 movq %rdx,%r14
7353 movq 0+0(%rbp),%rax
7354 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007355 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007356 addq %rax,%r14
7357 adcq %rdx,%r15
7358 movq 8+0(%rbp),%rax
7359 movq %rax,%r9
7360 mulq %r10
7361 addq %rax,%r14
7362 adcq $0,%rdx
7363 movq %rdx,%r10
7364 movq 8+0(%rbp),%rax
7365 mulq %r11
7366 addq %rax,%r15
7367 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007368 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007369 addq %r10,%r15
7370 adcq %rdx,%r9
7371 movq %r13,%r10
7372 movq %r14,%r11
7373 movq %r15,%r12
7374 andq $3,%r12
7375 movq %r15,%r13
7376 andq $-4,%r13
7377 movq %r9,%r14
7378 shrdq $2,%r9,%r15
7379 shrq $2,%r9
7380 addq %r13,%r10
7381 adcq %r14,%r11
7382 adcq $0,%r12
7383 addq %r15,%r10
7384 adcq %r9,%r11
7385 adcq $0,%r12
7386
7387 leaq 16(%rdi),%rdi
73882:
7389 vpaddd %ymm4,%ymm0,%ymm0
7390 vpxor %ymm0,%ymm12,%ymm12
7391 vpshufb .rol16(%rip),%ymm12,%ymm12
7392 vpaddd %ymm12,%ymm8,%ymm8
7393 vpxor %ymm8,%ymm4,%ymm4
7394 vpsrld $20,%ymm4,%ymm3
7395 vpslld $12,%ymm4,%ymm4
7396 vpxor %ymm3,%ymm4,%ymm4
7397 vpaddd %ymm4,%ymm0,%ymm0
7398 vpxor %ymm0,%ymm12,%ymm12
7399 vpshufb .rol8(%rip),%ymm12,%ymm12
7400 vpaddd %ymm12,%ymm8,%ymm8
7401 vpxor %ymm8,%ymm4,%ymm4
7402 vpslld $7,%ymm4,%ymm3
7403 vpsrld $25,%ymm4,%ymm4
7404 vpxor %ymm3,%ymm4,%ymm4
7405 vpalignr $12,%ymm12,%ymm12,%ymm12
7406 vpalignr $8,%ymm8,%ymm8,%ymm8
7407 vpalignr $4,%ymm4,%ymm4,%ymm4
7408 vpaddd %ymm5,%ymm1,%ymm1
7409 vpxor %ymm1,%ymm13,%ymm13
7410 vpshufb .rol16(%rip),%ymm13,%ymm13
7411 vpaddd %ymm13,%ymm9,%ymm9
7412 vpxor %ymm9,%ymm5,%ymm5
7413 vpsrld $20,%ymm5,%ymm3
7414 vpslld $12,%ymm5,%ymm5
7415 vpxor %ymm3,%ymm5,%ymm5
7416 vpaddd %ymm5,%ymm1,%ymm1
7417 vpxor %ymm1,%ymm13,%ymm13
7418 vpshufb .rol8(%rip),%ymm13,%ymm13
7419 vpaddd %ymm13,%ymm9,%ymm9
7420 vpxor %ymm9,%ymm5,%ymm5
7421 vpslld $7,%ymm5,%ymm3
7422 vpsrld $25,%ymm5,%ymm5
7423 vpxor %ymm3,%ymm5,%ymm5
7424 vpalignr $12,%ymm13,%ymm13,%ymm13
7425 vpalignr $8,%ymm9,%ymm9,%ymm9
7426 vpalignr $4,%ymm5,%ymm5,%ymm5
7427 addq 0(%rdi),%r10
7428 adcq 8+0(%rdi),%r11
7429 adcq $1,%r12
7430 movq 0+0(%rbp),%rax
7431 movq %rax,%r15
7432 mulq %r10
7433 movq %rax,%r13
7434 movq %rdx,%r14
7435 movq 0+0(%rbp),%rax
7436 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007437 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007438 addq %rax,%r14
7439 adcq %rdx,%r15
7440 movq 8+0(%rbp),%rax
7441 movq %rax,%r9
7442 mulq %r10
7443 addq %rax,%r14
7444 adcq $0,%rdx
7445 movq %rdx,%r10
7446 movq 8+0(%rbp),%rax
7447 mulq %r11
7448 addq %rax,%r15
7449 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007450 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007451 addq %r10,%r15
7452 adcq %rdx,%r9
7453 movq %r13,%r10
7454 movq %r14,%r11
7455 movq %r15,%r12
7456 andq $3,%r12
7457 movq %r15,%r13
7458 andq $-4,%r13
7459 movq %r9,%r14
7460 shrdq $2,%r9,%r15
7461 shrq $2,%r9
7462 addq %r13,%r10
7463 adcq %r14,%r11
7464 adcq $0,%r12
7465 addq %r15,%r10
7466 adcq %r9,%r11
7467 adcq $0,%r12
7468 vpaddd %ymm4,%ymm0,%ymm0
7469 vpxor %ymm0,%ymm12,%ymm12
7470 vpshufb .rol16(%rip),%ymm12,%ymm12
7471 vpaddd %ymm12,%ymm8,%ymm8
7472 vpxor %ymm8,%ymm4,%ymm4
7473 vpsrld $20,%ymm4,%ymm3
7474 vpslld $12,%ymm4,%ymm4
7475 vpxor %ymm3,%ymm4,%ymm4
7476 vpaddd %ymm4,%ymm0,%ymm0
7477 vpxor %ymm0,%ymm12,%ymm12
7478 vpshufb .rol8(%rip),%ymm12,%ymm12
7479 vpaddd %ymm12,%ymm8,%ymm8
7480 vpxor %ymm8,%ymm4,%ymm4
7481 vpslld $7,%ymm4,%ymm3
7482 vpsrld $25,%ymm4,%ymm4
7483 vpxor %ymm3,%ymm4,%ymm4
7484 vpalignr $4,%ymm12,%ymm12,%ymm12
7485 vpalignr $8,%ymm8,%ymm8,%ymm8
7486 vpalignr $12,%ymm4,%ymm4,%ymm4
7487 vpaddd %ymm5,%ymm1,%ymm1
7488 vpxor %ymm1,%ymm13,%ymm13
7489 vpshufb .rol16(%rip),%ymm13,%ymm13
7490 vpaddd %ymm13,%ymm9,%ymm9
7491 vpxor %ymm9,%ymm5,%ymm5
7492 vpsrld $20,%ymm5,%ymm3
7493 vpslld $12,%ymm5,%ymm5
7494 vpxor %ymm3,%ymm5,%ymm5
7495 vpaddd %ymm5,%ymm1,%ymm1
7496 vpxor %ymm1,%ymm13,%ymm13
7497 vpshufb .rol8(%rip),%ymm13,%ymm13
7498 vpaddd %ymm13,%ymm9,%ymm9
7499 vpxor %ymm9,%ymm5,%ymm5
7500 vpslld $7,%ymm5,%ymm3
7501 vpsrld $25,%ymm5,%ymm5
7502 vpxor %ymm3,%ymm5,%ymm5
7503 vpalignr $4,%ymm13,%ymm13,%ymm13
7504 vpalignr $8,%ymm9,%ymm9,%ymm9
7505 vpalignr $12,%ymm5,%ymm5,%ymm5
7506 addq 16(%rdi),%r10
7507 adcq 8+16(%rdi),%r11
7508 adcq $1,%r12
7509 movq 0+0(%rbp),%rax
7510 movq %rax,%r15
7511 mulq %r10
7512 movq %rax,%r13
7513 movq %rdx,%r14
7514 movq 0+0(%rbp),%rax
7515 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007516 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007517 addq %rax,%r14
7518 adcq %rdx,%r15
7519 movq 8+0(%rbp),%rax
7520 movq %rax,%r9
7521 mulq %r10
7522 addq %rax,%r14
7523 adcq $0,%rdx
7524 movq %rdx,%r10
7525 movq 8+0(%rbp),%rax
7526 mulq %r11
7527 addq %rax,%r15
7528 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007529 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007530 addq %r10,%r15
7531 adcq %rdx,%r9
7532 movq %r13,%r10
7533 movq %r14,%r11
7534 movq %r15,%r12
7535 andq $3,%r12
7536 movq %r15,%r13
7537 andq $-4,%r13
7538 movq %r9,%r14
7539 shrdq $2,%r9,%r15
7540 shrq $2,%r9
7541 addq %r13,%r10
7542 adcq %r14,%r11
7543 adcq $0,%r12
7544 addq %r15,%r10
7545 adcq %r9,%r11
7546 adcq $0,%r12
7547
7548 leaq 32(%rdi),%rdi
7549 decq %rcx
7550 jg 1b
7551 decq %r8
7552 jge 2b
7553 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
7554 vpaddd 64(%rbp),%ymm5,%ymm5
7555 vpaddd 96(%rbp),%ymm9,%ymm9
7556 vpaddd 192(%rbp),%ymm13,%ymm13
7557 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7558 vpaddd 64(%rbp),%ymm4,%ymm4
7559 vpaddd 96(%rbp),%ymm8,%ymm8
7560 vpaddd 160(%rbp),%ymm12,%ymm12
7561 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
7562 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
7563 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
7564 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
7565 vpxor 0+0(%rsi),%ymm3,%ymm3
7566 vpxor 32+0(%rsi),%ymm1,%ymm1
7567 vpxor 64+0(%rsi),%ymm5,%ymm5
7568 vpxor 96+0(%rsi),%ymm9,%ymm9
7569 vmovdqu %ymm3,0+0(%rdi)
7570 vmovdqu %ymm1,32+0(%rdi)
7571 vmovdqu %ymm5,64+0(%rdi)
7572 vmovdqu %ymm9,96+0(%rdi)
7573 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7574 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7575 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7576 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7577 vmovdqa %ymm3,%ymm8
7578
7579 movq $128,%rcx
7580 leaq 128(%rsi),%rsi
7581 subq $128,%rbx
7582 jmp seal_avx2_hash
# Tail-size dispatch: when %rbx is above 384 (unsigned compare) go to
# seal_avx2_tail_512; otherwise fall through into the 384 tail below.
75833:
7584 cmpq $384,%rbx
7585 ja seal_avx2_tail_512
7586
7587seal_avx2_tail_384:
# Build three ChaCha20 state sets: set 0 in ymm0/ymm4/ymm8/ymm12, set 1 in
# ymm1/ymm5/ymm9/ymm13 and set 2 in ymm2/ymm6/ymm10/ymm14. Row 0 is
# .chacha20_consts; rows 1 and 2 are loaded from 64(%rbp) and 96(%rbp)
# (presumably key rows stored by earlier code - not visible here, TODO confirm).
7588 vmovdqa .chacha20_consts(%rip),%ymm0
7589 vmovdqa 64(%rbp),%ymm4
7590 vmovdqa 96(%rbp),%ymm8
7591 vmovdqa %ymm0,%ymm1
7592 vmovdqa %ymm4,%ymm5
7593 vmovdqa %ymm8,%ymm9
7594 vmovdqa %ymm0,%ymm2
7595 vmovdqa %ymm4,%ymm6
7596 vmovdqa %ymm8,%ymm10
# Counter rows. .avx2_inc adds 2 to the first dword of each 128-bit lane.
# ymm14 = 160(%rbp) + inc (set 2), ymm13 = ymm14 + inc (set 1),
# ymm12 = ymm13 + inc (set 0). All three are saved (160/192/224 off %rbp) so
# they can be re-added after the rounds.
7597 vmovdqa .avx2_inc(%rip),%ymm12
7598 vpaddd 160(%rbp),%ymm12,%ymm14
7599 vpaddd %ymm14,%ymm12,%ymm13
7600 vpaddd %ymm13,%ymm12,%ymm12
7601 vmovdqa %ymm12,160(%rbp)
7602 vmovdqa %ymm13,192(%rbp)
7603 vmovdqa %ymm14,224(%rbp)
7604
# Local label 1: absorb one extra 16-byte block before the rounds.
# Accumulator r10:r11:r12 += the 16 bytes at 0(%rdi), plus 1 in the third limb
# (bit 128). The accumulator is then multiplied by the 128-bit value held at
# 0(%rbp)/8(%rbp) (product limbs r13,r14,r15,r9) and partially reduced modulo
# 2^130-5: the low 130 bits are kept and the bits above them are added back
# five times (4x through the -4 mask, 1x through the shift right by 2).
76051:
7606 addq 0(%rdi),%r10
7607 adcq 8+0(%rdi),%r11
7608 adcq $1,%r12
7609 movq 0+0(%rbp),%rax
7610 movq %rax,%r15
7611 mulq %r10
7612 movq %rax,%r13
7613 movq %rdx,%r14
7614 movq 0+0(%rbp),%rax
7615 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007616 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007617 addq %rax,%r14
7618 adcq %rdx,%r15
7619 movq 8+0(%rbp),%rax
7620 movq %rax,%r9
7621 mulq %r10
7622 addq %rax,%r14
7623 adcq $0,%rdx
7624 movq %rdx,%r10
7625 movq 8+0(%rbp),%rax
7626 mulq %r11
7627 addq %rax,%r15
7628 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007629 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007630 addq %r10,%r15
7631 adcq %rdx,%r9
# Reduction step described above.
7632 movq %r13,%r10
7633 movq %r14,%r11
7634 movq %r15,%r12
7635 andq $3,%r12
7636 movq %r15,%r13
7637 andq $-4,%r13
7638 movq %r9,%r14
7639 shrdq $2,%r9,%r15
7640 shrq $2,%r9
7641 addq %r13,%r10
7642 adcq %r14,%r11
7643 adcq $0,%r12
7644 addq %r15,%r10
7645 adcq %r9,%r11
7646 adcq $0,%r12
7647
7648 leaq 16(%rdi),%rdi
# Local label 2: one double round on all three state sets, interleaved with two
# more 16-byte hash blocks. ymm3 is scratch for the shift-based rotates; .rol16
# and .rol8 are byte-shuffle masks used for the 16-bit and 8-bit rotates.
# Set 0, first quarter-round pass (rows as loaded).
76492:
7650 vpaddd %ymm4,%ymm0,%ymm0
7651 vpxor %ymm0,%ymm12,%ymm12
7652 vpshufb .rol16(%rip),%ymm12,%ymm12
7653 vpaddd %ymm12,%ymm8,%ymm8
7654 vpxor %ymm8,%ymm4,%ymm4
7655 vpsrld $20,%ymm4,%ymm3
7656 vpslld $12,%ymm4,%ymm4
7657 vpxor %ymm3,%ymm4,%ymm4
7658 vpaddd %ymm4,%ymm0,%ymm0
7659 vpxor %ymm0,%ymm12,%ymm12
7660 vpshufb .rol8(%rip),%ymm12,%ymm12
7661 vpaddd %ymm12,%ymm8,%ymm8
7662 vpxor %ymm8,%ymm4,%ymm4
7663 vpslld $7,%ymm4,%ymm3
7664 vpsrld $25,%ymm4,%ymm4
7665 vpxor %ymm3,%ymm4,%ymm4
# Rotate rows 1..3 by 4/8/12 bytes within each 128-bit lane so that the next
# pass works on the diagonals.
7666 vpalignr $12,%ymm12,%ymm12,%ymm12
7667 vpalignr $8,%ymm8,%ymm8,%ymm8
7668 vpalignr $4,%ymm4,%ymm4,%ymm4
# Set 1, first quarter-round pass.
7669 vpaddd %ymm5,%ymm1,%ymm1
7670 vpxor %ymm1,%ymm13,%ymm13
7671 vpshufb .rol16(%rip),%ymm13,%ymm13
7672 vpaddd %ymm13,%ymm9,%ymm9
7673 vpxor %ymm9,%ymm5,%ymm5
7674 vpsrld $20,%ymm5,%ymm3
7675 vpslld $12,%ymm5,%ymm5
7676 vpxor %ymm3,%ymm5,%ymm5
7677 vpaddd %ymm5,%ymm1,%ymm1
7678 vpxor %ymm1,%ymm13,%ymm13
7679 vpshufb .rol8(%rip),%ymm13,%ymm13
7680 vpaddd %ymm13,%ymm9,%ymm9
7681 vpxor %ymm9,%ymm5,%ymm5
7682 vpslld $7,%ymm5,%ymm3
7683 vpsrld $25,%ymm5,%ymm5
7684 vpxor %ymm3,%ymm5,%ymm5
7685 vpalignr $12,%ymm13,%ymm13,%ymm13
7686 vpalignr $8,%ymm9,%ymm9,%ymm9
7687 vpalignr $4,%ymm5,%ymm5,%ymm5
# Hash block at 0(%rdi): same absorb, multiply and reduce sequence as at label 1.
7688 addq 0(%rdi),%r10
7689 adcq 8+0(%rdi),%r11
7690 adcq $1,%r12
7691 movq 0+0(%rbp),%rax
7692 movq %rax,%r15
7693 mulq %r10
7694 movq %rax,%r13
7695 movq %rdx,%r14
7696 movq 0+0(%rbp),%rax
7697 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007698 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007699 addq %rax,%r14
7700 adcq %rdx,%r15
7701 movq 8+0(%rbp),%rax
7702 movq %rax,%r9
7703 mulq %r10
7704 addq %rax,%r14
7705 adcq $0,%rdx
7706 movq %rdx,%r10
7707 movq 8+0(%rbp),%rax
7708 mulq %r11
7709 addq %rax,%r15
7710 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007711 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007712 addq %r10,%r15
7713 adcq %rdx,%r9
7714 movq %r13,%r10
7715 movq %r14,%r11
7716 movq %r15,%r12
7717 andq $3,%r12
7718 movq %r15,%r13
7719 andq $-4,%r13
7720 movq %r9,%r14
7721 shrdq $2,%r9,%r15
7722 shrq $2,%r9
7723 addq %r13,%r10
7724 adcq %r14,%r11
7725 adcq $0,%r12
7726 addq %r15,%r10
7727 adcq %r9,%r11
7728 adcq $0,%r12
# Set 2, first quarter-round pass.
7729 vpaddd %ymm6,%ymm2,%ymm2
7730 vpxor %ymm2,%ymm14,%ymm14
7731 vpshufb .rol16(%rip),%ymm14,%ymm14
7732 vpaddd %ymm14,%ymm10,%ymm10
7733 vpxor %ymm10,%ymm6,%ymm6
7734 vpsrld $20,%ymm6,%ymm3
7735 vpslld $12,%ymm6,%ymm6
7736 vpxor %ymm3,%ymm6,%ymm6
7737 vpaddd %ymm6,%ymm2,%ymm2
7738 vpxor %ymm2,%ymm14,%ymm14
7739 vpshufb .rol8(%rip),%ymm14,%ymm14
7740 vpaddd %ymm14,%ymm10,%ymm10
7741 vpxor %ymm10,%ymm6,%ymm6
7742 vpslld $7,%ymm6,%ymm3
7743 vpsrld $25,%ymm6,%ymm6
7744 vpxor %ymm3,%ymm6,%ymm6
7745 vpalignr $12,%ymm14,%ymm14,%ymm14
7746 vpalignr $8,%ymm10,%ymm10,%ymm10
7747 vpalignr $4,%ymm6,%ymm6,%ymm6
# Set 0, second quarter-round pass (diagonals), then undo the lane rotation.
7748 vpaddd %ymm4,%ymm0,%ymm0
7749 vpxor %ymm0,%ymm12,%ymm12
7750 vpshufb .rol16(%rip),%ymm12,%ymm12
7751 vpaddd %ymm12,%ymm8,%ymm8
7752 vpxor %ymm8,%ymm4,%ymm4
7753 vpsrld $20,%ymm4,%ymm3
7754 vpslld $12,%ymm4,%ymm4
7755 vpxor %ymm3,%ymm4,%ymm4
7756 vpaddd %ymm4,%ymm0,%ymm0
7757 vpxor %ymm0,%ymm12,%ymm12
7758 vpshufb .rol8(%rip),%ymm12,%ymm12
7759 vpaddd %ymm12,%ymm8,%ymm8
7760 vpxor %ymm8,%ymm4,%ymm4
7761 vpslld $7,%ymm4,%ymm3
7762 vpsrld $25,%ymm4,%ymm4
7763 vpxor %ymm3,%ymm4,%ymm4
7764 vpalignr $4,%ymm12,%ymm12,%ymm12
7765 vpalignr $8,%ymm8,%ymm8,%ymm8
7766 vpalignr $12,%ymm4,%ymm4,%ymm4
# Hash block at 16(%rdi): same absorb, multiply and reduce sequence.
7767 addq 16(%rdi),%r10
7768 adcq 8+16(%rdi),%r11
7769 adcq $1,%r12
7770 movq 0+0(%rbp),%rax
7771 movq %rax,%r15
7772 mulq %r10
7773 movq %rax,%r13
7774 movq %rdx,%r14
7775 movq 0+0(%rbp),%rax
7776 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007777 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007778 addq %rax,%r14
7779 adcq %rdx,%r15
7780 movq 8+0(%rbp),%rax
7781 movq %rax,%r9
7782 mulq %r10
7783 addq %rax,%r14
7784 adcq $0,%rdx
7785 movq %rdx,%r10
7786 movq 8+0(%rbp),%rax
7787 mulq %r11
7788 addq %rax,%r15
7789 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007790 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007791 addq %r10,%r15
7792 adcq %rdx,%r9
7793 movq %r13,%r10
7794 movq %r14,%r11
7795 movq %r15,%r12
7796 andq $3,%r12
7797 movq %r15,%r13
7798 andq $-4,%r13
7799 movq %r9,%r14
7800 shrdq $2,%r9,%r15
7801 shrq $2,%r9
7802 addq %r13,%r10
7803 adcq %r14,%r11
7804 adcq $0,%r12
7805 addq %r15,%r10
7806 adcq %r9,%r11
7807 adcq $0,%r12
# Set 1, second quarter-round pass (diagonals), then undo the lane rotation.
7808 vpaddd %ymm5,%ymm1,%ymm1
7809 vpxor %ymm1,%ymm13,%ymm13
7810 vpshufb .rol16(%rip),%ymm13,%ymm13
7811 vpaddd %ymm13,%ymm9,%ymm9
7812 vpxor %ymm9,%ymm5,%ymm5
7813 vpsrld $20,%ymm5,%ymm3
7814 vpslld $12,%ymm5,%ymm5
7815 vpxor %ymm3,%ymm5,%ymm5
7816 vpaddd %ymm5,%ymm1,%ymm1
7817 vpxor %ymm1,%ymm13,%ymm13
7818 vpshufb .rol8(%rip),%ymm13,%ymm13
7819 vpaddd %ymm13,%ymm9,%ymm9
7820 vpxor %ymm9,%ymm5,%ymm5
7821 vpslld $7,%ymm5,%ymm3
7822 vpsrld $25,%ymm5,%ymm5
7823 vpxor %ymm3,%ymm5,%ymm5
7824 vpalignr $4,%ymm13,%ymm13,%ymm13
7825 vpalignr $8,%ymm9,%ymm9,%ymm9
7826 vpalignr $12,%ymm5,%ymm5,%ymm5
# Set 2, second quarter-round pass (diagonals), then undo the lane rotation.
7827 vpaddd %ymm6,%ymm2,%ymm2
7828 vpxor %ymm2,%ymm14,%ymm14
7829 vpshufb .rol16(%rip),%ymm14,%ymm14
7830 vpaddd %ymm14,%ymm10,%ymm10
7831 vpxor %ymm10,%ymm6,%ymm6
7832 vpsrld $20,%ymm6,%ymm3
7833 vpslld $12,%ymm6,%ymm6
7834 vpxor %ymm3,%ymm6,%ymm6
7835 vpaddd %ymm6,%ymm2,%ymm2
7836 vpxor %ymm2,%ymm14,%ymm14
7837 vpshufb .rol8(%rip),%ymm14,%ymm14
7838 vpaddd %ymm14,%ymm10,%ymm10
7839 vpxor %ymm10,%ymm6,%ymm6
7840 vpslld $7,%ymm6,%ymm3
7841 vpsrld $25,%ymm6,%ymm6
7842 vpxor %ymm3,%ymm6,%ymm6
7843 vpalignr $4,%ymm14,%ymm14,%ymm14
7844 vpalignr $8,%ymm10,%ymm10,%ymm10
7845 vpalignr $12,%ymm6,%ymm6,%ymm6
7846
# Step %rdi past the two blocks hashed in this pass. Loop control: while the
# decremented %rcx is still positive go back to label 1 (three hash blocks per
# double round); afterwards keep repeating from label 2 (two hash blocks per
# double round) until the decremented %r8 becomes negative. The starting values
# of %rcx and %r8 are set by code outside this view.
7847 leaq 32(%rdi),%rdi
7848 decq %rcx
7849 jg 1b
7850 decq %r8
7851 jge 2b
# Feed-forward: add the starting rows back into all three sets (counter rows
# from the copies saved at 224, 192 and 160 off %rbp for sets 2, 1 and 0).
7852 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
7853 vpaddd 64(%rbp),%ymm6,%ymm6
7854 vpaddd 96(%rbp),%ymm10,%ymm10
7855 vpaddd 224(%rbp),%ymm14,%ymm14
7856 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
7857 vpaddd 64(%rbp),%ymm5,%ymm5
7858 vpaddd 96(%rbp),%ymm9,%ymm9
7859 vpaddd 192(%rbp),%ymm13,%ymm13
7860 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7861 vpaddd 64(%rbp),%ymm4,%ymm4
7862 vpaddd 96(%rbp),%ymm8,%ymm8
7863 vpaddd 160(%rbp),%ymm12,%ymm12
# Regroup set 2 by 128-bit lane (selector 0x02 pairs the low lanes of the two
# sources, 0x13 pairs the high lanes), XOR with bytes 0..127 at (%rsi) and
# store to bytes 0..127 at (%rdi).
7864 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
7865 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
7866 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
7867 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
7868 vpxor 0+0(%rsi),%ymm3,%ymm3
7869 vpxor 32+0(%rsi),%ymm2,%ymm2
7870 vpxor 64+0(%rsi),%ymm6,%ymm6
7871 vpxor 96+0(%rsi),%ymm10,%ymm10
7872 vmovdqu %ymm3,0+0(%rdi)
7873 vmovdqu %ymm2,32+0(%rdi)
7874 vmovdqu %ymm6,64+0(%rdi)
7875 vmovdqu %ymm10,96+0(%rdi)
# Same for set 1 and bytes 128..255.
7876 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
7877 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
7878 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
7879 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
7880 vpxor 0+128(%rsi),%ymm3,%ymm3
7881 vpxor 32+128(%rsi),%ymm1,%ymm1
7882 vpxor 64+128(%rsi),%ymm5,%ymm5
7883 vpxor 96+128(%rsi),%ymm9,%ymm9
7884 vmovdqu %ymm3,0+128(%rdi)
7885 vmovdqu %ymm1,32+128(%rdi)
7886 vmovdqu %ymm5,64+128(%rdi)
7887 vmovdqu %ymm9,96+128(%rdi)
# Regroup set 0 the same way but leave it in ymm0, ymm4, ymm8, ymm12 (ymm3 is a
# temporary for the value that ends up in ymm8) for the code jumped to below.
7888 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7889 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7890 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7891 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7892 vmovdqa %ymm3,%ymm8
7893
# %rcx = 256 (the byte count just stored at (%rdi); presumably what
# seal_avx2_hash still has to hash - TODO confirm), advance %rsi by 256 and
# take 256 off %rbx.
7894 movq $256,%rcx
7895 leaq 256(%rsi),%rsi
7896 subq $256,%rbx
7897 jmp seal_avx2_hash
7898
7899seal_avx2_tail_512:
# Build four ChaCha20 state sets: set N uses ymmN (row 0), ymm(4+N) (row 1),
# ymm(8+N) (row 2) and ymm(12+N) (row 3) for N = 0..3. Row 0 is
# .chacha20_consts; rows 1 and 2 are loaded from 64(%rbp) and 96(%rbp)
# (presumably key rows stored by earlier code - not visible here, TODO confirm).
7900 vmovdqa .chacha20_consts(%rip),%ymm0
7901 vmovdqa 64(%rbp),%ymm4
7902 vmovdqa 96(%rbp),%ymm8
7903 vmovdqa %ymm0,%ymm1
7904 vmovdqa %ymm4,%ymm5
7905 vmovdqa %ymm8,%ymm9
7906 vmovdqa %ymm0,%ymm2
7907 vmovdqa %ymm4,%ymm6
7908 vmovdqa %ymm8,%ymm10
7909 vmovdqa %ymm0,%ymm3
7910 vmovdqa %ymm4,%ymm7
7911 vmovdqa %ymm8,%ymm11
# Counter rows. .avx2_inc adds 2 to the first dword of each 128-bit lane.
# ymm15 = 160(%rbp) + inc (set 3), then ymm14, ymm13, ymm12 each add one more
# inc (sets 2, 1, 0). All four are saved (256/224/192/160 off %rbp) so they can
# be re-added after the rounds.
7912 vmovdqa .avx2_inc(%rip),%ymm12
7913 vpaddd 160(%rbp),%ymm12,%ymm15
7914 vpaddd %ymm15,%ymm12,%ymm14
7915 vpaddd %ymm14,%ymm12,%ymm13
7916 vpaddd %ymm13,%ymm12,%ymm12
7917 vmovdqa %ymm15,256(%rbp)
7918 vmovdqa %ymm14,224(%rbp)
7919 vmovdqa %ymm13,192(%rbp)
7920 vmovdqa %ymm12,160(%rbp)
7921
# Local label 1: absorb one extra 16-byte block before the rounds.
# Accumulator r10:r11:r12 += the 16 bytes at 0(%rdi), plus 1 in the third limb
# (bit 128). The accumulator is then multiplied by the 128-bit value held at
# 0(%rbp)/8(%rbp) using mulx (implicit multiplicand in %rdx, flags untouched),
# giving product limbs r13,r14,r15,r9, and partially reduced modulo 2^130-5:
# the low 130 bits are kept and the bits above them are added back five times
# (4x through the -4 mask, 1x through the shift right by 2).
79221:
7923 addq 0(%rdi),%r10
7924 adcq 8+0(%rdi),%r11
7925 adcq $1,%r12
7926 movq 0+0(%rbp),%rdx
7927 movq %rdx,%r15
7928 mulxq %r10,%r13,%r14
7929 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007930 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007931 addq %rax,%r14
7932 adcq %rdx,%r15
7933 movq 8+0(%rbp),%rdx
7934 mulxq %r10,%r10,%rax
7935 addq %r10,%r14
7936 mulxq %r11,%r11,%r9
7937 adcq %r11,%r15
7938 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08007939 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05007940 addq %rax,%r15
7941 adcq %rdx,%r9
# Reduction step described above.
7942 movq %r13,%r10
7943 movq %r14,%r11
7944 movq %r15,%r12
7945 andq $3,%r12
7946 movq %r15,%r13
7947 andq $-4,%r13
7948 movq %r9,%r14
7949 shrdq $2,%r9,%r15
7950 shrq $2,%r9
7951 addq %r13,%r10
7952 adcq %r14,%r11
7953 adcq $0,%r12
7954 addq %r15,%r10
7955 adcq %r9,%r11
7956 adcq $0,%r12
7957
7958 leaq 16(%rdi),%rdi
# Local label 2: one double round on all four state sets at once, with the
# scalar hash of two more 16-byte blocks (0(%rdi) and 16(%rdi)) scattered
# between the vector instructions. All sixteen ymm registers hold state, so
# ymm8 is spilled to 128(%rbp) whenever it is needed as scratch (for the
# .rol16/.rol8 shuffle masks and for the shift temporaries) and reloaded after.
# First quarter-round pass (rows as loaded).
79592:
7960 vmovdqa %ymm8,128(%rbp)
7961 vmovdqa .rol16(%rip),%ymm8
7962 vpaddd %ymm7,%ymm3,%ymm3
7963 vpaddd %ymm6,%ymm2,%ymm2
7964 vpaddd %ymm5,%ymm1,%ymm1
7965 vpaddd %ymm4,%ymm0,%ymm0
7966 vpxor %ymm3,%ymm15,%ymm15
7967 vpxor %ymm2,%ymm14,%ymm14
7968 vpxor %ymm1,%ymm13,%ymm13
7969 vpxor %ymm0,%ymm12,%ymm12
7970 vpshufb %ymm8,%ymm15,%ymm15
7971 vpshufb %ymm8,%ymm14,%ymm14
7972 vpshufb %ymm8,%ymm13,%ymm13
7973 vpshufb %ymm8,%ymm12,%ymm12
7974 vmovdqa 128(%rbp),%ymm8
7975 vpaddd %ymm15,%ymm11,%ymm11
7976 vpaddd %ymm14,%ymm10,%ymm10
7977 vpaddd %ymm13,%ymm9,%ymm9
7978 vpaddd %ymm12,%ymm8,%ymm8
7979 vpxor %ymm11,%ymm7,%ymm7
# Hash, block at 0(%rdi): add the 16 bytes (plus bit 128) to the accumulator.
7980 addq 0(%rdi),%r10
7981 adcq 8+0(%rdi),%r11
7982 adcq $1,%r12
7983 vpxor %ymm10,%ymm6,%ymm6
7984 vpxor %ymm9,%ymm5,%ymm5
7985 vpxor %ymm8,%ymm4,%ymm4
7986 vmovdqa %ymm8,128(%rbp)
7987 vpsrld $20,%ymm7,%ymm8
7988 vpslld $32-20,%ymm7,%ymm7
7989 vpxor %ymm8,%ymm7,%ymm7
7990 vpsrld $20,%ymm6,%ymm8
7991 vpslld $32-20,%ymm6,%ymm6
7992 vpxor %ymm8,%ymm6,%ymm6
7993 vpsrld $20,%ymm5,%ymm8
7994 vpslld $32-20,%ymm5,%ymm5
7995 vpxor %ymm8,%ymm5,%ymm5
7996 vpsrld $20,%ymm4,%ymm8
7997 vpslld $32-20,%ymm4,%ymm4
7998 vpxor %ymm8,%ymm4,%ymm4
7999 vmovdqa .rol8(%rip),%ymm8
8000 vpaddd %ymm7,%ymm3,%ymm3
8001 vpaddd %ymm6,%ymm2,%ymm2
8002 vpaddd %ymm5,%ymm1,%ymm1
# Hash: multiply by the qword at 0(%rbp).
8003 movq 0+0(%rbp),%rdx
8004 movq %rdx,%r15
8005 mulxq %r10,%r13,%r14
8006 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008007 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008008 addq %rax,%r14
8009 adcq %rdx,%r15
8010 vpaddd %ymm4,%ymm0,%ymm0
8011 vpxor %ymm3,%ymm15,%ymm15
8012 vpxor %ymm2,%ymm14,%ymm14
8013 vpxor %ymm1,%ymm13,%ymm13
8014 vpxor %ymm0,%ymm12,%ymm12
8015 vpshufb %ymm8,%ymm15,%ymm15
8016 vpshufb %ymm8,%ymm14,%ymm14
8017 vpshufb %ymm8,%ymm13,%ymm13
8018 vpshufb %ymm8,%ymm12,%ymm12
8019 vmovdqa 128(%rbp),%ymm8
8020 vpaddd %ymm15,%ymm11,%ymm11
8021 vpaddd %ymm14,%ymm10,%ymm10
8022 vpaddd %ymm13,%ymm9,%ymm9
8023 vpaddd %ymm12,%ymm8,%ymm8
8024 vpxor %ymm11,%ymm7,%ymm7
8025 vpxor %ymm10,%ymm6,%ymm6
8026 vpxor %ymm9,%ymm5,%ymm5
8027 vpxor %ymm8,%ymm4,%ymm4
8028 vmovdqa %ymm8,128(%rbp)
8029 vpsrld $25,%ymm7,%ymm8
# Hash: multiply by the qword at 8(%rbp). The add/adc chain started here is
# completed further down; the vector instructions in between do not write flags.
8030 movq 8+0(%rbp),%rdx
8031 mulxq %r10,%r10,%rax
8032 addq %r10,%r14
8033 mulxq %r11,%r11,%r9
8034 adcq %r11,%r15
8035 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08008036 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05008037 vpslld $32-25,%ymm7,%ymm7
8038 vpxor %ymm8,%ymm7,%ymm7
8039 vpsrld $25,%ymm6,%ymm8
8040 vpslld $32-25,%ymm6,%ymm6
8041 vpxor %ymm8,%ymm6,%ymm6
8042 vpsrld $25,%ymm5,%ymm8
8043 vpslld $32-25,%ymm5,%ymm5
8044 vpxor %ymm8,%ymm5,%ymm5
8045 vpsrld $25,%ymm4,%ymm8
8046 vpslld $32-25,%ymm4,%ymm4
8047 vpxor %ymm8,%ymm4,%ymm4
8048 vmovdqa 128(%rbp),%ymm8
# Rotate rows 1..3 of every set by 4/8/12 bytes within each 128-bit lane so
# that the second pass works on the diagonals.
8049 vpalignr $4,%ymm7,%ymm7,%ymm7
8050 vpalignr $8,%ymm11,%ymm11,%ymm11
8051 vpalignr $12,%ymm15,%ymm15,%ymm15
8052 vpalignr $4,%ymm6,%ymm6,%ymm6
8053 vpalignr $8,%ymm10,%ymm10,%ymm10
8054 vpalignr $12,%ymm14,%ymm14,%ymm14
8055 vpalignr $4,%ymm5,%ymm5,%ymm5
8056 vpalignr $8,%ymm9,%ymm9,%ymm9
8057 addq %rax,%r15
8058 adcq %rdx,%r9
8059 vpalignr $12,%ymm13,%ymm13,%ymm13
8060 vpalignr $4,%ymm4,%ymm4,%ymm4
8061 vpalignr $8,%ymm8,%ymm8,%ymm8
8062 vpalignr $12,%ymm12,%ymm12,%ymm12
# Second quarter-round pass (diagonals).
8063 vmovdqa %ymm8,128(%rbp)
8064 vmovdqa .rol16(%rip),%ymm8
8065 vpaddd %ymm7,%ymm3,%ymm3
8066 vpaddd %ymm6,%ymm2,%ymm2
8067 vpaddd %ymm5,%ymm1,%ymm1
8068 vpaddd %ymm4,%ymm0,%ymm0
8069 vpxor %ymm3,%ymm15,%ymm15
8070 vpxor %ymm2,%ymm14,%ymm14
8071 vpxor %ymm1,%ymm13,%ymm13
8072 vpxor %ymm0,%ymm12,%ymm12
8073 vpshufb %ymm8,%ymm15,%ymm15
8074 vpshufb %ymm8,%ymm14,%ymm14
8075 vpshufb %ymm8,%ymm13,%ymm13
8076 vpshufb %ymm8,%ymm12,%ymm12
8077 vmovdqa 128(%rbp),%ymm8
8078 vpaddd %ymm15,%ymm11,%ymm11
# Hash: reduction of the first block (same scheme as at label 1).
8079 movq %r13,%r10
8080 movq %r14,%r11
8081 movq %r15,%r12
8082 andq $3,%r12
8083 movq %r15,%r13
8084 andq $-4,%r13
8085 movq %r9,%r14
8086 shrdq $2,%r9,%r15
8087 shrq $2,%r9
8088 addq %r13,%r10
8089 adcq %r14,%r11
8090 adcq $0,%r12
8091 addq %r15,%r10
8092 adcq %r9,%r11
8093 adcq $0,%r12
8094 vpaddd %ymm14,%ymm10,%ymm10
8095 vpaddd %ymm13,%ymm9,%ymm9
8096 vpaddd %ymm12,%ymm8,%ymm8
8097 vpxor %ymm11,%ymm7,%ymm7
8098 vpxor %ymm10,%ymm6,%ymm6
8099 vpxor %ymm9,%ymm5,%ymm5
8100 vpxor %ymm8,%ymm4,%ymm4
8101 vmovdqa %ymm8,128(%rbp)
8102 vpsrld $20,%ymm7,%ymm8
8103 vpslld $32-20,%ymm7,%ymm7
8104 vpxor %ymm8,%ymm7,%ymm7
8105 vpsrld $20,%ymm6,%ymm8
8106 vpslld $32-20,%ymm6,%ymm6
8107 vpxor %ymm8,%ymm6,%ymm6
8108 vpsrld $20,%ymm5,%ymm8
8109 vpslld $32-20,%ymm5,%ymm5
8110 vpxor %ymm8,%ymm5,%ymm5
8111 vpsrld $20,%ymm4,%ymm8
8112 vpslld $32-20,%ymm4,%ymm4
8113 vpxor %ymm8,%ymm4,%ymm4
# Hash, block at 16(%rdi): add the 16 bytes (plus bit 128) to the accumulator.
8114 addq 16(%rdi),%r10
8115 adcq 8+16(%rdi),%r11
8116 adcq $1,%r12
8117 vmovdqa .rol8(%rip),%ymm8
8118 vpaddd %ymm7,%ymm3,%ymm3
8119 vpaddd %ymm6,%ymm2,%ymm2
8120 vpaddd %ymm5,%ymm1,%ymm1
8121 vpaddd %ymm4,%ymm0,%ymm0
8122 vpxor %ymm3,%ymm15,%ymm15
8123 vpxor %ymm2,%ymm14,%ymm14
8124 vpxor %ymm1,%ymm13,%ymm13
8125 vpxor %ymm0,%ymm12,%ymm12
8126 vpshufb %ymm8,%ymm15,%ymm15
8127 vpshufb %ymm8,%ymm14,%ymm14
8128 vpshufb %ymm8,%ymm13,%ymm13
8129 vpshufb %ymm8,%ymm12,%ymm12
8130 vmovdqa 128(%rbp),%ymm8
8131 vpaddd %ymm15,%ymm11,%ymm11
8132 vpaddd %ymm14,%ymm10,%ymm10
8133 vpaddd %ymm13,%ymm9,%ymm9
8134 vpaddd %ymm12,%ymm8,%ymm8
8135 vpxor %ymm11,%ymm7,%ymm7
8136 vpxor %ymm10,%ymm6,%ymm6
# Hash: multiply by the qword at 0(%rbp).
8137 movq 0+0(%rbp),%rdx
8138 movq %rdx,%r15
8139 mulxq %r10,%r13,%r14
8140 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008141 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008142 addq %rax,%r14
8143 adcq %rdx,%r15
8144 vpxor %ymm9,%ymm5,%ymm5
8145 vpxor %ymm8,%ymm4,%ymm4
8146 vmovdqa %ymm8,128(%rbp)
8147 vpsrld $25,%ymm7,%ymm8
8148 vpslld $32-25,%ymm7,%ymm7
8149 vpxor %ymm8,%ymm7,%ymm7
8150 vpsrld $25,%ymm6,%ymm8
8151 vpslld $32-25,%ymm6,%ymm6
8152 vpxor %ymm8,%ymm6,%ymm6
8153 vpsrld $25,%ymm5,%ymm8
8154 vpslld $32-25,%ymm5,%ymm5
8155 vpxor %ymm8,%ymm5,%ymm5
8156 vpsrld $25,%ymm4,%ymm8
8157 vpslld $32-25,%ymm4,%ymm4
8158 vpxor %ymm8,%ymm4,%ymm4
8159 vmovdqa 128(%rbp),%ymm8
# Undo the lane rotation (12/8/4 bytes for rows 1..3 of every set).
8160 vpalignr $12,%ymm7,%ymm7,%ymm7
8161 vpalignr $8,%ymm11,%ymm11,%ymm11
8162 vpalignr $4,%ymm15,%ymm15,%ymm15
8163 vpalignr $12,%ymm6,%ymm6,%ymm6
# Hash: multiply by the qword at 8(%rbp); chain completed after the vpalignr run.
8164 movq 8+0(%rbp),%rdx
8165 mulxq %r10,%r10,%rax
8166 addq %r10,%r14
8167 mulxq %r11,%r11,%r9
8168 adcq %r11,%r15
8169 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08008170 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05008171 vpalignr $8,%ymm10,%ymm10,%ymm10
8172 vpalignr $4,%ymm14,%ymm14,%ymm14
8173 vpalignr $12,%ymm5,%ymm5,%ymm5
8174 vpalignr $8,%ymm9,%ymm9,%ymm9
8175 vpalignr $4,%ymm13,%ymm13,%ymm13
8176 vpalignr $12,%ymm4,%ymm4,%ymm4
8177 vpalignr $8,%ymm8,%ymm8,%ymm8
8178 vpalignr $4,%ymm12,%ymm12,%ymm12
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191 addq %rax,%r15
8192 adcq %rdx,%r9
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
# Hash: reduction of the second block.
8213 movq %r13,%r10
8214 movq %r14,%r11
8215 movq %r15,%r12
8216 andq $3,%r12
8217 movq %r15,%r13
8218 andq $-4,%r13
8219 movq %r9,%r14
8220 shrdq $2,%r9,%r15
8221 shrq $2,%r9
8222 addq %r13,%r10
8223 adcq %r14,%r11
8224 adcq $0,%r12
8225 addq %r15,%r10
8226 adcq %r9,%r11
8227 adcq $0,%r12
8228
# Step %rdi past the two blocks hashed in this pass. Loop control: while the
# decremented %rcx is still positive go back to label 1 (three hash blocks per
# double round); afterwards keep repeating from label 2 (two hash blocks per
# double round) until the decremented %r8 becomes negative. The starting values
# of %rcx and %r8 are set by code outside this view.
8229 leaq 32(%rdi),%rdi
8230 decq %rcx
8231 jg 1b
8232 decq %r8
8233 jge 2b
# Feed-forward: add the starting rows back into all four sets (counter rows
# from the copies saved at 256, 224, 192 and 160 off %rbp for sets 3..0).
8234 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
8235 vpaddd 64(%rbp),%ymm7,%ymm7
8236 vpaddd 96(%rbp),%ymm11,%ymm11
8237 vpaddd 256(%rbp),%ymm15,%ymm15
8238 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
8239 vpaddd 64(%rbp),%ymm6,%ymm6
8240 vpaddd 96(%rbp),%ymm10,%ymm10
8241 vpaddd 224(%rbp),%ymm14,%ymm14
8242 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
8243 vpaddd 64(%rbp),%ymm5,%ymm5
8244 vpaddd 96(%rbp),%ymm9,%ymm9
8245 vpaddd 192(%rbp),%ymm13,%ymm13
8246 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
8247 vpaddd 64(%rbp),%ymm4,%ymm4
8248 vpaddd 96(%rbp),%ymm8,%ymm8
8249 vpaddd 160(%rbp),%ymm12,%ymm12
8250
# Park ymm0 at 128(%rbp) so it can serve as a temporary, then regroup set 3 by
# 128-bit lane (selector 0x02 pairs the low lanes of the two sources, 0x13
# pairs the high lanes), XOR with bytes 0..127 at (%rsi) and store to bytes
# 0..127 at (%rdi).
8251 vmovdqa %ymm0,128(%rbp)
8252 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
8253 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
8254 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
8255 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
8256 vpxor 0+0(%rsi),%ymm0,%ymm0
8257 vpxor 32+0(%rsi),%ymm3,%ymm3
8258 vpxor 64+0(%rsi),%ymm7,%ymm7
8259 vpxor 96+0(%rsi),%ymm11,%ymm11
8260 vmovdqu %ymm0,0+0(%rdi)
8261 vmovdqu %ymm3,32+0(%rdi)
8262 vmovdqu %ymm7,64+0(%rdi)
8263 vmovdqu %ymm11,96+0(%rdi)
8264
# Restore ymm0; ymm3 is the temporary from here on. Set 2 covers bytes 128..255.
8265 vmovdqa 128(%rbp),%ymm0
8266 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
8267 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
8268 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
8269 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
8270 vpxor 0+128(%rsi),%ymm3,%ymm3
8271 vpxor 32+128(%rsi),%ymm2,%ymm2
8272 vpxor 64+128(%rsi),%ymm6,%ymm6
8273 vpxor 96+128(%rsi),%ymm10,%ymm10
8274 vmovdqu %ymm3,0+128(%rdi)
8275 vmovdqu %ymm2,32+128(%rdi)
8276 vmovdqu %ymm6,64+128(%rdi)
8277 vmovdqu %ymm10,96+128(%rdi)
# Set 1 covers bytes 256..383.
8278 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
8279 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
8280 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
8281 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
8282 vpxor 0+256(%rsi),%ymm3,%ymm3
8283 vpxor 32+256(%rsi),%ymm1,%ymm1
8284 vpxor 64+256(%rsi),%ymm5,%ymm5
8285 vpxor 96+256(%rsi),%ymm9,%ymm9
8286 vmovdqu %ymm3,0+256(%rdi)
8287 vmovdqu %ymm1,32+256(%rdi)
8288 vmovdqu %ymm5,64+256(%rdi)
8289 vmovdqu %ymm9,96+256(%rdi)
# Regroup set 0 the same way but leave it in ymm0, ymm4, ymm8, ymm12 (ymm3 is a
# temporary for the value that ends up in ymm8) for the code jumped to below.
8290 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
8291 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
8292 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
8293 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
8294 vmovdqa %ymm3,%ymm8
8295
# %rcx = 384 (the byte count just stored at (%rdi); presumably what
# seal_avx2_hash still has to hash - TODO confirm), advance %rsi by 384 and
# take 384 off %rbx.
8296 movq $384,%rcx
8297 leaq 384(%rsi),%rsi
8298 subq $384,%rbx
8299 jmp seal_avx2_hash
8300
8300
8301seal_avx2_320:
# Entered with one ChaCha20 state set in ymm0/ymm4/ymm8/ymm12 (prepared by code
# outside this view). Clone it into set 1 (ymm1/ymm5/ymm9/ymm13) and set 2
# (ymm2/ymm6/ymm10/ymm14), bumping the counter row by .avx2_inc for each clone.
8302 vmovdqa %ymm0,%ymm1
8303 vmovdqa %ymm0,%ymm2
8304 vmovdqa %ymm4,%ymm5
8305 vmovdqa %ymm4,%ymm6
8306 vmovdqa %ymm8,%ymm9
8307 vmovdqa %ymm8,%ymm10
8308 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
8309 vpaddd .avx2_inc(%rip),%ymm13,%ymm14
# Keep the starting rows 1 and 2 in ymm7/ymm11 and the three starting counter
# rows at 160/192/224 off %rbp; they are added back after the rounds.
8310 vmovdqa %ymm4,%ymm7
8311 vmovdqa %ymm8,%ymm11
8312 vmovdqa %ymm12,160(%rbp)
8313 vmovdqa %ymm13,192(%rbp)
8314 vmovdqa %ymm14,224(%rbp)
# 10 passes of the double round below (%r10 is the loop counter).
8315 movq $10,%r10
# Each pass: first quarter-round pass on sets 0, 1, 2 (each followed by a
# 4/8/12-byte lane rotation of rows 1..3 to line up the diagonals), then the
# second pass on sets 0, 1, 2 (each followed by the inverse rotation).
# ymm3 is scratch for the shift-based rotates; .rol16 and .rol8 are
# byte-shuffle masks used for the 16-bit and 8-bit rotates.
83161:
8317 vpaddd %ymm4,%ymm0,%ymm0
8318 vpxor %ymm0,%ymm12,%ymm12
8319 vpshufb .rol16(%rip),%ymm12,%ymm12
8320 vpaddd %ymm12,%ymm8,%ymm8
8321 vpxor %ymm8,%ymm4,%ymm4
8322 vpsrld $20,%ymm4,%ymm3
8323 vpslld $12,%ymm4,%ymm4
8324 vpxor %ymm3,%ymm4,%ymm4
8325 vpaddd %ymm4,%ymm0,%ymm0
8326 vpxor %ymm0,%ymm12,%ymm12
8327 vpshufb .rol8(%rip),%ymm12,%ymm12
8328 vpaddd %ymm12,%ymm8,%ymm8
8329 vpxor %ymm8,%ymm4,%ymm4
8330 vpslld $7,%ymm4,%ymm3
8331 vpsrld $25,%ymm4,%ymm4
8332 vpxor %ymm3,%ymm4,%ymm4
8333 vpalignr $12,%ymm12,%ymm12,%ymm12
8334 vpalignr $8,%ymm8,%ymm8,%ymm8
8335 vpalignr $4,%ymm4,%ymm4,%ymm4
# Set 1, first pass.
8336 vpaddd %ymm5,%ymm1,%ymm1
8337 vpxor %ymm1,%ymm13,%ymm13
8338 vpshufb .rol16(%rip),%ymm13,%ymm13
8339 vpaddd %ymm13,%ymm9,%ymm9
8340 vpxor %ymm9,%ymm5,%ymm5
8341 vpsrld $20,%ymm5,%ymm3
8342 vpslld $12,%ymm5,%ymm5
8343 vpxor %ymm3,%ymm5,%ymm5
8344 vpaddd %ymm5,%ymm1,%ymm1
8345 vpxor %ymm1,%ymm13,%ymm13
8346 vpshufb .rol8(%rip),%ymm13,%ymm13
8347 vpaddd %ymm13,%ymm9,%ymm9
8348 vpxor %ymm9,%ymm5,%ymm5
8349 vpslld $7,%ymm5,%ymm3
8350 vpsrld $25,%ymm5,%ymm5
8351 vpxor %ymm3,%ymm5,%ymm5
8352 vpalignr $12,%ymm13,%ymm13,%ymm13
8353 vpalignr $8,%ymm9,%ymm9,%ymm9
8354 vpalignr $4,%ymm5,%ymm5,%ymm5
# Set 2, first pass.
8355 vpaddd %ymm6,%ymm2,%ymm2
8356 vpxor %ymm2,%ymm14,%ymm14
8357 vpshufb .rol16(%rip),%ymm14,%ymm14
8358 vpaddd %ymm14,%ymm10,%ymm10
8359 vpxor %ymm10,%ymm6,%ymm6
8360 vpsrld $20,%ymm6,%ymm3
8361 vpslld $12,%ymm6,%ymm6
8362 vpxor %ymm3,%ymm6,%ymm6
8363 vpaddd %ymm6,%ymm2,%ymm2
8364 vpxor %ymm2,%ymm14,%ymm14
8365 vpshufb .rol8(%rip),%ymm14,%ymm14
8366 vpaddd %ymm14,%ymm10,%ymm10
8367 vpxor %ymm10,%ymm6,%ymm6
8368 vpslld $7,%ymm6,%ymm3
8369 vpsrld $25,%ymm6,%ymm6
8370 vpxor %ymm3,%ymm6,%ymm6
8371 vpalignr $12,%ymm14,%ymm14,%ymm14
8372 vpalignr $8,%ymm10,%ymm10,%ymm10
8373 vpalignr $4,%ymm6,%ymm6,%ymm6
# Set 0, second pass (diagonals).
8374 vpaddd %ymm4,%ymm0,%ymm0
8375 vpxor %ymm0,%ymm12,%ymm12
8376 vpshufb .rol16(%rip),%ymm12,%ymm12
8377 vpaddd %ymm12,%ymm8,%ymm8
8378 vpxor %ymm8,%ymm4,%ymm4
8379 vpsrld $20,%ymm4,%ymm3
8380 vpslld $12,%ymm4,%ymm4
8381 vpxor %ymm3,%ymm4,%ymm4
8382 vpaddd %ymm4,%ymm0,%ymm0
8383 vpxor %ymm0,%ymm12,%ymm12
8384 vpshufb .rol8(%rip),%ymm12,%ymm12
8385 vpaddd %ymm12,%ymm8,%ymm8
8386 vpxor %ymm8,%ymm4,%ymm4
8387 vpslld $7,%ymm4,%ymm3
8388 vpsrld $25,%ymm4,%ymm4
8389 vpxor %ymm3,%ymm4,%ymm4
8390 vpalignr $4,%ymm12,%ymm12,%ymm12
8391 vpalignr $8,%ymm8,%ymm8,%ymm8
8392 vpalignr $12,%ymm4,%ymm4,%ymm4
# Set 1, second pass (diagonals).
8393 vpaddd %ymm5,%ymm1,%ymm1
8394 vpxor %ymm1,%ymm13,%ymm13
8395 vpshufb .rol16(%rip),%ymm13,%ymm13
8396 vpaddd %ymm13,%ymm9,%ymm9
8397 vpxor %ymm9,%ymm5,%ymm5
8398 vpsrld $20,%ymm5,%ymm3
8399 vpslld $12,%ymm5,%ymm5
8400 vpxor %ymm3,%ymm5,%ymm5
8401 vpaddd %ymm5,%ymm1,%ymm1
8402 vpxor %ymm1,%ymm13,%ymm13
8403 vpshufb .rol8(%rip),%ymm13,%ymm13
8404 vpaddd %ymm13,%ymm9,%ymm9
8405 vpxor %ymm9,%ymm5,%ymm5
8406 vpslld $7,%ymm5,%ymm3
8407 vpsrld $25,%ymm5,%ymm5
8408 vpxor %ymm3,%ymm5,%ymm5
8409 vpalignr $4,%ymm13,%ymm13,%ymm13
8410 vpalignr $8,%ymm9,%ymm9,%ymm9
8411 vpalignr $12,%ymm5,%ymm5,%ymm5
# Set 2, second pass (diagonals).
8412 vpaddd %ymm6,%ymm2,%ymm2
8413 vpxor %ymm2,%ymm14,%ymm14
8414 vpshufb .rol16(%rip),%ymm14,%ymm14
8415 vpaddd %ymm14,%ymm10,%ymm10
8416 vpxor %ymm10,%ymm6,%ymm6
8417 vpsrld $20,%ymm6,%ymm3
8418 vpslld $12,%ymm6,%ymm6
8419 vpxor %ymm3,%ymm6,%ymm6
8420 vpaddd %ymm6,%ymm2,%ymm2
8421 vpxor %ymm2,%ymm14,%ymm14
8422 vpshufb .rol8(%rip),%ymm14,%ymm14
8423 vpaddd %ymm14,%ymm10,%ymm10
8424 vpxor %ymm10,%ymm6,%ymm6
8425 vpslld $7,%ymm6,%ymm3
8426 vpsrld $25,%ymm6,%ymm6
8427 vpxor %ymm3,%ymm6,%ymm6
8428 vpalignr $4,%ymm14,%ymm14,%ymm14
8429 vpalignr $8,%ymm10,%ymm10,%ymm10
8430 vpalignr $12,%ymm6,%ymm6,%ymm6
8431
8432 decq %r10
8433 jne 1b
# Feed-forward: add the starting rows back (constants, ymm7, ymm11 and the
# counter rows saved on the frame).
8434 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
8435 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
8436 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
8437 vpaddd %ymm7,%ymm4,%ymm4
8438 vpaddd %ymm7,%ymm5,%ymm5
8439 vpaddd %ymm7,%ymm6,%ymm6
8440 vpaddd %ymm11,%ymm8,%ymm8
8441 vpaddd %ymm11,%ymm9,%ymm9
8442 vpaddd %ymm11,%ymm10,%ymm10
8443 vpaddd 160(%rbp),%ymm12,%ymm12
8444 vpaddd 192(%rbp),%ymm13,%ymm13
8445 vpaddd 224(%rbp),%ymm14,%ymm14
# ymm3 = low 128-bit lanes of ymm0 and ymm4 (32 bytes).
8446 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
8447
# Mask those 32 bytes with .clamp and store them at 0(%rbp); the hash code in
# this file reads its multiplier from 0(%rbp) and 8(%rbp). The upper 16 bytes
# of .clamp are all ones, so that half passes through unchanged.
8448 vpand .clamp(%rip),%ymm3,%ymm3
8449 vmovdqa %ymm3,0(%rbp)
8450
# Regroup everything else by 128-bit lane (0x02 pairs low lanes, 0x13 pairs
# high lanes). The low lanes of ymm8/ymm12 are not kept. This leaves 10
# registers (320 bytes), produced in the order ymm0, ymm4, ymm8, ymm12, ymm1,
# ymm5, ymm9, ymm13, ymm2, ymm6, for the code at seal_avx2_short.
8451 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
8452 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
8453 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
8454 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
8455 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
8456 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
8457 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
8458 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
8459 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
8460 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
8461 jmp seal_avx2_short
8462
8462
8463seal_avx2_192:
# Entered with one ChaCha20 state set in ymm0/ymm4/ymm8/ymm12 (prepared by code
# outside this view). Set 1 is cloned into ymm1/ymm5/ymm9/ymm13 with the counter
# row bumped by .avx2_inc. Here ymm2/ymm6/ymm10 are NOT a third set: they keep
# the starting rows 0..2, and ymm11/ymm15 keep the two starting counter rows,
# all for the feed-forward add after the rounds.
8464 vmovdqa %ymm0,%ymm1
8465 vmovdqa %ymm0,%ymm2
8466 vmovdqa %ymm4,%ymm5
8467 vmovdqa %ymm4,%ymm6
8468 vmovdqa %ymm8,%ymm9
8469 vmovdqa %ymm8,%ymm10
8470 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
8471 vmovdqa %ymm12,%ymm11
8472 vmovdqa %ymm13,%ymm15
# 10 passes of the double round below (%r10 is the loop counter).
8473 movq $10,%r10
# Each pass: first quarter-round pass on sets 0 and 1 (each followed by a
# 4/8/12-byte lane rotation of rows 1..3 to line up the diagonals), then the
# second pass on sets 0 and 1 (each followed by the inverse rotation).
# ymm3 is scratch for the shift-based rotates; .rol16 and .rol8 are
# byte-shuffle masks used for the 16-bit and 8-bit rotates.
84741:
8475 vpaddd %ymm4,%ymm0,%ymm0
8476 vpxor %ymm0,%ymm12,%ymm12
8477 vpshufb .rol16(%rip),%ymm12,%ymm12
8478 vpaddd %ymm12,%ymm8,%ymm8
8479 vpxor %ymm8,%ymm4,%ymm4
8480 vpsrld $20,%ymm4,%ymm3
8481 vpslld $12,%ymm4,%ymm4
8482 vpxor %ymm3,%ymm4,%ymm4
8483 vpaddd %ymm4,%ymm0,%ymm0
8484 vpxor %ymm0,%ymm12,%ymm12
8485 vpshufb .rol8(%rip),%ymm12,%ymm12
8486 vpaddd %ymm12,%ymm8,%ymm8
8487 vpxor %ymm8,%ymm4,%ymm4
8488 vpslld $7,%ymm4,%ymm3
8489 vpsrld $25,%ymm4,%ymm4
8490 vpxor %ymm3,%ymm4,%ymm4
8491 vpalignr $12,%ymm12,%ymm12,%ymm12
8492 vpalignr $8,%ymm8,%ymm8,%ymm8
8493 vpalignr $4,%ymm4,%ymm4,%ymm4
# Set 1, first pass.
8494 vpaddd %ymm5,%ymm1,%ymm1
8495 vpxor %ymm1,%ymm13,%ymm13
8496 vpshufb .rol16(%rip),%ymm13,%ymm13
8497 vpaddd %ymm13,%ymm9,%ymm9
8498 vpxor %ymm9,%ymm5,%ymm5
8499 vpsrld $20,%ymm5,%ymm3
8500 vpslld $12,%ymm5,%ymm5
8501 vpxor %ymm3,%ymm5,%ymm5
8502 vpaddd %ymm5,%ymm1,%ymm1
8503 vpxor %ymm1,%ymm13,%ymm13
8504 vpshufb .rol8(%rip),%ymm13,%ymm13
8505 vpaddd %ymm13,%ymm9,%ymm9
8506 vpxor %ymm9,%ymm5,%ymm5
8507 vpslld $7,%ymm5,%ymm3
8508 vpsrld $25,%ymm5,%ymm5
8509 vpxor %ymm3,%ymm5,%ymm5
8510 vpalignr $12,%ymm13,%ymm13,%ymm13
8511 vpalignr $8,%ymm9,%ymm9,%ymm9
8512 vpalignr $4,%ymm5,%ymm5,%ymm5
# Set 0, second pass (diagonals).
8513 vpaddd %ymm4,%ymm0,%ymm0
8514 vpxor %ymm0,%ymm12,%ymm12
8515 vpshufb .rol16(%rip),%ymm12,%ymm12
8516 vpaddd %ymm12,%ymm8,%ymm8
8517 vpxor %ymm8,%ymm4,%ymm4
8518 vpsrld $20,%ymm4,%ymm3
8519 vpslld $12,%ymm4,%ymm4
8520 vpxor %ymm3,%ymm4,%ymm4
8521 vpaddd %ymm4,%ymm0,%ymm0
8522 vpxor %ymm0,%ymm12,%ymm12
8523 vpshufb .rol8(%rip),%ymm12,%ymm12
8524 vpaddd %ymm12,%ymm8,%ymm8
8525 vpxor %ymm8,%ymm4,%ymm4
8526 vpslld $7,%ymm4,%ymm3
8527 vpsrld $25,%ymm4,%ymm4
8528 vpxor %ymm3,%ymm4,%ymm4
8529 vpalignr $4,%ymm12,%ymm12,%ymm12
8530 vpalignr $8,%ymm8,%ymm8,%ymm8
8531 vpalignr $12,%ymm4,%ymm4,%ymm4
# Set 1, second pass (diagonals).
8532 vpaddd %ymm5,%ymm1,%ymm1
8533 vpxor %ymm1,%ymm13,%ymm13
8534 vpshufb .rol16(%rip),%ymm13,%ymm13
8535 vpaddd %ymm13,%ymm9,%ymm9
8536 vpxor %ymm9,%ymm5,%ymm5
8537 vpsrld $20,%ymm5,%ymm3
8538 vpslld $12,%ymm5,%ymm5
8539 vpxor %ymm3,%ymm5,%ymm5
8540 vpaddd %ymm5,%ymm1,%ymm1
8541 vpxor %ymm1,%ymm13,%ymm13
8542 vpshufb .rol8(%rip),%ymm13,%ymm13
8543 vpaddd %ymm13,%ymm9,%ymm9
8544 vpxor %ymm9,%ymm5,%ymm5
8545 vpslld $7,%ymm5,%ymm3
8546 vpsrld $25,%ymm5,%ymm5
8547 vpxor %ymm3,%ymm5,%ymm5
8548 vpalignr $4,%ymm13,%ymm13,%ymm13
8549 vpalignr $8,%ymm9,%ymm9,%ymm9
8550 vpalignr $12,%ymm5,%ymm5,%ymm5
8551
8552 decq %r10
8553 jne 1b
# Feed-forward: add the starting rows kept in ymm2, ymm6, ymm10, ymm11, ymm15.
8554 vpaddd %ymm2,%ymm0,%ymm0
8555 vpaddd %ymm2,%ymm1,%ymm1
8556 vpaddd %ymm6,%ymm4,%ymm4
8557 vpaddd %ymm6,%ymm5,%ymm5
8558 vpaddd %ymm10,%ymm8,%ymm8
8559 vpaddd %ymm10,%ymm9,%ymm9
8560 vpaddd %ymm11,%ymm12,%ymm12
8561 vpaddd %ymm15,%ymm13,%ymm13
# ymm3 = low 128-bit lanes of ymm0 and ymm4 (32 bytes).
8562 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
8563
# Mask those 32 bytes with .clamp and store them at 0(%rbp); the hash code in
# this file reads its multiplier from 0(%rbp) and 8(%rbp). The upper 16 bytes
# of .clamp are all ones, so that half passes through unchanged.
8564 vpand .clamp(%rip),%ymm3,%ymm3
8565 vmovdqa %ymm3,0(%rbp)
8566
# Regroup everything else by 128-bit lane (0x02 pairs low lanes, 0x13 pairs
# high lanes). The low lanes of ymm8/ymm12 are not kept. This leaves 6 registers
# (192 bytes), produced in the order ymm0, ymm4, ymm8, ymm12, ymm1, ymm5.
# Execution then falls through into seal_avx2_short.
8567 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
8568 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
8569 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
8570 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
8571 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
8572 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
/*
 * seal_avx2_short
 *
 * Entry to the short seal path: calls poly_hash_ad_internal, then clears
 * rcx so that the `cmpq $16,%rcx` test at seal_avx2_hash (which follows
 * immediately) is below 16 and control goes straight to
 * seal_avx2_short_loop.
 *
 * NOTE(review): r8 is presumably an argument consumed by
 * poly_hash_ad_internal, and r10/r11/r12 are presumably left holding the
 * hash accumulator by it -- confirm against that routine.
 */
seal_avx2_short:
	/* Register moved onto itself: no architectural effect (mov does not
	   touch flags).  Left in place to keep the instruction stream
	   unchanged. */
	movq	%r8,%r8
	call	poly_hash_ad_internal
	xorq	%rcx,%rcx
/*
 * seal_avx2_hash
 *
 * While rcx >= 16: absorb the 16 bytes at (%rdi) into the accumulator,
 * multiply by the value stored at 0(%rbp)/8(%rbp), reduce, then advance
 * rdi by 16 and decrease rcx by 16.  When rcx < 16, jump to
 * seal_avx2_short_loop.
 *
 * In/Out: r10:r11:r12 = accumulator (low, middle, high limb)
 *         rdi         = read pointer, advanced by 16 per iteration
 *         rcx         = bytes left to hash here
 * Clobb:  rax, rdx, r9, r13, r14, r15, flags.
 */
seal_avx2_hash:
	cmpq	$16,%rcx
	jb	seal_avx2_short_loop
	/* acc += 16 input bytes; the `adcq $1` also adds 1 at bit 128. */
	addq	0(%rdi),%r10
	adcq	8+0(%rdi),%r11
	adcq	$1,%r12
	/* r13:r14:r15:r9 = acc * (8(%rbp):0(%rbp)), schoolbook style.  The
	   high accumulator limb r12 is multiplied with imulq (low 64 bits
	   of the product only). */
	movq	0+0(%rbp),%rax
	movq	%rax,%r15
	mulq	%r10
	movq	%rax,%r13
	movq	%rdx,%r14
	movq	0+0(%rbp),%rax
	mulq	%r11
	imulq	%r12,%r15
	addq	%rax,%r14
	adcq	%rdx,%r15
	movq	8+0(%rbp),%rax
	movq	%rax,%r9
	mulq	%r10
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	8+0(%rbp),%rax
	mulq	%r11
	addq	%rax,%r15
	adcq	$0,%rdx
	imulq	%r12,%r9
	addq	%r10,%r15
	adcq	%rdx,%r9
	/* Reduce: keep bits 0..129 as the new accumulator and fold the rest
	   (c = product >> 130) back in as 4*c + c.  4*c is the product's
	   upper part with its low two bits masked off (r13:r14); c is the
	   same part shifted right by two (r15:r9). */
	movq	%r13,%r10
	movq	%r14,%r11
	movq	%r15,%r12
	andq	$3,%r12
	movq	%r15,%r13
	andq	$-4,%r13
	movq	%r9,%r14
	shrdq	$2,%r9,%r15
	shrq	$2,%r9
	addq	%r13,%r10
	adcq	%r14,%r11
	adcq	$0,%r12
	addq	%r15,%r10
	adcq	%r9,%r11
	adcq	$0,%r12

	subq	$16,%rcx
	addq	$16,%rdi
	jmp	seal_avx2_hash
/*
 * seal_avx2_short_loop
 *
 * While rbx >= 32: XOR 32 input bytes at (%rsi) with ymm0, store the
 * result at (%rdi), hash the two 16-byte halves just written, advance
 * both pointers by 32, and shift the next 32-byte piece into ymm0.
 * When rbx < 32, jump to seal_avx2_short_tail.
 *
 * In/Out: rbx         = bytes remaining (decremented by 32 per pass)
 *         rsi, rdi    = input / output pointers (advanced by 32)
 *         r10:r11:r12 = hash accumulator (low, middle, high limb)
 *         ymm0        = current 32-byte piece to XOR with; successors
 *                       queued in ymm4, ymm8, ymm12, ymm1, ymm5, ymm9,
 *                       ymm13, ymm2, ymm6 (in that order)
 * Reads:  0(%rbp), 8(%rbp) = hash multiplier.
 * Clobb:  rax, rdx, r9, r13, r14, r15, flags.
 */
seal_avx2_short_loop:
	cmpq	$32,%rbx
	jb	seal_avx2_short_tail
	subq	$32,%rbx

	/* out = in XOR ymm0 (unaligned store). */
	vpxor	(%rsi),%ymm0,%ymm0
	vmovdqu	%ymm0,(%rdi)
	leaq	32(%rsi),%rsi

	/* Hash output bytes 0..15: acc += block (plus 1 at bit 128) ... */
	addq	0(%rdi),%r10
	adcq	8+0(%rdi),%r11
	adcq	$1,%r12
	/* ... r13:r14:r15:r9 = acc * (8(%rbp):0(%rbp)) ... */
	movq	0+0(%rbp),%rax
	movq	%rax,%r15
	mulq	%r10
	movq	%rax,%r13
	movq	%rdx,%r14
	movq	0+0(%rbp),%rax
	mulq	%r11
	imulq	%r12,%r15
	addq	%rax,%r14
	adcq	%rdx,%r15
	movq	8+0(%rbp),%rax
	movq	%rax,%r9
	mulq	%r10
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	8+0(%rbp),%rax
	mulq	%r11
	addq	%rax,%r15
	adcq	$0,%rdx
	imulq	%r12,%r9
	addq	%r10,%r15
	adcq	%rdx,%r9
	/* ... then keep bits 0..129 and fold c = product >> 130 back in as
	   4*c (r13:r14, low two bits masked) plus c (r15:r9, shifted). */
	movq	%r13,%r10
	movq	%r14,%r11
	movq	%r15,%r12
	andq	$3,%r12
	movq	%r15,%r13
	andq	$-4,%r13
	movq	%r9,%r14
	shrdq	$2,%r9,%r15
	shrq	$2,%r9
	addq	%r13,%r10
	adcq	%r14,%r11
	adcq	$0,%r12
	addq	%r15,%r10
	adcq	%r9,%r11
	adcq	$0,%r12
	/* Hash output bytes 16..31: same add / multiply / reduce step. */
	addq	16(%rdi),%r10
	adcq	8+16(%rdi),%r11
	adcq	$1,%r12
	movq	0+0(%rbp),%rax
	movq	%rax,%r15
	mulq	%r10
	movq	%rax,%r13
	movq	%rdx,%r14
	movq	0+0(%rbp),%rax
	mulq	%r11
	imulq	%r12,%r15
	addq	%rax,%r14
	adcq	%rdx,%r15
	movq	8+0(%rbp),%rax
	movq	%rax,%r9
	mulq	%r10
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	8+0(%rbp),%rax
	mulq	%r11
	addq	%rax,%r15
	adcq	$0,%rdx
	imulq	%r12,%r9
	addq	%r10,%r15
	adcq	%rdx,%r9
	movq	%r13,%r10
	movq	%r14,%r11
	movq	%r15,%r12
	andq	$3,%r12
	movq	%r15,%r13
	andq	$-4,%r13
	movq	%r9,%r14
	shrdq	$2,%r9,%r15
	shrq	$2,%r9
	addq	%r13,%r10
	adcq	%r14,%r11
	adcq	$0,%r12
	addq	%r15,%r10
	adcq	%r9,%r11
	adcq	$0,%r12

	leaq	32(%rdi),%rdi

	/* Shift the register queue down by one so ymm0 holds the next
	   32-byte piece. */
	vmovdqa	%ymm4,%ymm0
	vmovdqa	%ymm8,%ymm4
	vmovdqa	%ymm12,%ymm8
	vmovdqa	%ymm1,%ymm12
	vmovdqa	%ymm5,%ymm1
	vmovdqa	%ymm9,%ymm5
	vmovdqa	%ymm13,%ymm9
	vmovdqa	%ymm2,%ymm13
	vmovdqa	%ymm6,%ymm2
	jmp	seal_avx2_short_loop
/*
 * seal_avx2_short_tail
 *
 * Handles fewer than 32 remaining bytes.  If rbx >= 16, one more
 * 16-byte block is XORed with the low lane of ymm0, stored, hashed, and
 * the high lane of ymm0 is moved down into xmm0.  In either case the
 * upper halves of the ymm registers are then cleared and control
 * transfers to seal_sse_tail_16.
 *
 * In/Out: rbx         = bytes remaining (decremented by 16 if a block
 *                       was processed)
 *         rsi, rdi    = input / output pointers (advanced by 16 if so)
 *         r10:r11:r12 = hash accumulator (low, middle, high limb)
 *         xmm0        = next 16 bytes to XOR with, on exit
 * Reads:  0(%rbp), 8(%rbp) = hash multiplier.
 * Clobb:  xmm3, rax, rdx, r9, r13, r14, r15, flags.
 */
seal_avx2_short_tail:
	cmpq	$16,%rbx
	jb	1f
	subq	$16,%rbx
	/* out = in XOR low 128 bits of ymm0. */
	vpxor	(%rsi),%xmm0,%xmm3
	vmovdqu	%xmm3,(%rdi)
	leaq	16(%rsi),%rsi
	/* Hash the 16 bytes just written: acc += block (plus 1 at
	   bit 128) ... */
	addq	0(%rdi),%r10
	adcq	8+0(%rdi),%r11
	adcq	$1,%r12
	/* ... r13:r14:r15:r9 = acc * (8(%rbp):0(%rbp)) ... */
	movq	0+0(%rbp),%rax
	movq	%rax,%r15
	mulq	%r10
	movq	%rax,%r13
	movq	%rdx,%r14
	movq	0+0(%rbp),%rax
	mulq	%r11
	imulq	%r12,%r15
	addq	%rax,%r14
	adcq	%rdx,%r15
	movq	8+0(%rbp),%rax
	movq	%rax,%r9
	mulq	%r10
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	8+0(%rbp),%rax
	mulq	%r11
	addq	%rax,%r15
	adcq	$0,%rdx
	imulq	%r12,%r9
	addq	%r10,%r15
	adcq	%rdx,%r9
	/* ... then keep bits 0..129 and fold c = product >> 130 back in as
	   4*c (r13:r14, low two bits masked) plus c (r15:r9, shifted). */
	movq	%r13,%r10
	movq	%r14,%r11
	movq	%r15,%r12
	andq	$3,%r12
	movq	%r15,%r13
	andq	$-4,%r13
	movq	%r9,%r14
	shrdq	$2,%r9,%r15
	shrq	$2,%r9
	addq	%r13,%r10
	adcq	%r14,%r11
	adcq	$0,%r12
	addq	%r15,%r10
	adcq	%r9,%r11
	adcq	$0,%r12

	leaq	16(%rdi),%rdi
	/* Bring the unused high lane of ymm0 down into xmm0. */
	vextracti128	$1,%ymm0,%xmm0
1:
	/* Clear the upper ymm halves before jumping to seal_sse_tail_16. */
	vzeroupper
	jmp	seal_sse_tail_16
8783.cfi_endproc
8784#endif