blob: d149d0f77f1357f4fc241b12ae7dcb4bfc3c7757 [file] [log] [blame]
Robert Sloan6f79a502017-04-03 09:16:40 -07001#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
David Benjaminf31229b2017-01-25 14:08:15 -05002.text
3.extern OPENSSL_ia32cap_P
4.hidden OPENSSL_ia32cap_P
Robert Sloan5d625782017-02-13 09:55:39 -08005
/*
 * chacha20_poly1305_constants
 *
 * Constant pool referenced RIP-relatively by the code in this file.
 * Everything from .chacha20_consts onward starts on a 64-byte boundary;
 * note that the chacha20_poly1305_constants label itself sits *before*
 * the .align directive.
 *
 * Layout (byte offsets are relative to .chacha20_consts):
 *     0  .chacha20_consts  32 bytes  ASCII "expand 32-byte k", twice
 *    32  .rol8             32 bytes  pshufb mask, stored twice
 *    64  .rol16            32 bytes  pshufb mask, stored twice
 *    96  .avx2_init        16 bytes  dwords (0,0,0,0)
 *   112  .sse_inc          16 bytes  dwords (1,0,0,0)
 *   128  .avx2_inc         32 bytes  dwords (2,0,0,0,2,0,0,0)
 *   160  .clamp            32 bytes  AND mask, see below
 *   192  .and_masks        15 x 16 bytes
 */
chacha20_poly1305_constants:

.align	64
/* The 16-byte ChaCha20 constant row, duplicated to fill 32 bytes. */
.chacha20_consts:
.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
/*
 * pshufb control: destination byte i takes source byte mask[i].  Within each
 * 32-bit lane the pattern 3,0,1,2 moves the top byte to the bottom, i.e. a
 * rotate-left by 8 bits of every dword.
 */
.rol8:
.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
/* Same idea with the pattern 2,3,0,1: rotate every dword left by 16 bits. */
.rol16:
.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
/*
 * .avx2_init is only 16 bytes and is immediately followed by .sse_inc, so a
 * 32-byte read starting at .avx2_init sees the dwords (0,0,0,0,1,0,0,0).
 * NOTE(review): presumably that overlap is intentional (per-half counter
 * offsets 0 and 1 for 256-bit code) -- confirm before reordering these labels.
 */
.avx2_init:
.long	0,0,0,0
/* paddd with this vector adds 1 to the lowest dword and leaves the rest. */
.sse_inc:
.long	1,0,0,0
/* Adds 2 to the lowest dword of each 128-bit half of a 256-bit vector. */
.avx2_inc:
.long	2,0,0,0,2,0,0,0
/*
 * First 16 bytes: the Poly1305 clamp applied to the multiplier r
 * (clears the top 4 bits of every dword and the low 2 bits of dwords 1..3).
 * Second 16 bytes: all ones, so ANDing a 32-byte value with .clamp leaves
 * its upper half unchanged.
 */
.clamp:
.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
.quad	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
.align	16
/*
 * Byte-prefix masks: 15 rows of 16 bytes.  Row k (k = 1..15, stored at
 * .and_masks + 16*(k-1)) has its first k bytes set to 0xff and the remaining
 * bytes zero, so ANDing with row k keeps only the low k bytes of a vector.
 */
.and_masks:
.byte	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
44
/*
 * poly_hash_ad_internal
 *
 * Absorbs a byte string into a freshly zeroed Poly1305 accumulator.
 * Internal helper with a private register convention (not a C ABI function).
 *
 * In:
 *   %rcx  = pointer to the bytes to hash
 *   %r8   = number of bytes at %rcx
 *   %rbp  = base of the caller's state area; 0(%rbp) and 8(%rbp) hold the
 *           low and high 64-bit words of the multiplier r (the caller is
 *           expected to have clamped it already)
 * Out:
 *   %r10:%r11:%r12 = accumulator (low word, middle word, top bits).
 *           It is reduced modulo 2^130 - 5 only loosely: the high part is
 *           folded back in after every multiply, but no final conditional
 *           subtraction of the modulus is done here.
 * Clobbers:
 *   %rax, %rdx, %r9, %r13, %r14, %r15, flags; the block loop additionally
 *   advances %rcx and consumes %r8.
 *
 * Each 16-byte block m is processed as  acc = ((acc + m + 2^128) * r) mod p.
 * A trailing partial block is zero-padded to 16 bytes and then handled the
 * same way (the 2^128 bit is still added on top of the full 16-byte value).
 *
 * The multiply/reduce sequence appears three times below (fast path, block
 * loop, tail).  With acc = a0 + a1*2^64 + a2*2^128 and r = r0 + r1*2^64 it
 * builds the product t = t0 + t1*2^64 + t2*2^128 + t3*2^192 in
 * %r13,%r14,%r15,%r9 and then computes (t mod 2^130) + 5*(t >> 130) by
 * adding 4*(t >> 130) and (t >> 130) separately.
 */
.type	poly_hash_ad_internal,@function
.align	64
poly_hash_ad_internal:
.cfi_startproc
	xorq	%r10,%r10		/* acc = 0 */
	xorq	%r11,%r11
	xorq	%r12,%r12
	cmpq	$13,%r8			/* exactly 13 bytes take the straight-line path */
	jne	hash_ad_loop
poly_fast_tls_ad:
	/*
	 * 13-byte input: build the zero-padded block without a byte loop.
	 * The second load covers bytes 5..12 and the shift drops bytes 5..7,
	 * so nothing beyond byte 12 is ever read.
	 */
	movq	(%rcx),%r10		/* a0 = bytes 0..7 */
	movq	5(%rcx),%r11		/* bytes 5..12 */
	shrq	$24,%r11		/* a1 = bytes 8..12, upper 24 bits zero */
	movq	$1,%r12			/* a2 = 1, i.e. the 2^128 bit */

	/* t = acc * r */
	movq	0+0(%rbp),%rax
	movq	%rax,%r15		/* keep a copy of r0 for the a2 term */
	mulq	%r10			/* rdx:rax = r0 * a0 */
	movq	%rax,%r13		/* t0 */
	movq	%rdx,%r14		/* t1 (partial) */
	movq	0+0(%rbp),%rax
	mulq	%r11			/* rdx:rax = r0 * a1 */
	imulq	%r12,%r15		/* r15 = r0 * a2 (low 64 bits only; assumes a2 is small) */
	addq	%rax,%r14
	adcq	%rdx,%r15		/* t2 (partial) */
	movq	8+0(%rbp),%rax
	movq	%rax,%r9		/* keep a copy of r1 for the a2 term */
	mulq	%r10			/* rdx:rax = r1 * a0 */
	addq	%rax,%r14		/* t1 complete */
	adcq	$0,%rdx
	movq	%rdx,%r10		/* carry word into t2; a0 is no longer needed */
	movq	8+0(%rbp),%rax
	mulq	%r11			/* rdx:rax = r1 * a1 */
	addq	%rax,%r15
	adcq	$0,%rdx
	imulq	%r12,%r9		/* r9 = r1 * a2 (low 64 bits only) */
	addq	%r10,%r15		/* t2 complete */
	adcq	%rdx,%r9		/* t3 complete */

	/* acc = (t mod 2^130) + 5 * (t >> 130) */
	movq	%r13,%r10
	movq	%r14,%r11
	movq	%r15,%r12
	andq	$3,%r12			/* acc = low 130 bits of t */
	movq	%r15,%r13
	andq	$-4,%r13
	movq	%r9,%r14		/* r14:r13 = 4 * (t >> 130) */
	shrdq	$2,%r9,%r15
	shrq	$2,%r9			/* r9:r15 = t >> 130 */
	addq	%r13,%r10
	adcq	%r14,%r11
	adcq	$0,%r12
	addq	%r15,%r10
	adcq	%r9,%r11
	adcq	$0,%r12

	.byte	0xf3,0xc3		/* rep ret, emitted as raw bytes */
hash_ad_loop:
	/* Consume whole 16-byte blocks while at least 16 bytes remain. */
	cmpq	$16,%r8
	jb	hash_ad_tail		/* unsigned compare: %r8 is a length */
	addq	0(%rcx),%r10		/* acc += block ... */
	adcq	8+0(%rcx),%r11
	adcq	$1,%r12			/* ... + 2^128 */

	/* t = acc * r (same sequence as in the fast path above) */
	movq	0+0(%rbp),%rax
	movq	%rax,%r15
	mulq	%r10
	movq	%rax,%r13
	movq	%rdx,%r14
	movq	0+0(%rbp),%rax
	mulq	%r11
	imulq	%r12,%r15
	addq	%rax,%r14
	adcq	%rdx,%r15
	movq	8+0(%rbp),%rax
	movq	%rax,%r9
	mulq	%r10
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	8+0(%rbp),%rax
	mulq	%r11
	addq	%rax,%r15
	adcq	$0,%rdx
	imulq	%r12,%r9
	addq	%r10,%r15
	adcq	%rdx,%r9

	/* acc = (t mod 2^130) + 5 * (t >> 130) */
	movq	%r13,%r10
	movq	%r14,%r11
	movq	%r15,%r12
	andq	$3,%r12
	movq	%r15,%r13
	andq	$-4,%r13
	movq	%r9,%r14
	shrdq	$2,%r9,%r15
	shrq	$2,%r9
	addq	%r13,%r10
	adcq	%r14,%r11
	adcq	$0,%r12
	addq	%r15,%r10
	adcq	%r9,%r11
	adcq	$0,%r12

	leaq	16(%rcx),%rcx		/* next block */
	subq	$16,%r8
	jmp	hash_ad_loop
hash_ad_tail:
	/* 0..15 bytes remain; nothing left to do when the count is zero. */
	cmpq	$0,%r8
	je	1f

	/*
	 * Assemble the remaining bytes into %r14:%r13 as a little-endian value.
	 * The loop walks backwards from the last byte, shifting the 128-bit
	 * pair left by 8 before merging each byte, so the first byte of the
	 * tail ends up in the lowest position and unused upper bytes stay zero.
	 */
	xorq	%r13,%r13
	xorq	%r14,%r14
	xorq	%r15,%r15
	addq	%r8,%rcx		/* one past the last byte */
hash_ad_tail_loop:
	shldq	$8,%r13,%r14
	shlq	$8,%r13
	movzbq	-1(%rcx),%r15
	xorq	%r15,%r13
	decq	%rcx
	decq	%r8
	jne	hash_ad_tail_loop	/* decq set ZF when the count reached zero */

	addq	%r13,%r10		/* acc += zero-padded tail ... */
	adcq	%r14,%r11
	adcq	$1,%r12			/* ... + 2^128 */

	/* t = acc * r (same sequence as in the fast path above) */
	movq	0+0(%rbp),%rax
	movq	%rax,%r15
	mulq	%r10
	movq	%rax,%r13
	movq	%rdx,%r14
	movq	0+0(%rbp),%rax
	mulq	%r11
	imulq	%r12,%r15
	addq	%rax,%r14
	adcq	%rdx,%r15
	movq	8+0(%rbp),%rax
	movq	%rax,%r9
	mulq	%r10
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	8+0(%rbp),%rax
	mulq	%r11
	addq	%rax,%r15
	adcq	$0,%rdx
	imulq	%r12,%r9
	addq	%r10,%r15
	adcq	%rdx,%r9

	/* acc = (t mod 2^130) + 5 * (t >> 130) */
	movq	%r13,%r10
	movq	%r14,%r11
	movq	%r15,%r12
	andq	$3,%r12
	movq	%r15,%r13
	andq	$-4,%r13
	movq	%r9,%r14
	shrdq	$2,%r9,%r15
	shrq	$2,%r9
	addq	%r13,%r10
	adcq	%r14,%r11
	adcq	$0,%r12
	addq	%r15,%r10
	adcq	%r9,%r11
	adcq	$0,%r12


1:
	.byte	0xf3,0xc3		/* rep ret, emitted as raw bytes */
.cfi_endproc
.size	poly_hash_ad_internal, .-poly_hash_ad_internal
213
214.globl chacha20_poly1305_open
215.hidden chacha20_poly1305_open
216.type chacha20_poly1305_open,@function
217.align 64
218chacha20_poly1305_open:
219.cfi_startproc
220 pushq %rbp
221.cfi_adjust_cfa_offset 8
222 pushq %rbx
223.cfi_adjust_cfa_offset 8
224 pushq %r12
225.cfi_adjust_cfa_offset 8
226 pushq %r13
227.cfi_adjust_cfa_offset 8
228 pushq %r14
229.cfi_adjust_cfa_offset 8
230 pushq %r15
231.cfi_adjust_cfa_offset 8
232
233
234 pushq %r9
235.cfi_adjust_cfa_offset 8
236 subq $288 + 32,%rsp
237.cfi_adjust_cfa_offset 288 + 32
238.cfi_offset rbp, -16
239.cfi_offset rbx, -24
240.cfi_offset r12, -32
241.cfi_offset r13, -40
242.cfi_offset r14, -48
243.cfi_offset r15, -56
David Benjaminf31229b2017-01-25 14:08:15 -0500244 leaq 32(%rsp),%rbp
245 andq $-32,%rbp
246 movq %rdx,8+32(%rbp)
247 movq %r8,0+32(%rbp)
248 movq %rdx,%rbx
249
250 movl OPENSSL_ia32cap_P+8(%rip),%eax
251 andl $288,%eax
252 xorl $288,%eax
253 jz chacha20_poly1305_open_avx2
254
2551:
256 cmpq $128,%rbx
257 jbe open_sse_128
258
259 movdqa .chacha20_consts(%rip),%xmm0
260 movdqu 0(%r9),%xmm4
261 movdqu 16(%r9),%xmm8
262 movdqu 32(%r9),%xmm12
263 movdqa %xmm12,%xmm7
264
265 movdqa %xmm4,48(%rbp)
266 movdqa %xmm8,64(%rbp)
267 movdqa %xmm12,96(%rbp)
268 movq $10,%r10
2691:
270 paddd %xmm4,%xmm0
271 pxor %xmm0,%xmm12
272 pshufb .rol16(%rip),%xmm12
273 paddd %xmm12,%xmm8
274 pxor %xmm8,%xmm4
275 movdqa %xmm4,%xmm3
276 pslld $12,%xmm3
277 psrld $20,%xmm4
278 pxor %xmm3,%xmm4
279 paddd %xmm4,%xmm0
280 pxor %xmm0,%xmm12
281 pshufb .rol8(%rip),%xmm12
282 paddd %xmm12,%xmm8
283 pxor %xmm8,%xmm4
284 movdqa %xmm4,%xmm3
285 pslld $7,%xmm3
286 psrld $25,%xmm4
287 pxor %xmm3,%xmm4
288.byte 102,15,58,15,228,4
289.byte 102,69,15,58,15,192,8
290.byte 102,69,15,58,15,228,12
291 paddd %xmm4,%xmm0
292 pxor %xmm0,%xmm12
293 pshufb .rol16(%rip),%xmm12
294 paddd %xmm12,%xmm8
295 pxor %xmm8,%xmm4
296 movdqa %xmm4,%xmm3
297 pslld $12,%xmm3
298 psrld $20,%xmm4
299 pxor %xmm3,%xmm4
300 paddd %xmm4,%xmm0
301 pxor %xmm0,%xmm12
302 pshufb .rol8(%rip),%xmm12
303 paddd %xmm12,%xmm8
304 pxor %xmm8,%xmm4
305 movdqa %xmm4,%xmm3
306 pslld $7,%xmm3
307 psrld $25,%xmm4
308 pxor %xmm3,%xmm4
309.byte 102,15,58,15,228,12
310.byte 102,69,15,58,15,192,8
311.byte 102,69,15,58,15,228,4
312
313 decq %r10
314 jne 1b
315
316 paddd .chacha20_consts(%rip),%xmm0
317 paddd 48(%rbp),%xmm4
318
319 pand .clamp(%rip),%xmm0
320 movdqa %xmm0,0(%rbp)
321 movdqa %xmm4,16(%rbp)
322
323 movq %r8,%r8
324 call poly_hash_ad_internal
325open_sse_main_loop:
326 cmpq $256,%rbx
327 jb 2f
328
329 movdqa .chacha20_consts(%rip),%xmm0
330 movdqa 48(%rbp),%xmm4
331 movdqa 64(%rbp),%xmm8
332 movdqa %xmm0,%xmm1
333 movdqa %xmm4,%xmm5
334 movdqa %xmm8,%xmm9
335 movdqa %xmm0,%xmm2
336 movdqa %xmm4,%xmm6
337 movdqa %xmm8,%xmm10
338 movdqa %xmm0,%xmm3
339 movdqa %xmm4,%xmm7
340 movdqa %xmm8,%xmm11
341 movdqa 96(%rbp),%xmm15
342 paddd .sse_inc(%rip),%xmm15
343 movdqa %xmm15,%xmm14
344 paddd .sse_inc(%rip),%xmm14
345 movdqa %xmm14,%xmm13
346 paddd .sse_inc(%rip),%xmm13
347 movdqa %xmm13,%xmm12
348 paddd .sse_inc(%rip),%xmm12
349 movdqa %xmm12,96(%rbp)
350 movdqa %xmm13,112(%rbp)
351 movdqa %xmm14,128(%rbp)
352 movdqa %xmm15,144(%rbp)
353
354
355
356 movq $4,%rcx
357 movq %rsi,%r8
3581:
359 movdqa %xmm8,80(%rbp)
360 movdqa .rol16(%rip),%xmm8
361 paddd %xmm7,%xmm3
362 paddd %xmm6,%xmm2
363 paddd %xmm5,%xmm1
364 paddd %xmm4,%xmm0
365 pxor %xmm3,%xmm15
366 pxor %xmm2,%xmm14
367 pxor %xmm1,%xmm13
368 pxor %xmm0,%xmm12
369.byte 102,69,15,56,0,248
370.byte 102,69,15,56,0,240
371.byte 102,69,15,56,0,232
372.byte 102,69,15,56,0,224
373 movdqa 80(%rbp),%xmm8
374 paddd %xmm15,%xmm11
375 paddd %xmm14,%xmm10
376 paddd %xmm13,%xmm9
377 paddd %xmm12,%xmm8
378 pxor %xmm11,%xmm7
379 addq 0(%r8),%r10
380 adcq 8+0(%r8),%r11
381 adcq $1,%r12
382
383 leaq 16(%r8),%r8
384 pxor %xmm10,%xmm6
385 pxor %xmm9,%xmm5
386 pxor %xmm8,%xmm4
387 movdqa %xmm8,80(%rbp)
388 movdqa %xmm7,%xmm8
389 psrld $20,%xmm8
390 pslld $32-20,%xmm7
391 pxor %xmm8,%xmm7
392 movdqa %xmm6,%xmm8
393 psrld $20,%xmm8
394 pslld $32-20,%xmm6
395 pxor %xmm8,%xmm6
396 movdqa %xmm5,%xmm8
397 psrld $20,%xmm8
398 pslld $32-20,%xmm5
399 pxor %xmm8,%xmm5
400 movdqa %xmm4,%xmm8
401 psrld $20,%xmm8
402 pslld $32-20,%xmm4
403 pxor %xmm8,%xmm4
404 movq 0+0(%rbp),%rax
405 movq %rax,%r15
406 mulq %r10
407 movq %rax,%r13
408 movq %rdx,%r14
409 movq 0+0(%rbp),%rax
410 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800411 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500412 addq %rax,%r14
413 adcq %rdx,%r15
414 movdqa .rol8(%rip),%xmm8
415 paddd %xmm7,%xmm3
416 paddd %xmm6,%xmm2
417 paddd %xmm5,%xmm1
418 paddd %xmm4,%xmm0
419 pxor %xmm3,%xmm15
420 pxor %xmm2,%xmm14
421 pxor %xmm1,%xmm13
422 pxor %xmm0,%xmm12
423.byte 102,69,15,56,0,248
424.byte 102,69,15,56,0,240
425.byte 102,69,15,56,0,232
426.byte 102,69,15,56,0,224
427 movdqa 80(%rbp),%xmm8
428 paddd %xmm15,%xmm11
429 paddd %xmm14,%xmm10
430 paddd %xmm13,%xmm9
431 paddd %xmm12,%xmm8
432 pxor %xmm11,%xmm7
433 pxor %xmm10,%xmm6
434 movq 8+0(%rbp),%rax
435 movq %rax,%r9
436 mulq %r10
437 addq %rax,%r14
438 adcq $0,%rdx
439 movq %rdx,%r10
440 movq 8+0(%rbp),%rax
441 mulq %r11
442 addq %rax,%r15
443 adcq $0,%rdx
444 pxor %xmm9,%xmm5
445 pxor %xmm8,%xmm4
446 movdqa %xmm8,80(%rbp)
447 movdqa %xmm7,%xmm8
448 psrld $25,%xmm8
449 pslld $32-25,%xmm7
450 pxor %xmm8,%xmm7
451 movdqa %xmm6,%xmm8
452 psrld $25,%xmm8
453 pslld $32-25,%xmm6
454 pxor %xmm8,%xmm6
455 movdqa %xmm5,%xmm8
456 psrld $25,%xmm8
457 pslld $32-25,%xmm5
458 pxor %xmm8,%xmm5
459 movdqa %xmm4,%xmm8
460 psrld $25,%xmm8
461 pslld $32-25,%xmm4
462 pxor %xmm8,%xmm4
463 movdqa 80(%rbp),%xmm8
Robert Sloan4d1ac502017-02-06 08:36:14 -0800464 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500465 addq %r10,%r15
466 adcq %rdx,%r9
467.byte 102,15,58,15,255,4
468.byte 102,69,15,58,15,219,8
469.byte 102,69,15,58,15,255,12
470.byte 102,15,58,15,246,4
471.byte 102,69,15,58,15,210,8
472.byte 102,69,15,58,15,246,12
473.byte 102,15,58,15,237,4
474.byte 102,69,15,58,15,201,8
475.byte 102,69,15,58,15,237,12
476.byte 102,15,58,15,228,4
477.byte 102,69,15,58,15,192,8
478.byte 102,69,15,58,15,228,12
479 movdqa %xmm8,80(%rbp)
480 movdqa .rol16(%rip),%xmm8
481 paddd %xmm7,%xmm3
482 paddd %xmm6,%xmm2
483 paddd %xmm5,%xmm1
484 paddd %xmm4,%xmm0
485 pxor %xmm3,%xmm15
486 pxor %xmm2,%xmm14
487 movq %r13,%r10
488 movq %r14,%r11
489 movq %r15,%r12
490 andq $3,%r12
491 movq %r15,%r13
492 andq $-4,%r13
493 movq %r9,%r14
494 shrdq $2,%r9,%r15
495 shrq $2,%r9
496 addq %r13,%r10
497 adcq %r14,%r11
498 adcq $0,%r12
499 addq %r15,%r10
500 adcq %r9,%r11
501 adcq $0,%r12
502 pxor %xmm1,%xmm13
503 pxor %xmm0,%xmm12
504.byte 102,69,15,56,0,248
505.byte 102,69,15,56,0,240
506.byte 102,69,15,56,0,232
507.byte 102,69,15,56,0,224
508 movdqa 80(%rbp),%xmm8
509 paddd %xmm15,%xmm11
510 paddd %xmm14,%xmm10
511 paddd %xmm13,%xmm9
512 paddd %xmm12,%xmm8
513 pxor %xmm11,%xmm7
514 pxor %xmm10,%xmm6
515 pxor %xmm9,%xmm5
516 pxor %xmm8,%xmm4
517 movdqa %xmm8,80(%rbp)
518 movdqa %xmm7,%xmm8
519 psrld $20,%xmm8
520 pslld $32-20,%xmm7
521 pxor %xmm8,%xmm7
522 movdqa %xmm6,%xmm8
523 psrld $20,%xmm8
524 pslld $32-20,%xmm6
525 pxor %xmm8,%xmm6
526 movdqa %xmm5,%xmm8
527 psrld $20,%xmm8
528 pslld $32-20,%xmm5
529 pxor %xmm8,%xmm5
530 movdqa %xmm4,%xmm8
531 psrld $20,%xmm8
532 pslld $32-20,%xmm4
533 pxor %xmm8,%xmm4
534 movdqa .rol8(%rip),%xmm8
535 paddd %xmm7,%xmm3
536 paddd %xmm6,%xmm2
537 paddd %xmm5,%xmm1
538 paddd %xmm4,%xmm0
539 pxor %xmm3,%xmm15
540 pxor %xmm2,%xmm14
541 pxor %xmm1,%xmm13
542 pxor %xmm0,%xmm12
543.byte 102,69,15,56,0,248
544.byte 102,69,15,56,0,240
545.byte 102,69,15,56,0,232
546.byte 102,69,15,56,0,224
547 movdqa 80(%rbp),%xmm8
548 paddd %xmm15,%xmm11
549 paddd %xmm14,%xmm10
550 paddd %xmm13,%xmm9
551 paddd %xmm12,%xmm8
552 pxor %xmm11,%xmm7
553 pxor %xmm10,%xmm6
554 pxor %xmm9,%xmm5
555 pxor %xmm8,%xmm4
556 movdqa %xmm8,80(%rbp)
557 movdqa %xmm7,%xmm8
558 psrld $25,%xmm8
559 pslld $32-25,%xmm7
560 pxor %xmm8,%xmm7
561 movdqa %xmm6,%xmm8
562 psrld $25,%xmm8
563 pslld $32-25,%xmm6
564 pxor %xmm8,%xmm6
565 movdqa %xmm5,%xmm8
566 psrld $25,%xmm8
567 pslld $32-25,%xmm5
568 pxor %xmm8,%xmm5
569 movdqa %xmm4,%xmm8
570 psrld $25,%xmm8
571 pslld $32-25,%xmm4
572 pxor %xmm8,%xmm4
573 movdqa 80(%rbp),%xmm8
574.byte 102,15,58,15,255,12
575.byte 102,69,15,58,15,219,8
576.byte 102,69,15,58,15,255,4
577.byte 102,15,58,15,246,12
578.byte 102,69,15,58,15,210,8
579.byte 102,69,15,58,15,246,4
580.byte 102,15,58,15,237,12
581.byte 102,69,15,58,15,201,8
582.byte 102,69,15,58,15,237,4
583.byte 102,15,58,15,228,12
584.byte 102,69,15,58,15,192,8
585.byte 102,69,15,58,15,228,4
586
587 decq %rcx
588 jge 1b
589 addq 0(%r8),%r10
590 adcq 8+0(%r8),%r11
591 adcq $1,%r12
592 movq 0+0(%rbp),%rax
593 movq %rax,%r15
594 mulq %r10
595 movq %rax,%r13
596 movq %rdx,%r14
597 movq 0+0(%rbp),%rax
598 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800599 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500600 addq %rax,%r14
601 adcq %rdx,%r15
602 movq 8+0(%rbp),%rax
603 movq %rax,%r9
604 mulq %r10
605 addq %rax,%r14
606 adcq $0,%rdx
607 movq %rdx,%r10
608 movq 8+0(%rbp),%rax
609 mulq %r11
610 addq %rax,%r15
611 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800612 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500613 addq %r10,%r15
614 adcq %rdx,%r9
615 movq %r13,%r10
616 movq %r14,%r11
617 movq %r15,%r12
618 andq $3,%r12
619 movq %r15,%r13
620 andq $-4,%r13
621 movq %r9,%r14
622 shrdq $2,%r9,%r15
623 shrq $2,%r9
624 addq %r13,%r10
625 adcq %r14,%r11
626 adcq $0,%r12
627 addq %r15,%r10
628 adcq %r9,%r11
629 adcq $0,%r12
630
631 leaq 16(%r8),%r8
632 cmpq $-6,%rcx
633 jg 1b
634 paddd .chacha20_consts(%rip),%xmm3
635 paddd 48(%rbp),%xmm7
636 paddd 64(%rbp),%xmm11
637 paddd 144(%rbp),%xmm15
638 paddd .chacha20_consts(%rip),%xmm2
639 paddd 48(%rbp),%xmm6
640 paddd 64(%rbp),%xmm10
641 paddd 128(%rbp),%xmm14
642 paddd .chacha20_consts(%rip),%xmm1
643 paddd 48(%rbp),%xmm5
644 paddd 64(%rbp),%xmm9
645 paddd 112(%rbp),%xmm13
646 paddd .chacha20_consts(%rip),%xmm0
647 paddd 48(%rbp),%xmm4
648 paddd 64(%rbp),%xmm8
649 paddd 96(%rbp),%xmm12
650 movdqa %xmm12,80(%rbp)
651 movdqu 0 + 0(%rsi),%xmm12
652 pxor %xmm3,%xmm12
653 movdqu %xmm12,0 + 0(%rdi)
654 movdqu 16 + 0(%rsi),%xmm12
655 pxor %xmm7,%xmm12
656 movdqu %xmm12,16 + 0(%rdi)
657 movdqu 32 + 0(%rsi),%xmm12
658 pxor %xmm11,%xmm12
659 movdqu %xmm12,32 + 0(%rdi)
660 movdqu 48 + 0(%rsi),%xmm12
661 pxor %xmm15,%xmm12
662 movdqu %xmm12,48 + 0(%rdi)
663 movdqu 0 + 64(%rsi),%xmm3
664 movdqu 16 + 64(%rsi),%xmm7
665 movdqu 32 + 64(%rsi),%xmm11
666 movdqu 48 + 64(%rsi),%xmm15
667 pxor %xmm3,%xmm2
668 pxor %xmm7,%xmm6
669 pxor %xmm11,%xmm10
670 pxor %xmm14,%xmm15
671 movdqu %xmm2,0 + 64(%rdi)
672 movdqu %xmm6,16 + 64(%rdi)
673 movdqu %xmm10,32 + 64(%rdi)
674 movdqu %xmm15,48 + 64(%rdi)
675 movdqu 0 + 128(%rsi),%xmm3
676 movdqu 16 + 128(%rsi),%xmm7
677 movdqu 32 + 128(%rsi),%xmm11
678 movdqu 48 + 128(%rsi),%xmm15
679 pxor %xmm3,%xmm1
680 pxor %xmm7,%xmm5
681 pxor %xmm11,%xmm9
682 pxor %xmm13,%xmm15
683 movdqu %xmm1,0 + 128(%rdi)
684 movdqu %xmm5,16 + 128(%rdi)
685 movdqu %xmm9,32 + 128(%rdi)
686 movdqu %xmm15,48 + 128(%rdi)
687 movdqu 0 + 192(%rsi),%xmm3
688 movdqu 16 + 192(%rsi),%xmm7
689 movdqu 32 + 192(%rsi),%xmm11
690 movdqu 48 + 192(%rsi),%xmm15
691 pxor %xmm3,%xmm0
692 pxor %xmm7,%xmm4
693 pxor %xmm11,%xmm8
694 pxor 80(%rbp),%xmm15
695 movdqu %xmm0,0 + 192(%rdi)
696 movdqu %xmm4,16 + 192(%rdi)
697 movdqu %xmm8,32 + 192(%rdi)
698 movdqu %xmm15,48 + 192(%rdi)
699
700 leaq 256(%rsi),%rsi
701 leaq 256(%rdi),%rdi
702 subq $256,%rbx
703 jmp open_sse_main_loop
7042:
705
706 testq %rbx,%rbx
707 jz open_sse_finalize
708 cmpq $64,%rbx
709 ja 3f
710 movdqa .chacha20_consts(%rip),%xmm0
711 movdqa 48(%rbp),%xmm4
712 movdqa 64(%rbp),%xmm8
713 movdqa 96(%rbp),%xmm12
714 paddd .sse_inc(%rip),%xmm12
715 movdqa %xmm12,96(%rbp)
716
717 xorq %r8,%r8
718 movq %rbx,%rcx
719 cmpq $16,%rcx
720 jb 2f
7211:
722 addq 0(%rsi,%r8), %r10
723 adcq 8+0(%rsi,%r8), %r11
724 adcq $1,%r12
725 movq 0+0(%rbp),%rax
726 movq %rax,%r15
727 mulq %r10
728 movq %rax,%r13
729 movq %rdx,%r14
730 movq 0+0(%rbp),%rax
731 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800732 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500733 addq %rax,%r14
734 adcq %rdx,%r15
735 movq 8+0(%rbp),%rax
736 movq %rax,%r9
737 mulq %r10
738 addq %rax,%r14
739 adcq $0,%rdx
740 movq %rdx,%r10
741 movq 8+0(%rbp),%rax
742 mulq %r11
743 addq %rax,%r15
744 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800745 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500746 addq %r10,%r15
747 adcq %rdx,%r9
748 movq %r13,%r10
749 movq %r14,%r11
750 movq %r15,%r12
751 andq $3,%r12
752 movq %r15,%r13
753 andq $-4,%r13
754 movq %r9,%r14
755 shrdq $2,%r9,%r15
756 shrq $2,%r9
757 addq %r13,%r10
758 adcq %r14,%r11
759 adcq $0,%r12
760 addq %r15,%r10
761 adcq %r9,%r11
762 adcq $0,%r12
763
764 subq $16,%rcx
7652:
766 addq $16,%r8
767 paddd %xmm4,%xmm0
768 pxor %xmm0,%xmm12
769 pshufb .rol16(%rip),%xmm12
770 paddd %xmm12,%xmm8
771 pxor %xmm8,%xmm4
772 movdqa %xmm4,%xmm3
773 pslld $12,%xmm3
774 psrld $20,%xmm4
775 pxor %xmm3,%xmm4
776 paddd %xmm4,%xmm0
777 pxor %xmm0,%xmm12
778 pshufb .rol8(%rip),%xmm12
779 paddd %xmm12,%xmm8
780 pxor %xmm8,%xmm4
781 movdqa %xmm4,%xmm3
782 pslld $7,%xmm3
783 psrld $25,%xmm4
784 pxor %xmm3,%xmm4
785.byte 102,15,58,15,228,4
786.byte 102,69,15,58,15,192,8
787.byte 102,69,15,58,15,228,12
788 paddd %xmm4,%xmm0
789 pxor %xmm0,%xmm12
790 pshufb .rol16(%rip),%xmm12
791 paddd %xmm12,%xmm8
792 pxor %xmm8,%xmm4
793 movdqa %xmm4,%xmm3
794 pslld $12,%xmm3
795 psrld $20,%xmm4
796 pxor %xmm3,%xmm4
797 paddd %xmm4,%xmm0
798 pxor %xmm0,%xmm12
799 pshufb .rol8(%rip),%xmm12
800 paddd %xmm12,%xmm8
801 pxor %xmm8,%xmm4
802 movdqa %xmm4,%xmm3
803 pslld $7,%xmm3
804 psrld $25,%xmm4
805 pxor %xmm3,%xmm4
806.byte 102,15,58,15,228,12
807.byte 102,69,15,58,15,192,8
808.byte 102,69,15,58,15,228,4
809
810 cmpq $16,%rcx
811 jae 1b
812 cmpq $160,%r8
813 jne 2b
814 paddd .chacha20_consts(%rip),%xmm0
815 paddd 48(%rbp),%xmm4
816 paddd 64(%rbp),%xmm8
817 paddd 96(%rbp),%xmm12
818
819 jmp open_sse_tail_64_dec_loop
8203:
821 cmpq $128,%rbx
822 ja 3f
823 movdqa .chacha20_consts(%rip),%xmm0
824 movdqa 48(%rbp),%xmm4
825 movdqa 64(%rbp),%xmm8
826 movdqa %xmm0,%xmm1
827 movdqa %xmm4,%xmm5
828 movdqa %xmm8,%xmm9
829 movdqa 96(%rbp),%xmm13
830 paddd .sse_inc(%rip),%xmm13
831 movdqa %xmm13,%xmm12
832 paddd .sse_inc(%rip),%xmm12
833 movdqa %xmm12,96(%rbp)
834 movdqa %xmm13,112(%rbp)
835
836 movq %rbx,%rcx
837 andq $-16,%rcx
838 xorq %r8,%r8
8391:
840 addq 0(%rsi,%r8), %r10
841 adcq 8+0(%rsi,%r8), %r11
842 adcq $1,%r12
843 movq 0+0(%rbp),%rax
844 movq %rax,%r15
845 mulq %r10
846 movq %rax,%r13
847 movq %rdx,%r14
848 movq 0+0(%rbp),%rax
849 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800850 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500851 addq %rax,%r14
852 adcq %rdx,%r15
853 movq 8+0(%rbp),%rax
854 movq %rax,%r9
855 mulq %r10
856 addq %rax,%r14
857 adcq $0,%rdx
858 movq %rdx,%r10
859 movq 8+0(%rbp),%rax
860 mulq %r11
861 addq %rax,%r15
862 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800863 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500864 addq %r10,%r15
865 adcq %rdx,%r9
866 movq %r13,%r10
867 movq %r14,%r11
868 movq %r15,%r12
869 andq $3,%r12
870 movq %r15,%r13
871 andq $-4,%r13
872 movq %r9,%r14
873 shrdq $2,%r9,%r15
874 shrq $2,%r9
875 addq %r13,%r10
876 adcq %r14,%r11
877 adcq $0,%r12
878 addq %r15,%r10
879 adcq %r9,%r11
880 adcq $0,%r12
881
8822:
883 addq $16,%r8
884 paddd %xmm4,%xmm0
885 pxor %xmm0,%xmm12
886 pshufb .rol16(%rip),%xmm12
887 paddd %xmm12,%xmm8
888 pxor %xmm8,%xmm4
889 movdqa %xmm4,%xmm3
890 pslld $12,%xmm3
891 psrld $20,%xmm4
892 pxor %xmm3,%xmm4
893 paddd %xmm4,%xmm0
894 pxor %xmm0,%xmm12
895 pshufb .rol8(%rip),%xmm12
896 paddd %xmm12,%xmm8
897 pxor %xmm8,%xmm4
898 movdqa %xmm4,%xmm3
899 pslld $7,%xmm3
900 psrld $25,%xmm4
901 pxor %xmm3,%xmm4
902.byte 102,15,58,15,228,4
903.byte 102,69,15,58,15,192,8
904.byte 102,69,15,58,15,228,12
905 paddd %xmm5,%xmm1
906 pxor %xmm1,%xmm13
907 pshufb .rol16(%rip),%xmm13
908 paddd %xmm13,%xmm9
909 pxor %xmm9,%xmm5
910 movdqa %xmm5,%xmm3
911 pslld $12,%xmm3
912 psrld $20,%xmm5
913 pxor %xmm3,%xmm5
914 paddd %xmm5,%xmm1
915 pxor %xmm1,%xmm13
916 pshufb .rol8(%rip),%xmm13
917 paddd %xmm13,%xmm9
918 pxor %xmm9,%xmm5
919 movdqa %xmm5,%xmm3
920 pslld $7,%xmm3
921 psrld $25,%xmm5
922 pxor %xmm3,%xmm5
923.byte 102,15,58,15,237,4
924.byte 102,69,15,58,15,201,8
925.byte 102,69,15,58,15,237,12
926 paddd %xmm4,%xmm0
927 pxor %xmm0,%xmm12
928 pshufb .rol16(%rip),%xmm12
929 paddd %xmm12,%xmm8
930 pxor %xmm8,%xmm4
931 movdqa %xmm4,%xmm3
932 pslld $12,%xmm3
933 psrld $20,%xmm4
934 pxor %xmm3,%xmm4
935 paddd %xmm4,%xmm0
936 pxor %xmm0,%xmm12
937 pshufb .rol8(%rip),%xmm12
938 paddd %xmm12,%xmm8
939 pxor %xmm8,%xmm4
940 movdqa %xmm4,%xmm3
941 pslld $7,%xmm3
942 psrld $25,%xmm4
943 pxor %xmm3,%xmm4
944.byte 102,15,58,15,228,12
945.byte 102,69,15,58,15,192,8
946.byte 102,69,15,58,15,228,4
947 paddd %xmm5,%xmm1
948 pxor %xmm1,%xmm13
949 pshufb .rol16(%rip),%xmm13
950 paddd %xmm13,%xmm9
951 pxor %xmm9,%xmm5
952 movdqa %xmm5,%xmm3
953 pslld $12,%xmm3
954 psrld $20,%xmm5
955 pxor %xmm3,%xmm5
956 paddd %xmm5,%xmm1
957 pxor %xmm1,%xmm13
958 pshufb .rol8(%rip),%xmm13
959 paddd %xmm13,%xmm9
960 pxor %xmm9,%xmm5
961 movdqa %xmm5,%xmm3
962 pslld $7,%xmm3
963 psrld $25,%xmm5
964 pxor %xmm3,%xmm5
965.byte 102,15,58,15,237,12
966.byte 102,69,15,58,15,201,8
967.byte 102,69,15,58,15,237,4
968
969 cmpq %rcx,%r8
970 jb 1b
971 cmpq $160,%r8
972 jne 2b
973 paddd .chacha20_consts(%rip),%xmm1
974 paddd 48(%rbp),%xmm5
975 paddd 64(%rbp),%xmm9
976 paddd 112(%rbp),%xmm13
977 paddd .chacha20_consts(%rip),%xmm0
978 paddd 48(%rbp),%xmm4
979 paddd 64(%rbp),%xmm8
980 paddd 96(%rbp),%xmm12
981 movdqu 0 + 0(%rsi),%xmm3
982 movdqu 16 + 0(%rsi),%xmm7
983 movdqu 32 + 0(%rsi),%xmm11
984 movdqu 48 + 0(%rsi),%xmm15
985 pxor %xmm3,%xmm1
986 pxor %xmm7,%xmm5
987 pxor %xmm11,%xmm9
988 pxor %xmm13,%xmm15
989 movdqu %xmm1,0 + 0(%rdi)
990 movdqu %xmm5,16 + 0(%rdi)
991 movdqu %xmm9,32 + 0(%rdi)
992 movdqu %xmm15,48 + 0(%rdi)
993
994 subq $64,%rbx
995 leaq 64(%rsi),%rsi
996 leaq 64(%rdi),%rdi
997 jmp open_sse_tail_64_dec_loop
9983:
999 cmpq $192,%rbx
1000 ja 3f
1001 movdqa .chacha20_consts(%rip),%xmm0
1002 movdqa 48(%rbp),%xmm4
1003 movdqa 64(%rbp),%xmm8
1004 movdqa %xmm0,%xmm1
1005 movdqa %xmm4,%xmm5
1006 movdqa %xmm8,%xmm9
1007 movdqa %xmm0,%xmm2
1008 movdqa %xmm4,%xmm6
1009 movdqa %xmm8,%xmm10
1010 movdqa 96(%rbp),%xmm14
1011 paddd .sse_inc(%rip),%xmm14
1012 movdqa %xmm14,%xmm13
1013 paddd .sse_inc(%rip),%xmm13
1014 movdqa %xmm13,%xmm12
1015 paddd .sse_inc(%rip),%xmm12
1016 movdqa %xmm12,96(%rbp)
1017 movdqa %xmm13,112(%rbp)
1018 movdqa %xmm14,128(%rbp)
1019
1020 movq %rbx,%rcx
1021 movq $160,%r8
1022 cmpq $160,%rcx
1023 cmovgq %r8,%rcx
1024 andq $-16,%rcx
1025 xorq %r8,%r8
10261:
1027 addq 0(%rsi,%r8), %r10
1028 adcq 8+0(%rsi,%r8), %r11
1029 adcq $1,%r12
1030 movq 0+0(%rbp),%rax
1031 movq %rax,%r15
1032 mulq %r10
1033 movq %rax,%r13
1034 movq %rdx,%r14
1035 movq 0+0(%rbp),%rax
1036 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001037 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001038 addq %rax,%r14
1039 adcq %rdx,%r15
1040 movq 8+0(%rbp),%rax
1041 movq %rax,%r9
1042 mulq %r10
1043 addq %rax,%r14
1044 adcq $0,%rdx
1045 movq %rdx,%r10
1046 movq 8+0(%rbp),%rax
1047 mulq %r11
1048 addq %rax,%r15
1049 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001050 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001051 addq %r10,%r15
1052 adcq %rdx,%r9
1053 movq %r13,%r10
1054 movq %r14,%r11
1055 movq %r15,%r12
1056 andq $3,%r12
1057 movq %r15,%r13
1058 andq $-4,%r13
1059 movq %r9,%r14
1060 shrdq $2,%r9,%r15
1061 shrq $2,%r9
1062 addq %r13,%r10
1063 adcq %r14,%r11
1064 adcq $0,%r12
1065 addq %r15,%r10
1066 adcq %r9,%r11
1067 adcq $0,%r12
1068
10692:
1070 addq $16,%r8
1071 paddd %xmm4,%xmm0
1072 pxor %xmm0,%xmm12
1073 pshufb .rol16(%rip),%xmm12
1074 paddd %xmm12,%xmm8
1075 pxor %xmm8,%xmm4
1076 movdqa %xmm4,%xmm3
1077 pslld $12,%xmm3
1078 psrld $20,%xmm4
1079 pxor %xmm3,%xmm4
1080 paddd %xmm4,%xmm0
1081 pxor %xmm0,%xmm12
1082 pshufb .rol8(%rip),%xmm12
1083 paddd %xmm12,%xmm8
1084 pxor %xmm8,%xmm4
1085 movdqa %xmm4,%xmm3
1086 pslld $7,%xmm3
1087 psrld $25,%xmm4
1088 pxor %xmm3,%xmm4
1089.byte 102,15,58,15,228,4
1090.byte 102,69,15,58,15,192,8
1091.byte 102,69,15,58,15,228,12
1092 paddd %xmm5,%xmm1
1093 pxor %xmm1,%xmm13
1094 pshufb .rol16(%rip),%xmm13
1095 paddd %xmm13,%xmm9
1096 pxor %xmm9,%xmm5
1097 movdqa %xmm5,%xmm3
1098 pslld $12,%xmm3
1099 psrld $20,%xmm5
1100 pxor %xmm3,%xmm5
1101 paddd %xmm5,%xmm1
1102 pxor %xmm1,%xmm13
1103 pshufb .rol8(%rip),%xmm13
1104 paddd %xmm13,%xmm9
1105 pxor %xmm9,%xmm5
1106 movdqa %xmm5,%xmm3
1107 pslld $7,%xmm3
1108 psrld $25,%xmm5
1109 pxor %xmm3,%xmm5
1110.byte 102,15,58,15,237,4
1111.byte 102,69,15,58,15,201,8
1112.byte 102,69,15,58,15,237,12
1113 paddd %xmm6,%xmm2
1114 pxor %xmm2,%xmm14
1115 pshufb .rol16(%rip),%xmm14
1116 paddd %xmm14,%xmm10
1117 pxor %xmm10,%xmm6
1118 movdqa %xmm6,%xmm3
1119 pslld $12,%xmm3
1120 psrld $20,%xmm6
1121 pxor %xmm3,%xmm6
1122 paddd %xmm6,%xmm2
1123 pxor %xmm2,%xmm14
1124 pshufb .rol8(%rip),%xmm14
1125 paddd %xmm14,%xmm10
1126 pxor %xmm10,%xmm6
1127 movdqa %xmm6,%xmm3
1128 pslld $7,%xmm3
1129 psrld $25,%xmm6
1130 pxor %xmm3,%xmm6
1131.byte 102,15,58,15,246,4
1132.byte 102,69,15,58,15,210,8
1133.byte 102,69,15,58,15,246,12
1134 paddd %xmm4,%xmm0
1135 pxor %xmm0,%xmm12
1136 pshufb .rol16(%rip),%xmm12
1137 paddd %xmm12,%xmm8
1138 pxor %xmm8,%xmm4
1139 movdqa %xmm4,%xmm3
1140 pslld $12,%xmm3
1141 psrld $20,%xmm4
1142 pxor %xmm3,%xmm4
1143 paddd %xmm4,%xmm0
1144 pxor %xmm0,%xmm12
1145 pshufb .rol8(%rip),%xmm12
1146 paddd %xmm12,%xmm8
1147 pxor %xmm8,%xmm4
1148 movdqa %xmm4,%xmm3
1149 pslld $7,%xmm3
1150 psrld $25,%xmm4
1151 pxor %xmm3,%xmm4
1152.byte 102,15,58,15,228,12
1153.byte 102,69,15,58,15,192,8
1154.byte 102,69,15,58,15,228,4
1155 paddd %xmm5,%xmm1
1156 pxor %xmm1,%xmm13
1157 pshufb .rol16(%rip),%xmm13
1158 paddd %xmm13,%xmm9
1159 pxor %xmm9,%xmm5
1160 movdqa %xmm5,%xmm3
1161 pslld $12,%xmm3
1162 psrld $20,%xmm5
1163 pxor %xmm3,%xmm5
1164 paddd %xmm5,%xmm1
1165 pxor %xmm1,%xmm13
1166 pshufb .rol8(%rip),%xmm13
1167 paddd %xmm13,%xmm9
1168 pxor %xmm9,%xmm5
1169 movdqa %xmm5,%xmm3
1170 pslld $7,%xmm3
1171 psrld $25,%xmm5
1172 pxor %xmm3,%xmm5
1173.byte 102,15,58,15,237,12
1174.byte 102,69,15,58,15,201,8
1175.byte 102,69,15,58,15,237,4
1176 paddd %xmm6,%xmm2
1177 pxor %xmm2,%xmm14
1178 pshufb .rol16(%rip),%xmm14
1179 paddd %xmm14,%xmm10
1180 pxor %xmm10,%xmm6
1181 movdqa %xmm6,%xmm3
1182 pslld $12,%xmm3
1183 psrld $20,%xmm6
1184 pxor %xmm3,%xmm6
1185 paddd %xmm6,%xmm2
1186 pxor %xmm2,%xmm14
1187 pshufb .rol8(%rip),%xmm14
1188 paddd %xmm14,%xmm10
1189 pxor %xmm10,%xmm6
1190 movdqa %xmm6,%xmm3
1191 pslld $7,%xmm3
1192 psrld $25,%xmm6
1193 pxor %xmm3,%xmm6
1194.byte 102,15,58,15,246,12
1195.byte 102,69,15,58,15,210,8
1196.byte 102,69,15,58,15,246,4
1197
1198 cmpq %rcx,%r8
1199 jb 1b
1200 cmpq $160,%r8
1201 jne 2b
1202 cmpq $176,%rbx
1203 jb 1f
1204 addq 160(%rsi),%r10
1205 adcq 8+160(%rsi),%r11
1206 adcq $1,%r12
1207 movq 0+0(%rbp),%rax
1208 movq %rax,%r15
1209 mulq %r10
1210 movq %rax,%r13
1211 movq %rdx,%r14
1212 movq 0+0(%rbp),%rax
1213 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001214 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001215 addq %rax,%r14
1216 adcq %rdx,%r15
1217 movq 8+0(%rbp),%rax
1218 movq %rax,%r9
1219 mulq %r10
1220 addq %rax,%r14
1221 adcq $0,%rdx
1222 movq %rdx,%r10
1223 movq 8+0(%rbp),%rax
1224 mulq %r11
1225 addq %rax,%r15
1226 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001227 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001228 addq %r10,%r15
1229 adcq %rdx,%r9
1230 movq %r13,%r10
1231 movq %r14,%r11
1232 movq %r15,%r12
1233 andq $3,%r12
1234 movq %r15,%r13
1235 andq $-4,%r13
1236 movq %r9,%r14
1237 shrdq $2,%r9,%r15
1238 shrq $2,%r9
1239 addq %r13,%r10
1240 adcq %r14,%r11
1241 adcq $0,%r12
1242 addq %r15,%r10
1243 adcq %r9,%r11
1244 adcq $0,%r12
1245
1246 cmpq $192,%rbx
1247 jb 1f
1248 addq 176(%rsi),%r10
1249 adcq 8+176(%rsi),%r11
1250 adcq $1,%r12
1251 movq 0+0(%rbp),%rax
1252 movq %rax,%r15
1253 mulq %r10
1254 movq %rax,%r13
1255 movq %rdx,%r14
1256 movq 0+0(%rbp),%rax
1257 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001258 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001259 addq %rax,%r14
1260 adcq %rdx,%r15
1261 movq 8+0(%rbp),%rax
1262 movq %rax,%r9
1263 mulq %r10
1264 addq %rax,%r14
1265 adcq $0,%rdx
1266 movq %rdx,%r10
1267 movq 8+0(%rbp),%rax
1268 mulq %r11
1269 addq %rax,%r15
1270 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001271 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001272 addq %r10,%r15
1273 adcq %rdx,%r9
1274 movq %r13,%r10
1275 movq %r14,%r11
1276 movq %r15,%r12
1277 andq $3,%r12
1278 movq %r15,%r13
1279 andq $-4,%r13
1280 movq %r9,%r14
1281 shrdq $2,%r9,%r15
1282 shrq $2,%r9
1283 addq %r13,%r10
1284 adcq %r14,%r11
1285 adcq $0,%r12
1286 addq %r15,%r10
1287 adcq %r9,%r11
1288 adcq $0,%r12
1289
12901:
1291 paddd .chacha20_consts(%rip),%xmm2
1292 paddd 48(%rbp),%xmm6
1293 paddd 64(%rbp),%xmm10
1294 paddd 128(%rbp),%xmm14
1295 paddd .chacha20_consts(%rip),%xmm1
1296 paddd 48(%rbp),%xmm5
1297 paddd 64(%rbp),%xmm9
1298 paddd 112(%rbp),%xmm13
1299 paddd .chacha20_consts(%rip),%xmm0
1300 paddd 48(%rbp),%xmm4
1301 paddd 64(%rbp),%xmm8
1302 paddd 96(%rbp),%xmm12
1303 movdqu 0 + 0(%rsi),%xmm3
1304 movdqu 16 + 0(%rsi),%xmm7
1305 movdqu 32 + 0(%rsi),%xmm11
1306 movdqu 48 + 0(%rsi),%xmm15
1307 pxor %xmm3,%xmm2
1308 pxor %xmm7,%xmm6
1309 pxor %xmm11,%xmm10
1310 pxor %xmm14,%xmm15
1311 movdqu %xmm2,0 + 0(%rdi)
1312 movdqu %xmm6,16 + 0(%rdi)
1313 movdqu %xmm10,32 + 0(%rdi)
1314 movdqu %xmm15,48 + 0(%rdi)
1315 movdqu 0 + 64(%rsi),%xmm3
1316 movdqu 16 + 64(%rsi),%xmm7
1317 movdqu 32 + 64(%rsi),%xmm11
1318 movdqu 48 + 64(%rsi),%xmm15
1319 pxor %xmm3,%xmm1
1320 pxor %xmm7,%xmm5
1321 pxor %xmm11,%xmm9
1322 pxor %xmm13,%xmm15
1323 movdqu %xmm1,0 + 64(%rdi)
1324 movdqu %xmm5,16 + 64(%rdi)
1325 movdqu %xmm9,32 + 64(%rdi)
1326 movdqu %xmm15,48 + 64(%rdi)
1327
1328 subq $128,%rbx
1329 leaq 128(%rsi),%rsi
1330 leaq 128(%rdi),%rdi
1331 jmp open_sse_tail_64_dec_loop
13323:
1333
1334 movdqa .chacha20_consts(%rip),%xmm0
1335 movdqa 48(%rbp),%xmm4
1336 movdqa 64(%rbp),%xmm8
1337 movdqa %xmm0,%xmm1
1338 movdqa %xmm4,%xmm5
1339 movdqa %xmm8,%xmm9
1340 movdqa %xmm0,%xmm2
1341 movdqa %xmm4,%xmm6
1342 movdqa %xmm8,%xmm10
1343 movdqa %xmm0,%xmm3
1344 movdqa %xmm4,%xmm7
1345 movdqa %xmm8,%xmm11
1346 movdqa 96(%rbp),%xmm15
1347 paddd .sse_inc(%rip),%xmm15
1348 movdqa %xmm15,%xmm14
1349 paddd .sse_inc(%rip),%xmm14
1350 movdqa %xmm14,%xmm13
1351 paddd .sse_inc(%rip),%xmm13
1352 movdqa %xmm13,%xmm12
1353 paddd .sse_inc(%rip),%xmm12
1354 movdqa %xmm12,96(%rbp)
1355 movdqa %xmm13,112(%rbp)
1356 movdqa %xmm14,128(%rbp)
1357 movdqa %xmm15,144(%rbp)
1358
1359 xorq %r8,%r8
13601:
1361 addq 0(%rsi,%r8), %r10
1362 adcq 8+0(%rsi,%r8), %r11
1363 adcq $1,%r12
1364 movdqa %xmm11,80(%rbp)
1365 paddd %xmm4,%xmm0
1366 pxor %xmm0,%xmm12
1367 pshufb .rol16(%rip),%xmm12
1368 paddd %xmm12,%xmm8
1369 pxor %xmm8,%xmm4
1370 movdqa %xmm4,%xmm11
1371 pslld $12,%xmm11
1372 psrld $20,%xmm4
1373 pxor %xmm11,%xmm4
1374 paddd %xmm4,%xmm0
1375 pxor %xmm0,%xmm12
1376 pshufb .rol8(%rip),%xmm12
1377 paddd %xmm12,%xmm8
1378 pxor %xmm8,%xmm4
1379 movdqa %xmm4,%xmm11
1380 pslld $7,%xmm11
1381 psrld $25,%xmm4
1382 pxor %xmm11,%xmm4
1383.byte 102,15,58,15,228,4
1384.byte 102,69,15,58,15,192,8
1385.byte 102,69,15,58,15,228,12
1386 paddd %xmm5,%xmm1
1387 pxor %xmm1,%xmm13
1388 pshufb .rol16(%rip),%xmm13
1389 paddd %xmm13,%xmm9
1390 pxor %xmm9,%xmm5
1391 movdqa %xmm5,%xmm11
1392 pslld $12,%xmm11
1393 psrld $20,%xmm5
1394 pxor %xmm11,%xmm5
1395 paddd %xmm5,%xmm1
1396 pxor %xmm1,%xmm13
1397 pshufb .rol8(%rip),%xmm13
1398 paddd %xmm13,%xmm9
1399 pxor %xmm9,%xmm5
1400 movdqa %xmm5,%xmm11
1401 pslld $7,%xmm11
1402 psrld $25,%xmm5
1403 pxor %xmm11,%xmm5
1404.byte 102,15,58,15,237,4
1405.byte 102,69,15,58,15,201,8
1406.byte 102,69,15,58,15,237,12
1407 paddd %xmm6,%xmm2
1408 pxor %xmm2,%xmm14
1409 pshufb .rol16(%rip),%xmm14
1410 paddd %xmm14,%xmm10
1411 pxor %xmm10,%xmm6
1412 movdqa %xmm6,%xmm11
1413 pslld $12,%xmm11
1414 psrld $20,%xmm6
1415 pxor %xmm11,%xmm6
1416 paddd %xmm6,%xmm2
1417 pxor %xmm2,%xmm14
1418 pshufb .rol8(%rip),%xmm14
1419 paddd %xmm14,%xmm10
1420 pxor %xmm10,%xmm6
1421 movdqa %xmm6,%xmm11
1422 pslld $7,%xmm11
1423 psrld $25,%xmm6
1424 pxor %xmm11,%xmm6
1425.byte 102,15,58,15,246,4
1426.byte 102,69,15,58,15,210,8
1427.byte 102,69,15,58,15,246,12
1428 movdqa 80(%rbp),%xmm11
1429 movq 0+0(%rbp),%rax
1430 movq %rax,%r15
1431 mulq %r10
1432 movq %rax,%r13
1433 movq %rdx,%r14
1434 movq 0+0(%rbp),%rax
1435 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001436 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001437 addq %rax,%r14
1438 adcq %rdx,%r15
1439 movdqa %xmm9,80(%rbp)
1440 paddd %xmm7,%xmm3
1441 pxor %xmm3,%xmm15
1442 pshufb .rol16(%rip),%xmm15
1443 paddd %xmm15,%xmm11
1444 pxor %xmm11,%xmm7
1445 movdqa %xmm7,%xmm9
1446 pslld $12,%xmm9
1447 psrld $20,%xmm7
1448 pxor %xmm9,%xmm7
1449 paddd %xmm7,%xmm3
1450 pxor %xmm3,%xmm15
1451 pshufb .rol8(%rip),%xmm15
1452 paddd %xmm15,%xmm11
1453 pxor %xmm11,%xmm7
1454 movdqa %xmm7,%xmm9
1455 pslld $7,%xmm9
1456 psrld $25,%xmm7
1457 pxor %xmm9,%xmm7
1458.byte 102,15,58,15,255,4
1459.byte 102,69,15,58,15,219,8
1460.byte 102,69,15,58,15,255,12
1461 movdqa 80(%rbp),%xmm9
1462 movq 8+0(%rbp),%rax
1463 movq %rax,%r9
1464 mulq %r10
1465 addq %rax,%r14
1466 adcq $0,%rdx
1467 movq %rdx,%r10
1468 movq 8+0(%rbp),%rax
1469 mulq %r11
1470 addq %rax,%r15
1471 adcq $0,%rdx
1472 movdqa %xmm11,80(%rbp)
1473 paddd %xmm4,%xmm0
1474 pxor %xmm0,%xmm12
1475 pshufb .rol16(%rip),%xmm12
1476 paddd %xmm12,%xmm8
1477 pxor %xmm8,%xmm4
1478 movdqa %xmm4,%xmm11
1479 pslld $12,%xmm11
1480 psrld $20,%xmm4
1481 pxor %xmm11,%xmm4
1482 paddd %xmm4,%xmm0
1483 pxor %xmm0,%xmm12
1484 pshufb .rol8(%rip),%xmm12
1485 paddd %xmm12,%xmm8
1486 pxor %xmm8,%xmm4
1487 movdqa %xmm4,%xmm11
1488 pslld $7,%xmm11
1489 psrld $25,%xmm4
1490 pxor %xmm11,%xmm4
1491.byte 102,15,58,15,228,12
1492.byte 102,69,15,58,15,192,8
1493.byte 102,69,15,58,15,228,4
1494 paddd %xmm5,%xmm1
1495 pxor %xmm1,%xmm13
1496 pshufb .rol16(%rip),%xmm13
1497 paddd %xmm13,%xmm9
1498 pxor %xmm9,%xmm5
1499 movdqa %xmm5,%xmm11
1500 pslld $12,%xmm11
1501 psrld $20,%xmm5
1502 pxor %xmm11,%xmm5
1503 paddd %xmm5,%xmm1
1504 pxor %xmm1,%xmm13
1505 pshufb .rol8(%rip),%xmm13
1506 paddd %xmm13,%xmm9
1507 pxor %xmm9,%xmm5
1508 movdqa %xmm5,%xmm11
1509 pslld $7,%xmm11
1510 psrld $25,%xmm5
1511 pxor %xmm11,%xmm5
1512.byte 102,15,58,15,237,12
1513.byte 102,69,15,58,15,201,8
1514.byte 102,69,15,58,15,237,4
Robert Sloan4d1ac502017-02-06 08:36:14 -08001515 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001516 addq %r10,%r15
1517 adcq %rdx,%r9
1518 paddd %xmm6,%xmm2
1519 pxor %xmm2,%xmm14
1520 pshufb .rol16(%rip),%xmm14
1521 paddd %xmm14,%xmm10
1522 pxor %xmm10,%xmm6
1523 movdqa %xmm6,%xmm11
1524 pslld $12,%xmm11
1525 psrld $20,%xmm6
1526 pxor %xmm11,%xmm6
1527 paddd %xmm6,%xmm2
1528 pxor %xmm2,%xmm14
1529 pshufb .rol8(%rip),%xmm14
1530 paddd %xmm14,%xmm10
1531 pxor %xmm10,%xmm6
1532 movdqa %xmm6,%xmm11
1533 pslld $7,%xmm11
1534 psrld $25,%xmm6
1535 pxor %xmm11,%xmm6
1536.byte 102,15,58,15,246,12
1537.byte 102,69,15,58,15,210,8
1538.byte 102,69,15,58,15,246,4
1539 movdqa 80(%rbp),%xmm11
1540 movq %r13,%r10
1541 movq %r14,%r11
1542 movq %r15,%r12
1543 andq $3,%r12
1544 movq %r15,%r13
1545 andq $-4,%r13
1546 movq %r9,%r14
1547 shrdq $2,%r9,%r15
1548 shrq $2,%r9
1549 addq %r13,%r10
1550 adcq %r14,%r11
1551 adcq $0,%r12
1552 addq %r15,%r10
1553 adcq %r9,%r11
1554 adcq $0,%r12
1555 movdqa %xmm9,80(%rbp)
1556 paddd %xmm7,%xmm3
1557 pxor %xmm3,%xmm15
1558 pshufb .rol16(%rip),%xmm15
1559 paddd %xmm15,%xmm11
1560 pxor %xmm11,%xmm7
1561 movdqa %xmm7,%xmm9
1562 pslld $12,%xmm9
1563 psrld $20,%xmm7
1564 pxor %xmm9,%xmm7
1565 paddd %xmm7,%xmm3
1566 pxor %xmm3,%xmm15
1567 pshufb .rol8(%rip),%xmm15
1568 paddd %xmm15,%xmm11
1569 pxor %xmm11,%xmm7
1570 movdqa %xmm7,%xmm9
1571 pslld $7,%xmm9
1572 psrld $25,%xmm7
1573 pxor %xmm9,%xmm7
1574.byte 102,15,58,15,255,12
1575.byte 102,69,15,58,15,219,8
1576.byte 102,69,15,58,15,255,4
1577 movdqa 80(%rbp),%xmm9
1578
1579 addq $16,%r8
1580 cmpq $160,%r8
1581 jb 1b
1582 movq %rbx,%rcx
1583 andq $-16,%rcx
15841:
1585 addq 0(%rsi,%r8), %r10
1586 adcq 8+0(%rsi,%r8), %r11
1587 adcq $1,%r12
1588 movq 0+0(%rbp),%rax
1589 movq %rax,%r15
1590 mulq %r10
1591 movq %rax,%r13
1592 movq %rdx,%r14
1593 movq 0+0(%rbp),%rax
1594 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001595 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001596 addq %rax,%r14
1597 adcq %rdx,%r15
1598 movq 8+0(%rbp),%rax
1599 movq %rax,%r9
1600 mulq %r10
1601 addq %rax,%r14
1602 adcq $0,%rdx
1603 movq %rdx,%r10
1604 movq 8+0(%rbp),%rax
1605 mulq %r11
1606 addq %rax,%r15
1607 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001608 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001609 addq %r10,%r15
1610 adcq %rdx,%r9
1611 movq %r13,%r10
1612 movq %r14,%r11
1613 movq %r15,%r12
1614 andq $3,%r12
1615 movq %r15,%r13
1616 andq $-4,%r13
1617 movq %r9,%r14
1618 shrdq $2,%r9,%r15
1619 shrq $2,%r9
1620 addq %r13,%r10
1621 adcq %r14,%r11
1622 adcq $0,%r12
1623 addq %r15,%r10
1624 adcq %r9,%r11
1625 adcq $0,%r12
1626
1627 addq $16,%r8
1628 cmpq %rcx,%r8
1629 jb 1b
1630 paddd .chacha20_consts(%rip),%xmm3
1631 paddd 48(%rbp),%xmm7
1632 paddd 64(%rbp),%xmm11
1633 paddd 144(%rbp),%xmm15
1634 paddd .chacha20_consts(%rip),%xmm2
1635 paddd 48(%rbp),%xmm6
1636 paddd 64(%rbp),%xmm10
1637 paddd 128(%rbp),%xmm14
1638 paddd .chacha20_consts(%rip),%xmm1
1639 paddd 48(%rbp),%xmm5
1640 paddd 64(%rbp),%xmm9
1641 paddd 112(%rbp),%xmm13
1642 paddd .chacha20_consts(%rip),%xmm0
1643 paddd 48(%rbp),%xmm4
1644 paddd 64(%rbp),%xmm8
1645 paddd 96(%rbp),%xmm12
1646 movdqa %xmm12,80(%rbp)
1647 movdqu 0 + 0(%rsi),%xmm12
1648 pxor %xmm3,%xmm12
1649 movdqu %xmm12,0 + 0(%rdi)
1650 movdqu 16 + 0(%rsi),%xmm12
1651 pxor %xmm7,%xmm12
1652 movdqu %xmm12,16 + 0(%rdi)
1653 movdqu 32 + 0(%rsi),%xmm12
1654 pxor %xmm11,%xmm12
1655 movdqu %xmm12,32 + 0(%rdi)
1656 movdqu 48 + 0(%rsi),%xmm12
1657 pxor %xmm15,%xmm12
1658 movdqu %xmm12,48 + 0(%rdi)
1659 movdqu 0 + 64(%rsi),%xmm3
1660 movdqu 16 + 64(%rsi),%xmm7
1661 movdqu 32 + 64(%rsi),%xmm11
1662 movdqu 48 + 64(%rsi),%xmm15
1663 pxor %xmm3,%xmm2
1664 pxor %xmm7,%xmm6
1665 pxor %xmm11,%xmm10
1666 pxor %xmm14,%xmm15
1667 movdqu %xmm2,0 + 64(%rdi)
1668 movdqu %xmm6,16 + 64(%rdi)
1669 movdqu %xmm10,32 + 64(%rdi)
1670 movdqu %xmm15,48 + 64(%rdi)
1671 movdqu 0 + 128(%rsi),%xmm3
1672 movdqu 16 + 128(%rsi),%xmm7
1673 movdqu 32 + 128(%rsi),%xmm11
1674 movdqu 48 + 128(%rsi),%xmm15
1675 pxor %xmm3,%xmm1
1676 pxor %xmm7,%xmm5
1677 pxor %xmm11,%xmm9
1678 pxor %xmm13,%xmm15
1679 movdqu %xmm1,0 + 128(%rdi)
1680 movdqu %xmm5,16 + 128(%rdi)
1681 movdqu %xmm9,32 + 128(%rdi)
1682 movdqu %xmm15,48 + 128(%rdi)
1683
1684 movdqa 80(%rbp),%xmm12
1685 subq $192,%rbx
1686 leaq 192(%rsi),%rsi
1687 leaq 192(%rdi),%rdi
1688
1689
/*
 * open_sse_tail_64_dec_loop: XOR the remaining input with queued 16-byte
 * masks, one whole 16-byte chunk per iteration.
 *
 * In:  rsi = input pointer, rdi = output pointer, rbx = bytes remaining,
 *      xmm0, xmm4, xmm8, xmm12 = queued XOR masks, consumed in that order.
 * Out: rsi and rdi advanced, rbx reduced by 16 per chunk written;
 *      xmm1 = the next unused mask. Execution then falls through into
 *      open_sse_tail_16, which XORs the final partial chunk with xmm1.
 *
 * Nothing is added to the Poly1305 accumulator here; this loop only XORs
 * and stores.
 * NOTE(review): only four masks are queued, so callers presumably arrive
 * with rbx <= 64 - confirm against the jump sites.
 */
1690open_sse_tail_64_dec_loop:
/* Unsigned compare: leave the loop once fewer than 16 bytes remain. */
1691 cmpq $16,%rbx
1692 jb 1f
1693 subq $16,%rbx
/* out[0..15] = in[0..15] ^ xmm0 (unaligned load and store). */
1694 movdqu (%rsi),%xmm3
1695 pxor %xmm3,%xmm0
1696 movdqu %xmm0,(%rdi)
1697 leaq 16(%rsi),%rsi
1698 leaq 16(%rdi),%rdi
/* Shift the mask queue down one slot: xmm0 <- xmm4 <- xmm8 <- xmm12. */
1699 movdqa %xmm4,%xmm0
1700 movdqa %xmm8,%xmm4
1701 movdqa %xmm12,%xmm8
1702 jmp open_sse_tail_64_dec_loop
17031:
/* Hand the head of the queue to open_sse_tail_16 in xmm1. */
1704 movdqa %xmm0,%xmm1
1705
1706
/*
 * open_sse_tail_16: process the final partial chunk of the open path.
 *
 * In:  rsi = input pointer, rdi = output pointer, rbx = bytes remaining
 *      (both visible jump sites guarantee rbx < 16), xmm1 = XOR mask,
 *      r10:r11:r12 = Poly1305 accumulator (low, middle, high limb),
 *      0(%rbp) and 8(%rbp) = the 128-bit Poly1305 multiplier.
 * Out: rbx input bytes XORed with xmm1 and written to rdi; the zero-padded
 *      input chunk absorbed into the accumulator. Falls through into
 *      open_sse_finalize.
 * Clobbers: rax, rdx, r8, r9, r13, r14, r15, xmm3, flags; rsi ends one byte
 *      below the start of the tail, rdi is advanced, rbx becomes 0.
 */
1707open_sse_tail_16:
/* No bytes left: go straight to tag finalization. */
1708 testq %rbx,%rbx
1709 jz open_sse_finalize
1710
1711
1712
/*
 * Gather the tail into xmm3 one byte at a time, starting from the last
 * byte: each step shifts xmm3 up by one byte and inserts the next lower
 * input byte at position 0. The result is the input in memory order,
 * zero-padded to 16 bytes. No load touches memory beyond the tail.
 */
1713 pxor %xmm3,%xmm3
1714 leaq -1(%rsi,%rbx), %rsi
1715 movq %rbx,%r8
17162:
1717 pslldq $1,%xmm3
1718 pinsrb $0,(%rsi),%xmm3
1719 subq $1,%rsi
1720 subq $1,%r8
1721 jnz 2b
1722
17233:
/*
 * r13:r14 = the padded input chunk, kept for hashing below.
 * The .byte sequence 66 49 0F 7E DD encodes movq %xmm3,%r13.
 */
1724.byte 102,73,15,126,221
1725 pextrq $1,%xmm3,%r14
1726
/* Apply the mask; hashing below still uses the pre-XOR copy in r13:r14. */
1727 pxor %xmm1,%xmm3
1728
1729
/* Scatter exactly rbx result bytes to the output, lowest byte first. */
17302:
1731 pextrb $0,%xmm3,(%rdi)
1732 psrldq $1,%xmm3
1733 addq $1,%rdi
1734 subq $1,%rbx
1735 jne 2b
1736
/* accumulator += padded chunk + 2^128 (the adcq $1 supplies the high bit). */
1737 addq %r13,%r10
1738 adcq %r14,%r11
1739 adcq $1,%r12
/*
 * Multiply the three-limb accumulator by the two 64-bit words at 0(%rbp)
 * and 8(%rbp). The product is collected in r13 (lowest), r14, r15, r9
 * (highest).
 */
1740 movq 0+0(%rbp),%rax
1741 movq %rax,%r15
1742 mulq %r10
1743 movq %rax,%r13
1744 movq %rdx,%r14
1745 movq 0+0(%rbp),%rax
1746 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001747 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001748 addq %rax,%r14
1749 adcq %rdx,%r15
1750 movq 8+0(%rbp),%rax
1751 movq %rax,%r9
1752 mulq %r10
1753 addq %rax,%r14
1754 adcq $0,%rdx
1755 movq %rdx,%r10
1756 movq 8+0(%rbp),%rax
1757 mulq %r11
1758 addq %rax,%r15
1759 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001760 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001761 addq %r10,%r15
1762 adcq %rdx,%r9
/*
 * Partial reduction modulo 2^130 - 5. Keep the low 130 bits in
 * r10:r11:r12, then add the part above bit 130 (call it c) back in twice:
 * once as 4*c (r15 with its low two bits cleared, paired with r9) and once
 * as c (r15:r9 shifted right by two), for a total of 5*c.
 */
1763 movq %r13,%r10
1764 movq %r14,%r11
1765 movq %r15,%r12
1766 andq $3,%r12
1767 movq %r15,%r13
1768 andq $-4,%r13
1769 movq %r9,%r14
1770 shrdq $2,%r9,%r15
1771 shrq $2,%r9
1772 addq %r13,%r10
1773 adcq %r14,%r11
1774 adcq $0,%r12
1775 addq %r15,%r10
1776 adcq %r9,%r11
1777 adcq $0,%r12
1778
1779
/*
 * open_sse_finalize: finish the Poly1305 computation, write the 16-byte
 * tag, tear down the frame and return from chacha20_poly1305_open.
 *
 * In:  r10:r11:r12 = Poly1305 accumulator (low, middle, high limb),
 *      0(%rbp), 8(%rbp)   = 128-bit Poly1305 multiplier,
 *      16(%rbp), 24(%rbp) = 128-bit value added to the reduced accumulator,
 *      32(%rbp), 40(%rbp) = final 16-byte block to absorb
 *          (NOTE(review): presumably the two length words;
 *          chacha20_poly1305_seal stores r8 and rdx there, but the open
 *          prologue is outside this view - confirm),
 *      stack = 288 + 32 bytes of locals, then one saved quadword, then
 *          r15, r14, r13, r12, rbx, rbp.
 * Out: 16 bytes written through the pointer popped into r9; r12-r15, rbx
 *      and rbp restored; returns to the caller.
 */
1780open_sse_finalize:
/* accumulator += 16 bytes at 32(%rbp) + 2^128. */
1781 addq 32(%rbp),%r10
1782 adcq 8+32(%rbp),%r11
1783 adcq $1,%r12
/*
 * Multiply the accumulator by the words at 0(%rbp) and 8(%rbp); the
 * product lands in r13 (lowest), r14, r15, r9 (highest).
 */
1784 movq 0+0(%rbp),%rax
1785 movq %rax,%r15
1786 mulq %r10
1787 movq %rax,%r13
1788 movq %rdx,%r14
1789 movq 0+0(%rbp),%rax
1790 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001791 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001792 addq %rax,%r14
1793 adcq %rdx,%r15
1794 movq 8+0(%rbp),%rax
1795 movq %rax,%r9
1796 mulq %r10
1797 addq %rax,%r14
1798 adcq $0,%rdx
1799 movq %rdx,%r10
1800 movq 8+0(%rbp),%rax
1801 mulq %r11
1802 addq %rax,%r15
1803 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001804 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001805 addq %r10,%r15
1806 adcq %rdx,%r9
/*
 * Partial reduction modulo 2^130 - 5: keep the low 130 bits and add the
 * bits above them back in as 4*c (low two bits of r15 cleared) plus c
 * (r15:r9 shifted right by two).
 */
1807 movq %r13,%r10
1808 movq %r14,%r11
1809 movq %r15,%r12
1810 andq $3,%r12
1811 movq %r15,%r13
1812 andq $-4,%r13
1813 movq %r9,%r14
1814 shrdq $2,%r9,%r15
1815 shrq $2,%r9
1816 addq %r13,%r10
1817 adcq %r14,%r11
1818 adcq $0,%r12
1819 addq %r15,%r10
1820 adcq %r9,%r11
1821 adcq $0,%r12
1822
1823
/*
 * Final reduction without a branch: subtract 2^130 - 5 (limbs -5, -1, 3).
 * If that borrows, the accumulator was already below the modulus, so the
 * saved copy in r13:r14:r15 is moved back with cmovc.
 */
1824 movq %r10,%r13
1825 movq %r11,%r14
1826 movq %r12,%r15
1827 subq $-5,%r10
1828 sbbq $-1,%r11
1829 sbbq $3,%r12
1830 cmovcq %r13,%r10
1831 cmovcq %r14,%r11
1832 cmovcq %r15,%r12
1833
/* Add the 128-bit value at 16(%rbp); the carry out of bit 127 is dropped. */
1834 addq 0+16(%rbp),%r10
1835 adcq 8+16(%rbp),%r11
1836
/* Release the locals, then pop the saved output pointer and store the tag. */
1837 addq $288 + 32,%rsp
1838.cfi_adjust_cfa_offset -(288 + 32)
1839 popq %r9
1840.cfi_adjust_cfa_offset -8
1841 movq %r10,(%r9)
1842 movq %r11,8(%r9)
1843
/* Restore the callee-saved registers in reverse push order. */
1844 popq %r15
1845.cfi_adjust_cfa_offset -8
1846 popq %r14
1847.cfi_adjust_cfa_offset -8
1848 popq %r13
1849.cfi_adjust_cfa_offset -8
1850 popq %r12
1851.cfi_adjust_cfa_offset -8
1852 popq %rbx
1853.cfi_adjust_cfa_offset -8
1854 popq %rbp
1855.cfi_adjust_cfa_offset -8
/* F3 C3 is "rep ret". */
1856 .byte 0xf3,0xc3
/*
 * The code that follows the return runs with the full frame still in
 * place, so the CFA offset must be put back to its pre-epilogue value.
 * The epilogue above lowered it by 288 + 32 and by seven 8-byte pops (r9,
 * r15, r14, r13, r12, rbx, rbp), hence 8 * 7 here. The previous 8 * 6 left
 * the unwind information 8 bytes short for everything after this point.
 */
1857.cfi_adjust_cfa_offset (8 * 7) + 288 + 32
1858
/*
 * open_sse_128: short-input path of chacha20_poly1305_open using three
 * ChaCha20 states computed side by side in SSE registers.
 *
 * In:  r9  = pointer to 48 bytes of cipher state (rows at offsets 0, 16
 *            and 32; the row at 32 is the one that gets incremented),
 *      rsi = input pointer, rdi = output pointer, rbx = bytes remaining,
 *      rbp = base of the aligned scratch area.
 *      NOTE(review): the name and the eight queued masks suggest callers
 *      only branch here with rbx <= 128 - the dispatch is outside this
 *      view, confirm.
 * State layout (rows a, b, c, d):
 *      state 0: xmm0, xmm4, xmm8,  xmm12  - supplies the Poly1305 key
 *      state 1: xmm1, xmm5, xmm9,  xmm13  - first four XOR masks
 *      state 2: xmm2, xmm6, xmm10, xmm14  - next four XOR masks
 *      xmm7, xmm11, xmm15 keep copies of the initial rows for the final
 *      add; xmm3 is scratch for the rotates.
 * Exits by jumping to open_sse_tail_16 once fewer than 16 bytes remain.
 */
1859open_sse_128:
/* Row a: the constant row, identical in all three states. */
1860 movdqu .chacha20_consts(%rip),%xmm0
1861 movdqa %xmm0,%xmm1
1862 movdqa %xmm0,%xmm2
/* Rows b and c: loaded from 0(%r9) and 16(%r9), shared by all states. */
1863 movdqu 0(%r9),%xmm4
1864 movdqa %xmm4,%xmm5
1865 movdqa %xmm4,%xmm6
1866 movdqu 16(%r9),%xmm8
1867 movdqa %xmm8,%xmm9
1868 movdqa %xmm8,%xmm10
/*
 * Row d: state 0 uses the row as loaded; states 1 and 2 add .sse_inc
 * (1 in the lowest dword) once and twice respectively.
 */
1869 movdqu 32(%r9),%xmm12
1870 movdqa %xmm12,%xmm13
1871 paddd .sse_inc(%rip),%xmm13
1872 movdqa %xmm13,%xmm14
1873 paddd .sse_inc(%rip),%xmm14
/* Save initial rows b, c and state 1's row d for the post-round add. */
1874 movdqa %xmm4,%xmm7
1875 movdqa %xmm8,%xmm11
1876 movdqa %xmm13,%xmm15
/* 10 iterations, each a column round followed by a diagonal round. */
1877 movq $10,%r10
18781:
/* Column round, state 0: rotates by 16 and 8 via pshufb, 12 and 7 via shifts. */
1879 paddd %xmm4,%xmm0
1880 pxor %xmm0,%xmm12
1881 pshufb .rol16(%rip),%xmm12
1882 paddd %xmm12,%xmm8
1883 pxor %xmm8,%xmm4
1884 movdqa %xmm4,%xmm3
1885 pslld $12,%xmm3
1886 psrld $20,%xmm4
1887 pxor %xmm3,%xmm4
1888 paddd %xmm4,%xmm0
1889 pxor %xmm0,%xmm12
1890 pshufb .rol8(%rip),%xmm12
1891 paddd %xmm12,%xmm8
1892 pxor %xmm8,%xmm4
1893 movdqa %xmm4,%xmm3
1894 pslld $7,%xmm3
1895 psrld $25,%xmm4
1896 pxor %xmm3,%xmm4
/*
 * Hand-encoded palignr $4,%xmm4,%xmm4 / palignr $8,%xmm8,%xmm8 /
 * palignr $12,%xmm12,%xmm12: rotate rows b, c, d so the next round works
 * on diagonals.
 */
1897.byte 102,15,58,15,228,4
1898.byte 102,69,15,58,15,192,8
1899.byte 102,69,15,58,15,228,12
/* Column round, state 1. */
1900 paddd %xmm5,%xmm1
1901 pxor %xmm1,%xmm13
1902 pshufb .rol16(%rip),%xmm13
1903 paddd %xmm13,%xmm9
1904 pxor %xmm9,%xmm5
1905 movdqa %xmm5,%xmm3
1906 pslld $12,%xmm3
1907 psrld $20,%xmm5
1908 pxor %xmm3,%xmm5
1909 paddd %xmm5,%xmm1
1910 pxor %xmm1,%xmm13
1911 pshufb .rol8(%rip),%xmm13
1912 paddd %xmm13,%xmm9
1913 pxor %xmm9,%xmm5
1914 movdqa %xmm5,%xmm3
1915 pslld $7,%xmm3
1916 psrld $25,%xmm5
1917 pxor %xmm3,%xmm5
/* palignr by 4, 8, 12 on xmm5, xmm9, xmm13. */
1918.byte 102,15,58,15,237,4
1919.byte 102,69,15,58,15,201,8
1920.byte 102,69,15,58,15,237,12
/* Column round, state 2. */
1921 paddd %xmm6,%xmm2
1922 pxor %xmm2,%xmm14
1923 pshufb .rol16(%rip),%xmm14
1924 paddd %xmm14,%xmm10
1925 pxor %xmm10,%xmm6
1926 movdqa %xmm6,%xmm3
1927 pslld $12,%xmm3
1928 psrld $20,%xmm6
1929 pxor %xmm3,%xmm6
1930 paddd %xmm6,%xmm2
1931 pxor %xmm2,%xmm14
1932 pshufb .rol8(%rip),%xmm14
1933 paddd %xmm14,%xmm10
1934 pxor %xmm10,%xmm6
1935 movdqa %xmm6,%xmm3
1936 pslld $7,%xmm3
1937 psrld $25,%xmm6
1938 pxor %xmm3,%xmm6
/* palignr by 4, 8, 12 on xmm6, xmm10, xmm14. */
1939.byte 102,15,58,15,246,4
1940.byte 102,69,15,58,15,210,8
1941.byte 102,69,15,58,15,246,12
/* Diagonal round, state 0. */
1942 paddd %xmm4,%xmm0
1943 pxor %xmm0,%xmm12
1944 pshufb .rol16(%rip),%xmm12
1945 paddd %xmm12,%xmm8
1946 pxor %xmm8,%xmm4
1947 movdqa %xmm4,%xmm3
1948 pslld $12,%xmm3
1949 psrld $20,%xmm4
1950 pxor %xmm3,%xmm4
1951 paddd %xmm4,%xmm0
1952 pxor %xmm0,%xmm12
1953 pshufb .rol8(%rip),%xmm12
1954 paddd %xmm12,%xmm8
1955 pxor %xmm8,%xmm4
1956 movdqa %xmm4,%xmm3
1957 pslld $7,%xmm3
1958 psrld $25,%xmm4
1959 pxor %xmm3,%xmm4
/* palignr by 12, 8, 4 on xmm4, xmm8, xmm12: undo the earlier row rotation. */
1960.byte 102,15,58,15,228,12
1961.byte 102,69,15,58,15,192,8
1962.byte 102,69,15,58,15,228,4
/* Diagonal round, state 1. */
1963 paddd %xmm5,%xmm1
1964 pxor %xmm1,%xmm13
1965 pshufb .rol16(%rip),%xmm13
1966 paddd %xmm13,%xmm9
1967 pxor %xmm9,%xmm5
1968 movdqa %xmm5,%xmm3
1969 pslld $12,%xmm3
1970 psrld $20,%xmm5
1971 pxor %xmm3,%xmm5
1972 paddd %xmm5,%xmm1
1973 pxor %xmm1,%xmm13
1974 pshufb .rol8(%rip),%xmm13
1975 paddd %xmm13,%xmm9
1976 pxor %xmm9,%xmm5
1977 movdqa %xmm5,%xmm3
1978 pslld $7,%xmm3
1979 psrld $25,%xmm5
1980 pxor %xmm3,%xmm5
/* palignr by 12, 8, 4 on xmm5, xmm9, xmm13. */
1981.byte 102,15,58,15,237,12
1982.byte 102,69,15,58,15,201,8
1983.byte 102,69,15,58,15,237,4
/* Diagonal round, state 2. */
1984 paddd %xmm6,%xmm2
1985 pxor %xmm2,%xmm14
1986 pshufb .rol16(%rip),%xmm14
1987 paddd %xmm14,%xmm10
1988 pxor %xmm10,%xmm6
1989 movdqa %xmm6,%xmm3
1990 pslld $12,%xmm3
1991 psrld $20,%xmm6
1992 pxor %xmm3,%xmm6
1993 paddd %xmm6,%xmm2
1994 pxor %xmm2,%xmm14
1995 pshufb .rol8(%rip),%xmm14
1996 paddd %xmm14,%xmm10
1997 pxor %xmm10,%xmm6
1998 movdqa %xmm6,%xmm3
1999 pslld $7,%xmm3
2000 psrld $25,%xmm6
2001 pxor %xmm3,%xmm6
/* palignr by 12, 8, 4 on xmm6, xmm10, xmm14. */
2002.byte 102,15,58,15,246,12
2003.byte 102,69,15,58,15,210,8
2004.byte 102,69,15,58,15,246,4
2005
2006 decq %r10
2007 jnz 1b
/*
 * Add the initial state back in. State 0 only gets rows a and b finished
 * (xmm0, xmm4); its rows c and d (xmm8, xmm12) are never read again in this
 * path. xmm15 holds state 1's initial row d and is bumped by .sse_inc to
 * give state 2's.
 */
2008 paddd .chacha20_consts(%rip),%xmm0
2009 paddd .chacha20_consts(%rip),%xmm1
2010 paddd .chacha20_consts(%rip),%xmm2
2011 paddd %xmm7,%xmm4
2012 paddd %xmm7,%xmm5
2013 paddd %xmm7,%xmm6
2014 paddd %xmm11,%xmm9
2015 paddd %xmm11,%xmm10
2016 paddd %xmm15,%xmm13
2017 paddd .sse_inc(%rip),%xmm15
2018 paddd %xmm15,%xmm14
2019
/*
 * Poly1305 key from state 0: row a masked with .clamp becomes the
 * multiplier at 0(%rbp); row b is stored at 16(%rbp), where
 * open_sse_finalize adds it to the reduced accumulator.
 */
2020 pand .clamp(%rip),%xmm0
2021 movdqa %xmm0,0(%rbp)
2022 movdqa %xmm4,16(%rbp)
2023
/* The movq below copies r8 onto itself and has no effect. */
2024 movq %r8,%r8
/*
 * poly_hash_ad_internal is defined outside this view.
 * NOTE(review): presumably it absorbs the additional data and leaves the
 * accumulator in r10:r11:r12 - confirm its register contract.
 */
2025 call poly_hash_ad_internal
/*
 * Main loop: for every whole 16-byte chunk, absorb the input into the
 * accumulator, XOR it with the head mask (xmm1), then advance the queue.
 */
20261:
2027 cmpq $16,%rbx
2028 jb open_sse_tail_16
2029 subq $16,%rbx
/* accumulator += input chunk + 2^128; the input is hashed before the XOR. */
2030 addq 0(%rsi),%r10
2031 adcq 8+0(%rsi),%r11
2032 adcq $1,%r12
2033
2034
2035 movdqu 0(%rsi),%xmm3
2036 pxor %xmm3,%xmm1
2037 movdqu %xmm1,0(%rdi)
2038 leaq 16(%rsi),%rsi
2039 leaq 16(%rdi),%rdi
/*
 * Multiply the accumulator by the words at 0(%rbp) and 8(%rbp); the
 * product lands in r13 (lowest), r14, r15, r9 (highest).
 */
2040 movq 0+0(%rbp),%rax
2041 movq %rax,%r15
2042 mulq %r10
2043 movq %rax,%r13
2044 movq %rdx,%r14
2045 movq 0+0(%rbp),%rax
2046 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002047 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002048 addq %rax,%r14
2049 adcq %rdx,%r15
2050 movq 8+0(%rbp),%rax
2051 movq %rax,%r9
2052 mulq %r10
2053 addq %rax,%r14
2054 adcq $0,%rdx
2055 movq %rdx,%r10
2056 movq 8+0(%rbp),%rax
2057 mulq %r11
2058 addq %rax,%r15
2059 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002060 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002061 addq %r10,%r15
2062 adcq %rdx,%r9
/*
 * Partial reduction modulo 2^130 - 5: keep the low 130 bits and add the
 * bits above them back in as 4*c plus c.
 */
2063 movq %r13,%r10
2064 movq %r14,%r11
2065 movq %r15,%r12
2066 andq $3,%r12
2067 movq %r15,%r13
2068 andq $-4,%r13
2069 movq %r9,%r14
2070 shrdq $2,%r9,%r15
2071 shrq $2,%r9
2072 addq %r13,%r10
2073 adcq %r14,%r11
2074 adcq $0,%r12
2075 addq %r15,%r10
2076 adcq %r9,%r11
2077 adcq $0,%r12
2078
2079
/*
 * Advance the mask queue by one slot:
 * xmm1 <- xmm5 <- xmm9 <- xmm13 <- xmm2 <- xmm6 <- xmm10 <- xmm14.
 */
2080 movdqa %xmm5,%xmm1
2081 movdqa %xmm9,%xmm5
2082 movdqa %xmm13,%xmm9
2083 movdqa %xmm2,%xmm13
2084 movdqa %xmm6,%xmm2
2085 movdqa %xmm10,%xmm6
2086 movdqa %xmm14,%xmm10
2087 jmp 1b
/* Never reached: the unconditional jmp above always transfers control. */
2088 jmp open_sse_tail_16
2089.size chacha20_poly1305_open, .-chacha20_poly1305_open
2090.cfi_endproc
2091
2092
2093
2094
2095.globl chacha20_poly1305_seal
2096.hidden chacha20_poly1305_seal
2097.type chacha20_poly1305_seal,@function
2098.align 64
2099chacha20_poly1305_seal:
2100.cfi_startproc
2101 pushq %rbp
2102.cfi_adjust_cfa_offset 8
2103 pushq %rbx
2104.cfi_adjust_cfa_offset 8
2105 pushq %r12
2106.cfi_adjust_cfa_offset 8
2107 pushq %r13
2108.cfi_adjust_cfa_offset 8
2109 pushq %r14
2110.cfi_adjust_cfa_offset 8
2111 pushq %r15
2112.cfi_adjust_cfa_offset 8
2113
2114
2115 pushq %r9
2116.cfi_adjust_cfa_offset 8
2117 subq $288 + 32,%rsp
2118.cfi_adjust_cfa_offset 288 + 32
2119.cfi_offset rbp, -16
2120.cfi_offset rbx, -24
2121.cfi_offset r12, -32
2122.cfi_offset r13, -40
2123.cfi_offset r14, -48
2124.cfi_offset r15, -56
David Benjaminf31229b2017-01-25 14:08:15 -05002125 leaq 32(%rsp),%rbp
2126 andq $-32,%rbp
2127 movq %rdx,8+32(%rbp)
2128 movq %r8,0+32(%rbp)
2129 movq %rdx,%rbx
2130
2131 movl OPENSSL_ia32cap_P+8(%rip),%eax
2132 andl $288,%eax
2133 xorl $288,%eax
2134 jz chacha20_poly1305_seal_avx2
2135
2136 cmpq $128,%rbx
2137 jbe seal_sse_128
2138
2139 movdqa .chacha20_consts(%rip),%xmm0
2140 movdqu 0(%r9),%xmm4
2141 movdqu 16(%r9),%xmm8
2142 movdqu 32(%r9),%xmm12
2143 movdqa %xmm0,%xmm1
2144 movdqa %xmm0,%xmm2
2145 movdqa %xmm0,%xmm3
2146 movdqa %xmm4,%xmm5
2147 movdqa %xmm4,%xmm6
2148 movdqa %xmm4,%xmm7
2149 movdqa %xmm8,%xmm9
2150 movdqa %xmm8,%xmm10
2151 movdqa %xmm8,%xmm11
2152 movdqa %xmm12,%xmm15
2153 paddd .sse_inc(%rip),%xmm12
2154 movdqa %xmm12,%xmm14
2155 paddd .sse_inc(%rip),%xmm12
2156 movdqa %xmm12,%xmm13
2157 paddd .sse_inc(%rip),%xmm12
2158
2159 movdqa %xmm4,48(%rbp)
2160 movdqa %xmm8,64(%rbp)
2161 movdqa %xmm12,96(%rbp)
2162 movdqa %xmm13,112(%rbp)
2163 movdqa %xmm14,128(%rbp)
2164 movdqa %xmm15,144(%rbp)
2165 movq $10,%r10
21661:
2167 movdqa %xmm8,80(%rbp)
2168 movdqa .rol16(%rip),%xmm8
2169 paddd %xmm7,%xmm3
2170 paddd %xmm6,%xmm2
2171 paddd %xmm5,%xmm1
2172 paddd %xmm4,%xmm0
2173 pxor %xmm3,%xmm15
2174 pxor %xmm2,%xmm14
2175 pxor %xmm1,%xmm13
2176 pxor %xmm0,%xmm12
2177.byte 102,69,15,56,0,248
2178.byte 102,69,15,56,0,240
2179.byte 102,69,15,56,0,232
2180.byte 102,69,15,56,0,224
2181 movdqa 80(%rbp),%xmm8
2182 paddd %xmm15,%xmm11
2183 paddd %xmm14,%xmm10
2184 paddd %xmm13,%xmm9
2185 paddd %xmm12,%xmm8
2186 pxor %xmm11,%xmm7
2187 pxor %xmm10,%xmm6
2188 pxor %xmm9,%xmm5
2189 pxor %xmm8,%xmm4
2190 movdqa %xmm8,80(%rbp)
2191 movdqa %xmm7,%xmm8
2192 psrld $20,%xmm8
2193 pslld $32-20,%xmm7
2194 pxor %xmm8,%xmm7
2195 movdqa %xmm6,%xmm8
2196 psrld $20,%xmm8
2197 pslld $32-20,%xmm6
2198 pxor %xmm8,%xmm6
2199 movdqa %xmm5,%xmm8
2200 psrld $20,%xmm8
2201 pslld $32-20,%xmm5
2202 pxor %xmm8,%xmm5
2203 movdqa %xmm4,%xmm8
2204 psrld $20,%xmm8
2205 pslld $32-20,%xmm4
2206 pxor %xmm8,%xmm4
2207 movdqa .rol8(%rip),%xmm8
2208 paddd %xmm7,%xmm3
2209 paddd %xmm6,%xmm2
2210 paddd %xmm5,%xmm1
2211 paddd %xmm4,%xmm0
2212 pxor %xmm3,%xmm15
2213 pxor %xmm2,%xmm14
2214 pxor %xmm1,%xmm13
2215 pxor %xmm0,%xmm12
2216.byte 102,69,15,56,0,248
2217.byte 102,69,15,56,0,240
2218.byte 102,69,15,56,0,232
2219.byte 102,69,15,56,0,224
2220 movdqa 80(%rbp),%xmm8
2221 paddd %xmm15,%xmm11
2222 paddd %xmm14,%xmm10
2223 paddd %xmm13,%xmm9
2224 paddd %xmm12,%xmm8
2225 pxor %xmm11,%xmm7
2226 pxor %xmm10,%xmm6
2227 pxor %xmm9,%xmm5
2228 pxor %xmm8,%xmm4
2229 movdqa %xmm8,80(%rbp)
2230 movdqa %xmm7,%xmm8
2231 psrld $25,%xmm8
2232 pslld $32-25,%xmm7
2233 pxor %xmm8,%xmm7
2234 movdqa %xmm6,%xmm8
2235 psrld $25,%xmm8
2236 pslld $32-25,%xmm6
2237 pxor %xmm8,%xmm6
2238 movdqa %xmm5,%xmm8
2239 psrld $25,%xmm8
2240 pslld $32-25,%xmm5
2241 pxor %xmm8,%xmm5
2242 movdqa %xmm4,%xmm8
2243 psrld $25,%xmm8
2244 pslld $32-25,%xmm4
2245 pxor %xmm8,%xmm4
2246 movdqa 80(%rbp),%xmm8
2247.byte 102,15,58,15,255,4
2248.byte 102,69,15,58,15,219,8
2249.byte 102,69,15,58,15,255,12
2250.byte 102,15,58,15,246,4
2251.byte 102,69,15,58,15,210,8
2252.byte 102,69,15,58,15,246,12
2253.byte 102,15,58,15,237,4
2254.byte 102,69,15,58,15,201,8
2255.byte 102,69,15,58,15,237,12
2256.byte 102,15,58,15,228,4
2257.byte 102,69,15,58,15,192,8
2258.byte 102,69,15,58,15,228,12
2259 movdqa %xmm8,80(%rbp)
2260 movdqa .rol16(%rip),%xmm8
2261 paddd %xmm7,%xmm3
2262 paddd %xmm6,%xmm2
2263 paddd %xmm5,%xmm1
2264 paddd %xmm4,%xmm0
2265 pxor %xmm3,%xmm15
2266 pxor %xmm2,%xmm14
2267 pxor %xmm1,%xmm13
2268 pxor %xmm0,%xmm12
2269.byte 102,69,15,56,0,248
2270.byte 102,69,15,56,0,240
2271.byte 102,69,15,56,0,232
2272.byte 102,69,15,56,0,224
2273 movdqa 80(%rbp),%xmm8
2274 paddd %xmm15,%xmm11
2275 paddd %xmm14,%xmm10
2276 paddd %xmm13,%xmm9
2277 paddd %xmm12,%xmm8
2278 pxor %xmm11,%xmm7
2279 pxor %xmm10,%xmm6
2280 pxor %xmm9,%xmm5
2281 pxor %xmm8,%xmm4
2282 movdqa %xmm8,80(%rbp)
2283 movdqa %xmm7,%xmm8
2284 psrld $20,%xmm8
2285 pslld $32-20,%xmm7
2286 pxor %xmm8,%xmm7
2287 movdqa %xmm6,%xmm8
2288 psrld $20,%xmm8
2289 pslld $32-20,%xmm6
2290 pxor %xmm8,%xmm6
2291 movdqa %xmm5,%xmm8
2292 psrld $20,%xmm8
2293 pslld $32-20,%xmm5
2294 pxor %xmm8,%xmm5
2295 movdqa %xmm4,%xmm8
2296 psrld $20,%xmm8
2297 pslld $32-20,%xmm4
2298 pxor %xmm8,%xmm4
2299 movdqa .rol8(%rip),%xmm8
2300 paddd %xmm7,%xmm3
2301 paddd %xmm6,%xmm2
2302 paddd %xmm5,%xmm1
2303 paddd %xmm4,%xmm0
2304 pxor %xmm3,%xmm15
2305 pxor %xmm2,%xmm14
2306 pxor %xmm1,%xmm13
2307 pxor %xmm0,%xmm12
2308.byte 102,69,15,56,0,248
2309.byte 102,69,15,56,0,240
2310.byte 102,69,15,56,0,232
2311.byte 102,69,15,56,0,224
2312 movdqa 80(%rbp),%xmm8
2313 paddd %xmm15,%xmm11
2314 paddd %xmm14,%xmm10
2315 paddd %xmm13,%xmm9
2316 paddd %xmm12,%xmm8
2317 pxor %xmm11,%xmm7
2318 pxor %xmm10,%xmm6
2319 pxor %xmm9,%xmm5
2320 pxor %xmm8,%xmm4
2321 movdqa %xmm8,80(%rbp)
2322 movdqa %xmm7,%xmm8
2323 psrld $25,%xmm8
2324 pslld $32-25,%xmm7
2325 pxor %xmm8,%xmm7
2326 movdqa %xmm6,%xmm8
2327 psrld $25,%xmm8
2328 pslld $32-25,%xmm6
2329 pxor %xmm8,%xmm6
2330 movdqa %xmm5,%xmm8
2331 psrld $25,%xmm8
2332 pslld $32-25,%xmm5
2333 pxor %xmm8,%xmm5
2334 movdqa %xmm4,%xmm8
2335 psrld $25,%xmm8
2336 pslld $32-25,%xmm4
2337 pxor %xmm8,%xmm4
2338 movdqa 80(%rbp),%xmm8
2339.byte 102,15,58,15,255,12
2340.byte 102,69,15,58,15,219,8
2341.byte 102,69,15,58,15,255,4
2342.byte 102,15,58,15,246,12
2343.byte 102,69,15,58,15,210,8
2344.byte 102,69,15,58,15,246,4
2345.byte 102,15,58,15,237,12
2346.byte 102,69,15,58,15,201,8
2347.byte 102,69,15,58,15,237,4
2348.byte 102,15,58,15,228,12
2349.byte 102,69,15,58,15,192,8
2350.byte 102,69,15,58,15,228,4
2351
2352 decq %r10
2353 jnz 1b
2354 paddd .chacha20_consts(%rip),%xmm3
2355 paddd 48(%rbp),%xmm7
2356 paddd 64(%rbp),%xmm11
2357 paddd 144(%rbp),%xmm15
2358 paddd .chacha20_consts(%rip),%xmm2
2359 paddd 48(%rbp),%xmm6
2360 paddd 64(%rbp),%xmm10
2361 paddd 128(%rbp),%xmm14
2362 paddd .chacha20_consts(%rip),%xmm1
2363 paddd 48(%rbp),%xmm5
2364 paddd 64(%rbp),%xmm9
2365 paddd 112(%rbp),%xmm13
2366 paddd .chacha20_consts(%rip),%xmm0
2367 paddd 48(%rbp),%xmm4
2368 paddd 64(%rbp),%xmm8
2369 paddd 96(%rbp),%xmm12
2370
2371
2372 pand .clamp(%rip),%xmm3
2373 movdqa %xmm3,0(%rbp)
2374 movdqa %xmm7,16(%rbp)
2375
2376 movq %r8,%r8
2377 call poly_hash_ad_internal
2378 movdqu 0 + 0(%rsi),%xmm3
2379 movdqu 16 + 0(%rsi),%xmm7
2380 movdqu 32 + 0(%rsi),%xmm11
2381 movdqu 48 + 0(%rsi),%xmm15
2382 pxor %xmm3,%xmm2
2383 pxor %xmm7,%xmm6
2384 pxor %xmm11,%xmm10
2385 pxor %xmm14,%xmm15
2386 movdqu %xmm2,0 + 0(%rdi)
2387 movdqu %xmm6,16 + 0(%rdi)
2388 movdqu %xmm10,32 + 0(%rdi)
2389 movdqu %xmm15,48 + 0(%rdi)
2390 movdqu 0 + 64(%rsi),%xmm3
2391 movdqu 16 + 64(%rsi),%xmm7
2392 movdqu 32 + 64(%rsi),%xmm11
2393 movdqu 48 + 64(%rsi),%xmm15
2394 pxor %xmm3,%xmm1
2395 pxor %xmm7,%xmm5
2396 pxor %xmm11,%xmm9
2397 pxor %xmm13,%xmm15
2398 movdqu %xmm1,0 + 64(%rdi)
2399 movdqu %xmm5,16 + 64(%rdi)
2400 movdqu %xmm9,32 + 64(%rdi)
2401 movdqu %xmm15,48 + 64(%rdi)
2402
2403 cmpq $192,%rbx
2404 ja 1f
2405 movq $128,%rcx
2406 subq $128,%rbx
2407 leaq 128(%rsi),%rsi
2408 jmp seal_sse_128_seal_hash
24091:
2410 movdqu 0 + 128(%rsi),%xmm3
2411 movdqu 16 + 128(%rsi),%xmm7
2412 movdqu 32 + 128(%rsi),%xmm11
2413 movdqu 48 + 128(%rsi),%xmm15
2414 pxor %xmm3,%xmm0
2415 pxor %xmm7,%xmm4
2416 pxor %xmm11,%xmm8
2417 pxor %xmm12,%xmm15
2418 movdqu %xmm0,0 + 128(%rdi)
2419 movdqu %xmm4,16 + 128(%rdi)
2420 movdqu %xmm8,32 + 128(%rdi)
2421 movdqu %xmm15,48 + 128(%rdi)
2422
2423 movq $192,%rcx
2424 subq $192,%rbx
2425 leaq 192(%rsi),%rsi
2426 movq $2,%rcx
2427 movq $8,%r8
2428 cmpq $64,%rbx
2429 jbe seal_sse_tail_64
2430 cmpq $128,%rbx
2431 jbe seal_sse_tail_128
2432 cmpq $192,%rbx
2433 jbe seal_sse_tail_192
2434
24351:
2436 movdqa .chacha20_consts(%rip),%xmm0
2437 movdqa 48(%rbp),%xmm4
2438 movdqa 64(%rbp),%xmm8
2439 movdqa %xmm0,%xmm1
2440 movdqa %xmm4,%xmm5
2441 movdqa %xmm8,%xmm9
2442 movdqa %xmm0,%xmm2
2443 movdqa %xmm4,%xmm6
2444 movdqa %xmm8,%xmm10
2445 movdqa %xmm0,%xmm3
2446 movdqa %xmm4,%xmm7
2447 movdqa %xmm8,%xmm11
2448 movdqa 96(%rbp),%xmm15
2449 paddd .sse_inc(%rip),%xmm15
2450 movdqa %xmm15,%xmm14
2451 paddd .sse_inc(%rip),%xmm14
2452 movdqa %xmm14,%xmm13
2453 paddd .sse_inc(%rip),%xmm13
2454 movdqa %xmm13,%xmm12
2455 paddd .sse_inc(%rip),%xmm12
2456 movdqa %xmm12,96(%rbp)
2457 movdqa %xmm13,112(%rbp)
2458 movdqa %xmm14,128(%rbp)
2459 movdqa %xmm15,144(%rbp)
2460
24612:
2462 movdqa %xmm8,80(%rbp)
2463 movdqa .rol16(%rip),%xmm8
2464 paddd %xmm7,%xmm3
2465 paddd %xmm6,%xmm2
2466 paddd %xmm5,%xmm1
2467 paddd %xmm4,%xmm0
2468 pxor %xmm3,%xmm15
2469 pxor %xmm2,%xmm14
2470 pxor %xmm1,%xmm13
2471 pxor %xmm0,%xmm12
2472.byte 102,69,15,56,0,248
2473.byte 102,69,15,56,0,240
2474.byte 102,69,15,56,0,232
2475.byte 102,69,15,56,0,224
2476 movdqa 80(%rbp),%xmm8
2477 paddd %xmm15,%xmm11
2478 paddd %xmm14,%xmm10
2479 paddd %xmm13,%xmm9
2480 paddd %xmm12,%xmm8
2481 pxor %xmm11,%xmm7
2482 addq 0(%rdi),%r10
2483 adcq 8+0(%rdi),%r11
2484 adcq $1,%r12
2485 pxor %xmm10,%xmm6
2486 pxor %xmm9,%xmm5
2487 pxor %xmm8,%xmm4
2488 movdqa %xmm8,80(%rbp)
2489 movdqa %xmm7,%xmm8
2490 psrld $20,%xmm8
2491 pslld $32-20,%xmm7
2492 pxor %xmm8,%xmm7
2493 movdqa %xmm6,%xmm8
2494 psrld $20,%xmm8
2495 pslld $32-20,%xmm6
2496 pxor %xmm8,%xmm6
2497 movdqa %xmm5,%xmm8
2498 psrld $20,%xmm8
2499 pslld $32-20,%xmm5
2500 pxor %xmm8,%xmm5
2501 movdqa %xmm4,%xmm8
2502 psrld $20,%xmm8
2503 pslld $32-20,%xmm4
2504 pxor %xmm8,%xmm4
2505 movq 0+0(%rbp),%rax
2506 movq %rax,%r15
2507 mulq %r10
2508 movq %rax,%r13
2509 movq %rdx,%r14
2510 movq 0+0(%rbp),%rax
2511 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002512 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002513 addq %rax,%r14
2514 adcq %rdx,%r15
2515 movdqa .rol8(%rip),%xmm8
2516 paddd %xmm7,%xmm3
2517 paddd %xmm6,%xmm2
2518 paddd %xmm5,%xmm1
2519 paddd %xmm4,%xmm0
2520 pxor %xmm3,%xmm15
2521 pxor %xmm2,%xmm14
2522 pxor %xmm1,%xmm13
2523 pxor %xmm0,%xmm12
2524.byte 102,69,15,56,0,248
2525.byte 102,69,15,56,0,240
2526.byte 102,69,15,56,0,232
2527.byte 102,69,15,56,0,224
2528 movdqa 80(%rbp),%xmm8
2529 paddd %xmm15,%xmm11
2530 paddd %xmm14,%xmm10
2531 paddd %xmm13,%xmm9
2532 paddd %xmm12,%xmm8
2533 pxor %xmm11,%xmm7
2534 pxor %xmm10,%xmm6
2535 movq 8+0(%rbp),%rax
2536 movq %rax,%r9
2537 mulq %r10
2538 addq %rax,%r14
2539 adcq $0,%rdx
2540 movq %rdx,%r10
2541 movq 8+0(%rbp),%rax
2542 mulq %r11
2543 addq %rax,%r15
2544 adcq $0,%rdx
2545 pxor %xmm9,%xmm5
2546 pxor %xmm8,%xmm4
2547 movdqa %xmm8,80(%rbp)
2548 movdqa %xmm7,%xmm8
2549 psrld $25,%xmm8
2550 pslld $32-25,%xmm7
2551 pxor %xmm8,%xmm7
2552 movdqa %xmm6,%xmm8
2553 psrld $25,%xmm8
2554 pslld $32-25,%xmm6
2555 pxor %xmm8,%xmm6
2556 movdqa %xmm5,%xmm8
2557 psrld $25,%xmm8
2558 pslld $32-25,%xmm5
2559 pxor %xmm8,%xmm5
2560 movdqa %xmm4,%xmm8
2561 psrld $25,%xmm8
2562 pslld $32-25,%xmm4
2563 pxor %xmm8,%xmm4
2564 movdqa 80(%rbp),%xmm8
Robert Sloan4d1ac502017-02-06 08:36:14 -08002565 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002566 addq %r10,%r15
2567 adcq %rdx,%r9
2568.byte 102,15,58,15,255,4
2569.byte 102,69,15,58,15,219,8
2570.byte 102,69,15,58,15,255,12
2571.byte 102,15,58,15,246,4
2572.byte 102,69,15,58,15,210,8
2573.byte 102,69,15,58,15,246,12
2574.byte 102,15,58,15,237,4
2575.byte 102,69,15,58,15,201,8
2576.byte 102,69,15,58,15,237,12
2577.byte 102,15,58,15,228,4
2578.byte 102,69,15,58,15,192,8
2579.byte 102,69,15,58,15,228,12
2580 movdqa %xmm8,80(%rbp)
2581 movdqa .rol16(%rip),%xmm8
2582 paddd %xmm7,%xmm3
2583 paddd %xmm6,%xmm2
2584 paddd %xmm5,%xmm1
2585 paddd %xmm4,%xmm0
2586 pxor %xmm3,%xmm15
2587 pxor %xmm2,%xmm14
2588 movq %r13,%r10
2589 movq %r14,%r11
2590 movq %r15,%r12
2591 andq $3,%r12
2592 movq %r15,%r13
2593 andq $-4,%r13
2594 movq %r9,%r14
2595 shrdq $2,%r9,%r15
2596 shrq $2,%r9
2597 addq %r13,%r10
2598 adcq %r14,%r11
2599 adcq $0,%r12
2600 addq %r15,%r10
2601 adcq %r9,%r11
2602 adcq $0,%r12
2603 pxor %xmm1,%xmm13
2604 pxor %xmm0,%xmm12
2605.byte 102,69,15,56,0,248
2606.byte 102,69,15,56,0,240
2607.byte 102,69,15,56,0,232
2608.byte 102,69,15,56,0,224
2609 movdqa 80(%rbp),%xmm8
2610 paddd %xmm15,%xmm11
2611 paddd %xmm14,%xmm10
2612 paddd %xmm13,%xmm9
2613 paddd %xmm12,%xmm8
2614 pxor %xmm11,%xmm7
2615 pxor %xmm10,%xmm6
2616 pxor %xmm9,%xmm5
2617 pxor %xmm8,%xmm4
2618 movdqa %xmm8,80(%rbp)
2619 movdqa %xmm7,%xmm8
2620 psrld $20,%xmm8
2621 pslld $32-20,%xmm7
2622 pxor %xmm8,%xmm7
2623 movdqa %xmm6,%xmm8
2624 psrld $20,%xmm8
2625 pslld $32-20,%xmm6
2626 pxor %xmm8,%xmm6
2627 movdqa %xmm5,%xmm8
2628 psrld $20,%xmm8
2629 pslld $32-20,%xmm5
2630 pxor %xmm8,%xmm5
2631 movdqa %xmm4,%xmm8
2632 psrld $20,%xmm8
2633 pslld $32-20,%xmm4
2634 pxor %xmm8,%xmm4
2635 movdqa .rol8(%rip),%xmm8
2636 paddd %xmm7,%xmm3
2637 paddd %xmm6,%xmm2
2638 paddd %xmm5,%xmm1
2639 paddd %xmm4,%xmm0
2640 pxor %xmm3,%xmm15
2641 pxor %xmm2,%xmm14
2642 pxor %xmm1,%xmm13
2643 pxor %xmm0,%xmm12
2644.byte 102,69,15,56,0,248
2645.byte 102,69,15,56,0,240
2646.byte 102,69,15,56,0,232
2647.byte 102,69,15,56,0,224
2648 movdqa 80(%rbp),%xmm8
2649 paddd %xmm15,%xmm11
2650 paddd %xmm14,%xmm10
2651 paddd %xmm13,%xmm9
2652 paddd %xmm12,%xmm8
2653 pxor %xmm11,%xmm7
2654 pxor %xmm10,%xmm6
2655 pxor %xmm9,%xmm5
2656 pxor %xmm8,%xmm4
2657 movdqa %xmm8,80(%rbp)
2658 movdqa %xmm7,%xmm8
2659 psrld $25,%xmm8
2660 pslld $32-25,%xmm7
2661 pxor %xmm8,%xmm7
2662 movdqa %xmm6,%xmm8
2663 psrld $25,%xmm8
2664 pslld $32-25,%xmm6
2665 pxor %xmm8,%xmm6
2666 movdqa %xmm5,%xmm8
2667 psrld $25,%xmm8
2668 pslld $32-25,%xmm5
2669 pxor %xmm8,%xmm5
2670 movdqa %xmm4,%xmm8
2671 psrld $25,%xmm8
2672 pslld $32-25,%xmm4
2673 pxor %xmm8,%xmm4
2674 movdqa 80(%rbp),%xmm8
2675.byte 102,15,58,15,255,12
2676.byte 102,69,15,58,15,219,8
2677.byte 102,69,15,58,15,255,4
2678.byte 102,15,58,15,246,12
2679.byte 102,69,15,58,15,210,8
2680.byte 102,69,15,58,15,246,4
2681.byte 102,15,58,15,237,12
2682.byte 102,69,15,58,15,201,8
2683.byte 102,69,15,58,15,237,4
2684.byte 102,15,58,15,228,12
2685.byte 102,69,15,58,15,192,8
2686.byte 102,69,15,58,15,228,4
2687
2688 leaq 16(%rdi),%rdi
2689 decq %r8
2690 jge 2b
2691 addq 0(%rdi),%r10
2692 adcq 8+0(%rdi),%r11
2693 adcq $1,%r12
2694 movq 0+0(%rbp),%rax
2695 movq %rax,%r15
2696 mulq %r10
2697 movq %rax,%r13
2698 movq %rdx,%r14
2699 movq 0+0(%rbp),%rax
2700 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002701 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002702 addq %rax,%r14
2703 adcq %rdx,%r15
2704 movq 8+0(%rbp),%rax
2705 movq %rax,%r9
2706 mulq %r10
2707 addq %rax,%r14
2708 adcq $0,%rdx
2709 movq %rdx,%r10
2710 movq 8+0(%rbp),%rax
2711 mulq %r11
2712 addq %rax,%r15
2713 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002714 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002715 addq %r10,%r15
2716 adcq %rdx,%r9
2717 movq %r13,%r10
2718 movq %r14,%r11
2719 movq %r15,%r12
2720 andq $3,%r12
2721 movq %r15,%r13
2722 andq $-4,%r13
2723 movq %r9,%r14
2724 shrdq $2,%r9,%r15
2725 shrq $2,%r9
2726 addq %r13,%r10
2727 adcq %r14,%r11
2728 adcq $0,%r12
2729 addq %r15,%r10
2730 adcq %r9,%r11
2731 adcq $0,%r12
2732
2733 leaq 16(%rdi),%rdi
2734 decq %rcx
2735 jg 2b
2736 paddd .chacha20_consts(%rip),%xmm3
2737 paddd 48(%rbp),%xmm7
2738 paddd 64(%rbp),%xmm11
2739 paddd 144(%rbp),%xmm15
2740 paddd .chacha20_consts(%rip),%xmm2
2741 paddd 48(%rbp),%xmm6
2742 paddd 64(%rbp),%xmm10
2743 paddd 128(%rbp),%xmm14
2744 paddd .chacha20_consts(%rip),%xmm1
2745 paddd 48(%rbp),%xmm5
2746 paddd 64(%rbp),%xmm9
2747 paddd 112(%rbp),%xmm13
2748 paddd .chacha20_consts(%rip),%xmm0
2749 paddd 48(%rbp),%xmm4
2750 paddd 64(%rbp),%xmm8
2751 paddd 96(%rbp),%xmm12
2752
2753 movdqa %xmm14,80(%rbp)
2754 movdqa %xmm14,80(%rbp)
2755 movdqu 0 + 0(%rsi),%xmm14
2756 pxor %xmm3,%xmm14
2757 movdqu %xmm14,0 + 0(%rdi)
2758 movdqu 16 + 0(%rsi),%xmm14
2759 pxor %xmm7,%xmm14
2760 movdqu %xmm14,16 + 0(%rdi)
2761 movdqu 32 + 0(%rsi),%xmm14
2762 pxor %xmm11,%xmm14
2763 movdqu %xmm14,32 + 0(%rdi)
2764 movdqu 48 + 0(%rsi),%xmm14
2765 pxor %xmm15,%xmm14
2766 movdqu %xmm14,48 + 0(%rdi)
2767
2768 movdqa 80(%rbp),%xmm14
2769 movdqu 0 + 64(%rsi),%xmm3
2770 movdqu 16 + 64(%rsi),%xmm7
2771 movdqu 32 + 64(%rsi),%xmm11
2772 movdqu 48 + 64(%rsi),%xmm15
2773 pxor %xmm3,%xmm2
2774 pxor %xmm7,%xmm6
2775 pxor %xmm11,%xmm10
2776 pxor %xmm14,%xmm15
2777 movdqu %xmm2,0 + 64(%rdi)
2778 movdqu %xmm6,16 + 64(%rdi)
2779 movdqu %xmm10,32 + 64(%rdi)
2780 movdqu %xmm15,48 + 64(%rdi)
2781 movdqu 0 + 128(%rsi),%xmm3
2782 movdqu 16 + 128(%rsi),%xmm7
2783 movdqu 32 + 128(%rsi),%xmm11
2784 movdqu 48 + 128(%rsi),%xmm15
2785 pxor %xmm3,%xmm1
2786 pxor %xmm7,%xmm5
2787 pxor %xmm11,%xmm9
2788 pxor %xmm13,%xmm15
2789 movdqu %xmm1,0 + 128(%rdi)
2790 movdqu %xmm5,16 + 128(%rdi)
2791 movdqu %xmm9,32 + 128(%rdi)
2792 movdqu %xmm15,48 + 128(%rdi)
2793
2794 cmpq $256,%rbx
2795 ja 3f
2796
2797 movq $192,%rcx
2798 subq $192,%rbx
2799 leaq 192(%rsi),%rsi
2800 jmp seal_sse_128_seal_hash
28013:
2802 movdqu 0 + 192(%rsi),%xmm3
2803 movdqu 16 + 192(%rsi),%xmm7
2804 movdqu 32 + 192(%rsi),%xmm11
2805 movdqu 48 + 192(%rsi),%xmm15
2806 pxor %xmm3,%xmm0
2807 pxor %xmm7,%xmm4
2808 pxor %xmm11,%xmm8
2809 pxor %xmm12,%xmm15
2810 movdqu %xmm0,0 + 192(%rdi)
2811 movdqu %xmm4,16 + 192(%rdi)
2812 movdqu %xmm8,32 + 192(%rdi)
2813 movdqu %xmm15,48 + 192(%rdi)
2814
2815 leaq 256(%rsi),%rsi
2816 subq $256,%rbx
2817 movq $6,%rcx
2818 movq $4,%r8
2819 cmpq $192,%rbx
2820 jg 1b
2821 movq %rbx,%rcx
2822 testq %rbx,%rbx
2823 je seal_sse_128_seal_hash
2824 movq $6,%rcx
2825 cmpq $64,%rbx
2826 jg 3f
2827
/*
 * seal_sse_tail_64: generate one more 64-byte ChaCha20 keystream block
 * while hashing 16-byte blocks read from (%rdi).
 *
 * State rows: xmm0 = .chacha20_consts, xmm4 = 48(%rbp), xmm8 = 64(%rbp),
 * xmm12 = counter row from 96(%rbp) advanced by .sse_inc (stored back).
 * rcx and r8 are loop counts supplied by the code that branches here:
 * each pass entered at 1: hashes two blocks and runs one double round,
 * each pass entered at 2: runs one double round and hashes one block.
 * Exits to seal_sse_128_seal with the finished block in xmm0/4/8/12.
 * xmm3 is used as scratch for the rotations.
 */
2828seal_sse_tail_64:
2829 movdqa .chacha20_consts(%rip),%xmm0
2830 movdqa 48(%rbp),%xmm4
2831 movdqa 64(%rbp),%xmm8
2832 movdqa 96(%rbp),%xmm12
2833 paddd .sse_inc(%rip),%xmm12
2834 movdqa %xmm12,96(%rbp)
2835
28361:
/* Poly1305 block: acc (r10:r11:r12) += 16 bytes at (%rdi) + 2^128 (adcq $1). */
2837 addq 0(%rdi),%r10
2838 adcq 8+0(%rdi),%r11
2839 adcq $1,%r12
/* acc *= 128-bit multiplier at 0(%rbp)/8(%rbp); product limbs in r13:r14:r15:r9. */
2840 movq 0+0(%rbp),%rax
2841 movq %rax,%r15
2842 mulq %r10
2843 movq %rax,%r13
2844 movq %rdx,%r14
2845 movq 0+0(%rbp),%rax
2846 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002847 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002848 addq %rax,%r14
2849 adcq %rdx,%r15
2850 movq 8+0(%rbp),%rax
2851 movq %rax,%r9
2852 mulq %r10
2853 addq %rax,%r14
2854 adcq $0,%rdx
2855 movq %rdx,%r10
2856 movq 8+0(%rbp),%rax
2857 mulq %r11
2858 addq %rax,%r15
2859 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002860 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002861 addq %r10,%r15
2862 adcq %rdx,%r9
/* Reduce mod 2^130-5: keep the low 130 bits, add back 4*c (andq $-4) and c (shift by 2) for the bits c above 2^130. */
2863 movq %r13,%r10
2864 movq %r14,%r11
2865 movq %r15,%r12
2866 andq $3,%r12
2867 movq %r15,%r13
2868 andq $-4,%r13
2869 movq %r9,%r14
2870 shrdq $2,%r9,%r15
2871 shrq $2,%r9
2872 addq %r13,%r10
2873 adcq %r14,%r11
2874 adcq $0,%r12
2875 addq %r15,%r10
2876 adcq %r9,%r11
2877 adcq $0,%r12
2878
2879 leaq 16(%rdi),%rdi
28802:
/* ChaCha20 column round: rotations by 16 (pshufb .rol16), 12, 8 (pshufb .rol8), 7. */
2881 paddd %xmm4,%xmm0
2882 pxor %xmm0,%xmm12
2883 pshufb .rol16(%rip),%xmm12
2884 paddd %xmm12,%xmm8
2885 pxor %xmm8,%xmm4
2886 movdqa %xmm4,%xmm3
2887 pslld $12,%xmm3
2888 psrld $20,%xmm4
2889 pxor %xmm3,%xmm4
2890 paddd %xmm4,%xmm0
2891 pxor %xmm0,%xmm12
2892 pshufb .rol8(%rip),%xmm12
2893 paddd %xmm12,%xmm8
2894 pxor %xmm8,%xmm4
2895 movdqa %xmm4,%xmm3
2896 pslld $7,%xmm3
2897 psrld $25,%xmm4
2898 pxor %xmm3,%xmm4
/* Hand-encoded palignr $4,%xmm4,%xmm4 / $8,%xmm8,%xmm8 / $12,%xmm12,%xmm12: rotate rows into diagonal position. */
2899.byte 102,15,58,15,228,4
2900.byte 102,69,15,58,15,192,8
2901.byte 102,69,15,58,15,228,12
/* ChaCha20 diagonal round. */
2902 paddd %xmm4,%xmm0
2903 pxor %xmm0,%xmm12
2904 pshufb .rol16(%rip),%xmm12
2905 paddd %xmm12,%xmm8
2906 pxor %xmm8,%xmm4
2907 movdqa %xmm4,%xmm3
2908 pslld $12,%xmm3
2909 psrld $20,%xmm4
2910 pxor %xmm3,%xmm4
2911 paddd %xmm4,%xmm0
2912 pxor %xmm0,%xmm12
2913 pshufb .rol8(%rip),%xmm12
2914 paddd %xmm12,%xmm8
2915 pxor %xmm8,%xmm4
2916 movdqa %xmm4,%xmm3
2917 pslld $7,%xmm3
2918 psrld $25,%xmm4
2919 pxor %xmm3,%xmm4
/* Hand-encoded palignr $12 / $8 / $4 on xmm4 / xmm8 / xmm12: undo the row rotation. */
2920.byte 102,15,58,15,228,12
2921.byte 102,69,15,58,15,192,8
2922.byte 102,69,15,58,15,228,4
/* Poly1305 block: absorb the next 16 bytes at (%rdi), multiply, reduce (same sequence as above). */
2923 addq 0(%rdi),%r10
2924 adcq 8+0(%rdi),%r11
2925 adcq $1,%r12
2926 movq 0+0(%rbp),%rax
2927 movq %rax,%r15
2928 mulq %r10
2929 movq %rax,%r13
2930 movq %rdx,%r14
2931 movq 0+0(%rbp),%rax
2932 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002933 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002934 addq %rax,%r14
2935 adcq %rdx,%r15
2936 movq 8+0(%rbp),%rax
2937 movq %rax,%r9
2938 mulq %r10
2939 addq %rax,%r14
2940 adcq $0,%rdx
2941 movq %rdx,%r10
2942 movq 8+0(%rbp),%rax
2943 mulq %r11
2944 addq %rax,%r15
2945 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002946 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002947 addq %r10,%r15
2948 adcq %rdx,%r9
2949 movq %r13,%r10
2950 movq %r14,%r11
2951 movq %r15,%r12
2952 andq $3,%r12
2953 movq %r15,%r13
2954 andq $-4,%r13
2955 movq %r9,%r14
2956 shrdq $2,%r9,%r15
2957 shrq $2,%r9
2958 addq %r13,%r10
2959 adcq %r14,%r11
2960 adcq $0,%r12
2961 addq %r15,%r10
2962 adcq %r9,%r11
2963 adcq $0,%r12
2964
2965 leaq 16(%rdi),%rdi
/* While rcx > 0 repeat from 1: (two hashes per round); afterwards r8+1 more passes from 2: (one hash per round). */
2966 decq %rcx
2967 jg 1b
2968 decq %r8
2969 jge 2b
/* Feed-forward: add the initial state rows back to obtain the keystream block. */
2970 paddd .chacha20_consts(%rip),%xmm0
2971 paddd 48(%rbp),%xmm4
2972 paddd 64(%rbp),%xmm8
2973 paddd 96(%rbp),%xmm12
2974
2975 jmp seal_sse_128_seal
/* Dispatch: more than 128 bytes left (rbx) goes on to the next 3: label; otherwise fall into seal_sse_tail_128. */
29763:
2977 cmpq $128,%rbx
2978 jg 3f
2979
/*
 * seal_sse_tail_128: generate two 64-byte ChaCha20 keystream blocks while
 * hashing 16-byte blocks read from (%rdi).
 *
 * State A = xmm0/4/8/12 (counter row stored at 96(%rbp)),
 * state B = xmm1/5/9/13 (counter row stored at 112(%rbp)); B gets the
 * counter one .sse_inc step below A. rcx and r8 are loop counts supplied
 * by the code that branches here. After the loop, 64 bytes from (%rsi)
 * are XORed with state B and written to (%rdi); state A stays in
 * registers for seal_sse_128_seal. Leaves rcx = 64 for
 * seal_sse_128_seal_hash. xmm3 is rotation scratch; xmm7/11/15 are
 * overwritten as input buffers.
 */
2980seal_sse_tail_128:
2981 movdqa .chacha20_consts(%rip),%xmm0
2982 movdqa 48(%rbp),%xmm4
2983 movdqa 64(%rbp),%xmm8
2984 movdqa %xmm0,%xmm1
2985 movdqa %xmm4,%xmm5
2986 movdqa %xmm8,%xmm9
2987 movdqa 96(%rbp),%xmm13
2988 paddd .sse_inc(%rip),%xmm13
2989 movdqa %xmm13,%xmm12
2990 paddd .sse_inc(%rip),%xmm12
2991 movdqa %xmm12,96(%rbp)
2992 movdqa %xmm13,112(%rbp)
2993
29941:
/* Poly1305 block: acc (r10:r11:r12) += 16 bytes at (%rdi) + 2^128, times the multiplier at 0(%rbp)/8(%rbp), reduced mod 2^130-5. */
2995 addq 0(%rdi),%r10
2996 adcq 8+0(%rdi),%r11
2997 adcq $1,%r12
2998 movq 0+0(%rbp),%rax
2999 movq %rax,%r15
3000 mulq %r10
3001 movq %rax,%r13
3002 movq %rdx,%r14
3003 movq 0+0(%rbp),%rax
3004 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003005 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003006 addq %rax,%r14
3007 adcq %rdx,%r15
3008 movq 8+0(%rbp),%rax
3009 movq %rax,%r9
3010 mulq %r10
3011 addq %rax,%r14
3012 adcq $0,%rdx
3013 movq %rdx,%r10
3014 movq 8+0(%rbp),%rax
3015 mulq %r11
3016 addq %rax,%r15
3017 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003018 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003019 addq %r10,%r15
3020 adcq %rdx,%r9
3021 movq %r13,%r10
3022 movq %r14,%r11
3023 movq %r15,%r12
3024 andq $3,%r12
3025 movq %r15,%r13
3026 andq $-4,%r13
3027 movq %r9,%r14
3028 shrdq $2,%r9,%r15
3029 shrq $2,%r9
3030 addq %r13,%r10
3031 adcq %r14,%r11
3032 adcq $0,%r12
3033 addq %r15,%r10
3034 adcq %r9,%r11
3035 adcq $0,%r12
3036
3037 leaq 16(%rdi),%rdi
30382:
/* Column round, state A (rotations 16, 12, 8, 7), then hand-encoded palignr $4/$8/$12 on xmm4/xmm8/xmm12. */
3039 paddd %xmm4,%xmm0
3040 pxor %xmm0,%xmm12
3041 pshufb .rol16(%rip),%xmm12
3042 paddd %xmm12,%xmm8
3043 pxor %xmm8,%xmm4
3044 movdqa %xmm4,%xmm3
3045 pslld $12,%xmm3
3046 psrld $20,%xmm4
3047 pxor %xmm3,%xmm4
3048 paddd %xmm4,%xmm0
3049 pxor %xmm0,%xmm12
3050 pshufb .rol8(%rip),%xmm12
3051 paddd %xmm12,%xmm8
3052 pxor %xmm8,%xmm4
3053 movdqa %xmm4,%xmm3
3054 pslld $7,%xmm3
3055 psrld $25,%xmm4
3056 pxor %xmm3,%xmm4
3057.byte 102,15,58,15,228,4
3058.byte 102,69,15,58,15,192,8
3059.byte 102,69,15,58,15,228,12
/* Column round, state B, then hand-encoded palignr $4/$8/$12 on xmm5/xmm9/xmm13. */
3060 paddd %xmm5,%xmm1
3061 pxor %xmm1,%xmm13
3062 pshufb .rol16(%rip),%xmm13
3063 paddd %xmm13,%xmm9
3064 pxor %xmm9,%xmm5
3065 movdqa %xmm5,%xmm3
3066 pslld $12,%xmm3
3067 psrld $20,%xmm5
3068 pxor %xmm3,%xmm5
3069 paddd %xmm5,%xmm1
3070 pxor %xmm1,%xmm13
3071 pshufb .rol8(%rip),%xmm13
3072 paddd %xmm13,%xmm9
3073 pxor %xmm9,%xmm5
3074 movdqa %xmm5,%xmm3
3075 pslld $7,%xmm3
3076 psrld $25,%xmm5
3077 pxor %xmm3,%xmm5
3078.byte 102,15,58,15,237,4
3079.byte 102,69,15,58,15,201,8
3080.byte 102,69,15,58,15,237,12
/* Poly1305 block: same absorb / multiply / reduce sequence on the 16 bytes at (%rdi). */
3081 addq 0(%rdi),%r10
3082 adcq 8+0(%rdi),%r11
3083 adcq $1,%r12
3084 movq 0+0(%rbp),%rax
3085 movq %rax,%r15
3086 mulq %r10
3087 movq %rax,%r13
3088 movq %rdx,%r14
3089 movq 0+0(%rbp),%rax
3090 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003091 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003092 addq %rax,%r14
3093 adcq %rdx,%r15
3094 movq 8+0(%rbp),%rax
3095 movq %rax,%r9
3096 mulq %r10
3097 addq %rax,%r14
3098 adcq $0,%rdx
3099 movq %rdx,%r10
3100 movq 8+0(%rbp),%rax
3101 mulq %r11
3102 addq %rax,%r15
3103 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003104 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003105 addq %r10,%r15
3106 adcq %rdx,%r9
3107 movq %r13,%r10
3108 movq %r14,%r11
3109 movq %r15,%r12
3110 andq $3,%r12
3111 movq %r15,%r13
3112 andq $-4,%r13
3113 movq %r9,%r14
3114 shrdq $2,%r9,%r15
3115 shrq $2,%r9
3116 addq %r13,%r10
3117 adcq %r14,%r11
3118 adcq $0,%r12
3119 addq %r15,%r10
3120 adcq %r9,%r11
3121 adcq $0,%r12
/* Diagonal round, state A, then hand-encoded palignr $12/$8/$4 to undo the row rotation. */
3122 paddd %xmm4,%xmm0
3123 pxor %xmm0,%xmm12
3124 pshufb .rol16(%rip),%xmm12
3125 paddd %xmm12,%xmm8
3126 pxor %xmm8,%xmm4
3127 movdqa %xmm4,%xmm3
3128 pslld $12,%xmm3
3129 psrld $20,%xmm4
3130 pxor %xmm3,%xmm4
3131 paddd %xmm4,%xmm0
3132 pxor %xmm0,%xmm12
3133 pshufb .rol8(%rip),%xmm12
3134 paddd %xmm12,%xmm8
3135 pxor %xmm8,%xmm4
3136 movdqa %xmm4,%xmm3
3137 pslld $7,%xmm3
3138 psrld $25,%xmm4
3139 pxor %xmm3,%xmm4
3140.byte 102,15,58,15,228,12
3141.byte 102,69,15,58,15,192,8
3142.byte 102,69,15,58,15,228,4
/* Diagonal round, state B, then hand-encoded palignr $12/$8/$4 on xmm5/xmm9/xmm13. */
3143 paddd %xmm5,%xmm1
3144 pxor %xmm1,%xmm13
3145 pshufb .rol16(%rip),%xmm13
3146 paddd %xmm13,%xmm9
3147 pxor %xmm9,%xmm5
3148 movdqa %xmm5,%xmm3
3149 pslld $12,%xmm3
3150 psrld $20,%xmm5
3151 pxor %xmm3,%xmm5
3152 paddd %xmm5,%xmm1
3153 pxor %xmm1,%xmm13
3154 pshufb .rol8(%rip),%xmm13
3155 paddd %xmm13,%xmm9
3156 pxor %xmm9,%xmm5
3157 movdqa %xmm5,%xmm3
3158 pslld $7,%xmm3
3159 psrld $25,%xmm5
3160 pxor %xmm3,%xmm5
3161.byte 102,15,58,15,237,12
3162.byte 102,69,15,58,15,201,8
3163.byte 102,69,15,58,15,237,4
3164
3165 leaq 16(%rdi),%rdi
/* While rcx > 0 repeat from 1:; afterwards r8+1 more passes from 2:. */
3166 decq %rcx
3167 jg 1b
3168 decq %r8
3169 jge 2b
/* Feed-forward: add the initial rows (and each state's stored counter row) back. */
3170 paddd .chacha20_consts(%rip),%xmm1
3171 paddd 48(%rbp),%xmm5
3172 paddd 64(%rbp),%xmm9
3173 paddd 112(%rbp),%xmm13
3174 paddd .chacha20_consts(%rip),%xmm0
3175 paddd 48(%rbp),%xmm4
3176 paddd 64(%rbp),%xmm8
3177 paddd 96(%rbp),%xmm12
/* XOR 64 input bytes at (%rsi) with state B and store them at (%rdi); the last row is combined in xmm15. */
3178 movdqu 0 + 0(%rsi),%xmm3
3179 movdqu 16 + 0(%rsi),%xmm7
3180 movdqu 32 + 0(%rsi),%xmm11
3181 movdqu 48 + 0(%rsi),%xmm15
3182 pxor %xmm3,%xmm1
3183 pxor %xmm7,%xmm5
3184 pxor %xmm11,%xmm9
3185 pxor %xmm13,%xmm15
3186 movdqu %xmm1,0 + 0(%rdi)
3187 movdqu %xmm5,16 + 0(%rdi)
3188 movdqu %xmm9,32 + 0(%rdi)
3189 movdqu %xmm15,48 + 0(%rdi)
3190
/* 64 bytes were just written at (%rdi): rcx = bytes for seal_sse_128_seal_hash to absorb, rbx/rsi advance past the consumed input. */
3191 movq $64,%rcx
3192 subq $64,%rbx
3193 leaq 64(%rsi),%rsi
3194 jmp seal_sse_128_seal_hash
/* Target of the preceding "jg 3f": reached when more than 128 bytes remain in rbx. */
31953:
3196
/*
 * seal_sse_tail_192: generate three 64-byte ChaCha20 keystream blocks
 * while hashing 16-byte blocks read from (%rdi).
 *
 * State A = xmm0/4/8/12 (counter row at 96(%rbp)),
 * state B = xmm1/5/9/13 (counter row at 112(%rbp)),
 * state C = xmm2/6/10/14 (counter row at 128(%rbp)); the counters are
 * successive .sse_inc steps with C lowest and A highest. rcx and r8 are
 * loop counts supplied by the code that branches here. After the loop,
 * 128 bytes from (%rsi) are XORed with states C then B and written to
 * (%rdi); state A stays in registers. Falls through into
 * seal_sse_128_seal_hash with rcx = 128. xmm3 is rotation scratch;
 * xmm7/11/15 are overwritten as input buffers.
 */
3197seal_sse_tail_192:
3198 movdqa .chacha20_consts(%rip),%xmm0
3199 movdqa 48(%rbp),%xmm4
3200 movdqa 64(%rbp),%xmm8
3201 movdqa %xmm0,%xmm1
3202 movdqa %xmm4,%xmm5
3203 movdqa %xmm8,%xmm9
3204 movdqa %xmm0,%xmm2
3205 movdqa %xmm4,%xmm6
3206 movdqa %xmm8,%xmm10
3207 movdqa 96(%rbp),%xmm14
3208 paddd .sse_inc(%rip),%xmm14
3209 movdqa %xmm14,%xmm13
3210 paddd .sse_inc(%rip),%xmm13
3211 movdqa %xmm13,%xmm12
3212 paddd .sse_inc(%rip),%xmm12
3213 movdqa %xmm12,96(%rbp)
3214 movdqa %xmm13,112(%rbp)
3215 movdqa %xmm14,128(%rbp)
3216
32171:
/* Poly1305 block: acc (r10:r11:r12) += 16 bytes at (%rdi) + 2^128, times the multiplier at 0(%rbp)/8(%rbp), reduced mod 2^130-5. */
3218 addq 0(%rdi),%r10
3219 adcq 8+0(%rdi),%r11
3220 adcq $1,%r12
3221 movq 0+0(%rbp),%rax
3222 movq %rax,%r15
3223 mulq %r10
3224 movq %rax,%r13
3225 movq %rdx,%r14
3226 movq 0+0(%rbp),%rax
3227 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003228 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003229 addq %rax,%r14
3230 adcq %rdx,%r15
3231 movq 8+0(%rbp),%rax
3232 movq %rax,%r9
3233 mulq %r10
3234 addq %rax,%r14
3235 adcq $0,%rdx
3236 movq %rdx,%r10
3237 movq 8+0(%rbp),%rax
3238 mulq %r11
3239 addq %rax,%r15
3240 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003241 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003242 addq %r10,%r15
3243 adcq %rdx,%r9
3244 movq %r13,%r10
3245 movq %r14,%r11
3246 movq %r15,%r12
3247 andq $3,%r12
3248 movq %r15,%r13
3249 andq $-4,%r13
3250 movq %r9,%r14
3251 shrdq $2,%r9,%r15
3252 shrq $2,%r9
3253 addq %r13,%r10
3254 adcq %r14,%r11
3255 adcq $0,%r12
3256 addq %r15,%r10
3257 adcq %r9,%r11
3258 adcq $0,%r12
3259
3260 leaq 16(%rdi),%rdi
32612:
/* Column round, state A (rotations 16, 12, 8, 7), then hand-encoded palignr $4/$8/$12 on xmm4/xmm8/xmm12. */
3262 paddd %xmm4,%xmm0
3263 pxor %xmm0,%xmm12
3264 pshufb .rol16(%rip),%xmm12
3265 paddd %xmm12,%xmm8
3266 pxor %xmm8,%xmm4
3267 movdqa %xmm4,%xmm3
3268 pslld $12,%xmm3
3269 psrld $20,%xmm4
3270 pxor %xmm3,%xmm4
3271 paddd %xmm4,%xmm0
3272 pxor %xmm0,%xmm12
3273 pshufb .rol8(%rip),%xmm12
3274 paddd %xmm12,%xmm8
3275 pxor %xmm8,%xmm4
3276 movdqa %xmm4,%xmm3
3277 pslld $7,%xmm3
3278 psrld $25,%xmm4
3279 pxor %xmm3,%xmm4
3280.byte 102,15,58,15,228,4
3281.byte 102,69,15,58,15,192,8
3282.byte 102,69,15,58,15,228,12
/* Column round, state B, then palignr $4/$8/$12 on xmm5/xmm9/xmm13. */
3283 paddd %xmm5,%xmm1
3284 pxor %xmm1,%xmm13
3285 pshufb .rol16(%rip),%xmm13
3286 paddd %xmm13,%xmm9
3287 pxor %xmm9,%xmm5
3288 movdqa %xmm5,%xmm3
3289 pslld $12,%xmm3
3290 psrld $20,%xmm5
3291 pxor %xmm3,%xmm5
3292 paddd %xmm5,%xmm1
3293 pxor %xmm1,%xmm13
3294 pshufb .rol8(%rip),%xmm13
3295 paddd %xmm13,%xmm9
3296 pxor %xmm9,%xmm5
3297 movdqa %xmm5,%xmm3
3298 pslld $7,%xmm3
3299 psrld $25,%xmm5
3300 pxor %xmm3,%xmm5
3301.byte 102,15,58,15,237,4
3302.byte 102,69,15,58,15,201,8
3303.byte 102,69,15,58,15,237,12
/* Column round, state C, then palignr $4/$8/$12 on xmm6/xmm10/xmm14. */
3304 paddd %xmm6,%xmm2
3305 pxor %xmm2,%xmm14
3306 pshufb .rol16(%rip),%xmm14
3307 paddd %xmm14,%xmm10
3308 pxor %xmm10,%xmm6
3309 movdqa %xmm6,%xmm3
3310 pslld $12,%xmm3
3311 psrld $20,%xmm6
3312 pxor %xmm3,%xmm6
3313 paddd %xmm6,%xmm2
3314 pxor %xmm2,%xmm14
3315 pshufb .rol8(%rip),%xmm14
3316 paddd %xmm14,%xmm10
3317 pxor %xmm10,%xmm6
3318 movdqa %xmm6,%xmm3
3319 pslld $7,%xmm3
3320 psrld $25,%xmm6
3321 pxor %xmm3,%xmm6
3322.byte 102,15,58,15,246,4
3323.byte 102,69,15,58,15,210,8
3324.byte 102,69,15,58,15,246,12
/* Poly1305 block: same absorb / multiply / reduce sequence on the 16 bytes at (%rdi). */
3325 addq 0(%rdi),%r10
3326 adcq 8+0(%rdi),%r11
3327 adcq $1,%r12
3328 movq 0+0(%rbp),%rax
3329 movq %rax,%r15
3330 mulq %r10
3331 movq %rax,%r13
3332 movq %rdx,%r14
3333 movq 0+0(%rbp),%rax
3334 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003335 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003336 addq %rax,%r14
3337 adcq %rdx,%r15
3338 movq 8+0(%rbp),%rax
3339 movq %rax,%r9
3340 mulq %r10
3341 addq %rax,%r14
3342 adcq $0,%rdx
3343 movq %rdx,%r10
3344 movq 8+0(%rbp),%rax
3345 mulq %r11
3346 addq %rax,%r15
3347 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003348 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003349 addq %r10,%r15
3350 adcq %rdx,%r9
3351 movq %r13,%r10
3352 movq %r14,%r11
3353 movq %r15,%r12
3354 andq $3,%r12
3355 movq %r15,%r13
3356 andq $-4,%r13
3357 movq %r9,%r14
3358 shrdq $2,%r9,%r15
3359 shrq $2,%r9
3360 addq %r13,%r10
3361 adcq %r14,%r11
3362 adcq $0,%r12
3363 addq %r15,%r10
3364 adcq %r9,%r11
3365 adcq $0,%r12
/* Diagonal round, state A, then palignr $12/$8/$4 to undo the row rotation. */
3366 paddd %xmm4,%xmm0
3367 pxor %xmm0,%xmm12
3368 pshufb .rol16(%rip),%xmm12
3369 paddd %xmm12,%xmm8
3370 pxor %xmm8,%xmm4
3371 movdqa %xmm4,%xmm3
3372 pslld $12,%xmm3
3373 psrld $20,%xmm4
3374 pxor %xmm3,%xmm4
3375 paddd %xmm4,%xmm0
3376 pxor %xmm0,%xmm12
3377 pshufb .rol8(%rip),%xmm12
3378 paddd %xmm12,%xmm8
3379 pxor %xmm8,%xmm4
3380 movdqa %xmm4,%xmm3
3381 pslld $7,%xmm3
3382 psrld $25,%xmm4
3383 pxor %xmm3,%xmm4
3384.byte 102,15,58,15,228,12
3385.byte 102,69,15,58,15,192,8
3386.byte 102,69,15,58,15,228,4
/* Diagonal round, state B. */
3387 paddd %xmm5,%xmm1
3388 pxor %xmm1,%xmm13
3389 pshufb .rol16(%rip),%xmm13
3390 paddd %xmm13,%xmm9
3391 pxor %xmm9,%xmm5
3392 movdqa %xmm5,%xmm3
3393 pslld $12,%xmm3
3394 psrld $20,%xmm5
3395 pxor %xmm3,%xmm5
3396 paddd %xmm5,%xmm1
3397 pxor %xmm1,%xmm13
3398 pshufb .rol8(%rip),%xmm13
3399 paddd %xmm13,%xmm9
3400 pxor %xmm9,%xmm5
3401 movdqa %xmm5,%xmm3
3402 pslld $7,%xmm3
3403 psrld $25,%xmm5
3404 pxor %xmm3,%xmm5
3405.byte 102,15,58,15,237,12
3406.byte 102,69,15,58,15,201,8
3407.byte 102,69,15,58,15,237,4
/* Diagonal round, state C. */
3408 paddd %xmm6,%xmm2
3409 pxor %xmm2,%xmm14
3410 pshufb .rol16(%rip),%xmm14
3411 paddd %xmm14,%xmm10
3412 pxor %xmm10,%xmm6
3413 movdqa %xmm6,%xmm3
3414 pslld $12,%xmm3
3415 psrld $20,%xmm6
3416 pxor %xmm3,%xmm6
3417 paddd %xmm6,%xmm2
3418 pxor %xmm2,%xmm14
3419 pshufb .rol8(%rip),%xmm14
3420 paddd %xmm14,%xmm10
3421 pxor %xmm10,%xmm6
3422 movdqa %xmm6,%xmm3
3423 pslld $7,%xmm3
3424 psrld $25,%xmm6
3425 pxor %xmm3,%xmm6
3426.byte 102,15,58,15,246,12
3427.byte 102,69,15,58,15,210,8
3428.byte 102,69,15,58,15,246,4
3429
3430 leaq 16(%rdi),%rdi
/* While rcx > 0 repeat from 1:; afterwards r8+1 more passes from 2:. */
3431 decq %rcx
3432 jg 1b
3433 decq %r8
3434 jge 2b
/* Feed-forward: add the initial rows (and each state's stored counter row) back. */
3435 paddd .chacha20_consts(%rip),%xmm2
3436 paddd 48(%rbp),%xmm6
3437 paddd 64(%rbp),%xmm10
3438 paddd 128(%rbp),%xmm14
3439 paddd .chacha20_consts(%rip),%xmm1
3440 paddd 48(%rbp),%xmm5
3441 paddd 64(%rbp),%xmm9
3442 paddd 112(%rbp),%xmm13
3443 paddd .chacha20_consts(%rip),%xmm0
3444 paddd 48(%rbp),%xmm4
3445 paddd 64(%rbp),%xmm8
3446 paddd 96(%rbp),%xmm12
/* Bytes 0..63: input at (%rsi) XOR state C, stored at (%rdi). */
3447 movdqu 0 + 0(%rsi),%xmm3
3448 movdqu 16 + 0(%rsi),%xmm7
3449 movdqu 32 + 0(%rsi),%xmm11
3450 movdqu 48 + 0(%rsi),%xmm15
3451 pxor %xmm3,%xmm2
3452 pxor %xmm7,%xmm6
3453 pxor %xmm11,%xmm10
3454 pxor %xmm14,%xmm15
3455 movdqu %xmm2,0 + 0(%rdi)
3456 movdqu %xmm6,16 + 0(%rdi)
3457 movdqu %xmm10,32 + 0(%rdi)
3458 movdqu %xmm15,48 + 0(%rdi)
/* Bytes 64..127: input XOR state B. */
3459 movdqu 0 + 64(%rsi),%xmm3
3460 movdqu 16 + 64(%rsi),%xmm7
3461 movdqu 32 + 64(%rsi),%xmm11
3462 movdqu 48 + 64(%rsi),%xmm15
3463 pxor %xmm3,%xmm1
3464 pxor %xmm7,%xmm5
3465 pxor %xmm11,%xmm9
3466 pxor %xmm13,%xmm15
3467 movdqu %xmm1,0 + 64(%rdi)
3468 movdqu %xmm5,16 + 64(%rdi)
3469 movdqu %xmm9,32 + 64(%rdi)
3470 movdqu %xmm15,48 + 64(%rdi)
3471
/* 128 bytes were just written at (%rdi): rcx = bytes for the following hash loop, rbx/rsi advance past the consumed input. Falls through. */
3472 movq $128,%rcx
3473 subq $128,%rbx
3474 leaq 128(%rsi),%rsi
3475
/*
 * seal_sse_128_seal_hash: absorb rcx bytes starting at (%rdi) into the
 * Poly1305 accumulator r10:r11:r12, 16 bytes per iteration.
 *
 * In:  rcx = byte count (only whole 16-byte blocks are consumed),
 *      rdi = data to hash, 0(%rbp)/8(%rbp) = 128-bit multiplier.
 * Out: rdi advanced and rcx decreased by 16 per block hashed.
 * Clobbers rax, rdx, r9, r13, r14, r15, flags.
 * Leaves to seal_sse_128_seal once fewer than 16 bytes remain in rcx.
 */
3476seal_sse_128_seal_hash:
3477 cmpq $16,%rcx
3478 jb seal_sse_128_seal
/* acc += block + 2^128 (adcq $1 supplies the pad bit in the top limb). */
3479 addq 0(%rdi),%r10
3480 adcq 8+0(%rdi),%r11
3481 adcq $1,%r12
/* acc *= multiplier; product limbs are gathered in r13:r14:r15:r9. */
3482 movq 0+0(%rbp),%rax
3483 movq %rax,%r15
3484 mulq %r10
3485 movq %rax,%r13
3486 movq %rdx,%r14
3487 movq 0+0(%rbp),%rax
3488 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003489 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003490 addq %rax,%r14
3491 adcq %rdx,%r15
3492 movq 8+0(%rbp),%rax
3493 movq %rax,%r9
3494 mulq %r10
3495 addq %rax,%r14
3496 adcq $0,%rdx
3497 movq %rdx,%r10
3498 movq 8+0(%rbp),%rax
3499 mulq %r11
3500 addq %rax,%r15
3501 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003502 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003503 addq %r10,%r15
3504 adcq %rdx,%r9
/* Reduce mod 2^130-5: low 130 bits stay in r10:r11:r12; the bits c above 2^130 return as 4*c (andq $-4) plus c (shift right by 2). */
3505 movq %r13,%r10
3506 movq %r14,%r11
3507 movq %r15,%r12
3508 andq $3,%r12
3509 movq %r15,%r13
3510 andq $-4,%r13
3511 movq %r9,%r14
3512 shrdq $2,%r9,%r15
3513 shrq $2,%r9
3514 addq %r13,%r10
3515 adcq %r14,%r11
3516 adcq $0,%r12
3517 addq %r15,%r10
3518 adcq %r9,%r11
3519 adcq $0,%r12
3520
3521 subq $16,%rcx
3522 leaq 16(%rdi),%rdi
3523 jmp seal_sse_128_seal_hash
3524
/*
 * seal_sse_128_seal: encrypt and hash the remaining input 16 bytes at a
 * time using keystream already held in registers.
 *
 * In:  rbx = bytes left, rsi = input, rdi = output,
 *      keystream queue xmm0, xmm4, xmm8, xmm12, xmm1, xmm5, xmm9, xmm13
 *      (consumed in that order, 16 bytes each).
 * Each iteration XORs one input block with xmm0, stores it, absorbs the
 * stored block into the Poly1305 accumulator r10:r11:r12, then shifts
 * the queue down by one register.
 * Leaves to seal_sse_tail_16 when fewer than 16 bytes remain.
 */
3525seal_sse_128_seal:
3526 cmpq $16,%rbx
3527 jb seal_sse_tail_16
3528 subq $16,%rbx
3529
3530 movdqu 0(%rsi),%xmm3
3531 pxor %xmm3,%xmm0
3532 movdqu %xmm0,0(%rdi)
3533
/* Hash the 16 bytes just written: acc += block + 2^128. */
3534 addq 0(%rdi),%r10
3535 adcq 8(%rdi),%r11
3536 adcq $1,%r12
3537 leaq 16(%rsi),%rsi
3538 leaq 16(%rdi),%rdi
/* acc *= multiplier at 0(%rbp)/8(%rbp); product limbs in r13:r14:r15:r9. */
3539 movq 0+0(%rbp),%rax
3540 movq %rax,%r15
3541 mulq %r10
3542 movq %rax,%r13
3543 movq %rdx,%r14
3544 movq 0+0(%rbp),%rax
3545 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003546 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003547 addq %rax,%r14
3548 adcq %rdx,%r15
3549 movq 8+0(%rbp),%rax
3550 movq %rax,%r9
3551 mulq %r10
3552 addq %rax,%r14
3553 adcq $0,%rdx
3554 movq %rdx,%r10
3555 movq 8+0(%rbp),%rax
3556 mulq %r11
3557 addq %rax,%r15
3558 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003559 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003560 addq %r10,%r15
3561 adcq %rdx,%r9
/* Reduce mod 2^130-5 (fold the bits above 2^130 back in multiplied by 5). */
3562 movq %r13,%r10
3563 movq %r14,%r11
3564 movq %r15,%r12
3565 andq $3,%r12
3566 movq %r15,%r13
3567 andq $-4,%r13
3568 movq %r9,%r14
3569 shrdq $2,%r9,%r15
3570 shrq $2,%r9
3571 addq %r13,%r10
3572 adcq %r14,%r11
3573 adcq $0,%r12
3574 addq %r15,%r10
3575 adcq %r9,%r11
3576 adcq $0,%r12
3577
3578
/* Advance the keystream queue: xmm0 <- xmm4 <- xmm8 <- xmm12 <- xmm1 <- xmm5 <- xmm9 <- xmm13. */
3579 movdqa %xmm4,%xmm0
3580 movdqa %xmm8,%xmm4
3581 movdqa %xmm12,%xmm8
3582 movdqa %xmm1,%xmm12
3583 movdqa %xmm5,%xmm1
3584 movdqa %xmm9,%xmm5
3585 movdqa %xmm13,%xmm9
3586 jmp seal_sse_128_seal
3587
/*
 * seal_sse_tail_16: encrypt and hash the final partial block.
 *
 * In:  rbx = bytes left (the only visible entry, from seal_sse_128_seal,
 *      arrives with rbx < 16), rsi = input, rdi = output,
 *      xmm0 = next 16 keystream bytes.
 * With rbx == 0 this goes straight to seal_sse_finalize. Otherwise the
 * bytes are gathered one at a time, XORed with xmm0, written out one at
 * a time, then masked to rbx bytes and absorbed into the Poly1305
 * accumulator r10:r11:r12 as a zero-padded block.
 * Falls through into seal_sse_finalize.
 */
3588seal_sse_tail_16:
3589 testq %rbx,%rbx
3590 jz seal_sse_finalize
3591
/* r8 = rbx*16 indexes the 16-byte entries of .and_masks; rsi is moved to the last input byte. */
3592 movq %rbx,%r8
3593 shlq $4,%r8
3594 leaq .and_masks(%rip),%r13
3595 movq %rbx,%rcx
3596 leaq -1(%rsi,%rbx), %rsi
3597 pxor %xmm15,%xmm15
35981:
/* Load back to front: shift xmm15 up one byte and insert the next input byte at position 0, so byte i of the input ends in byte i of xmm15. */
3599 pslldq $1,%xmm15
3600 pinsrb $0,(%rsi),%xmm15
3601 leaq -1(%rsi),%rsi
3602 decq %rcx
3603 jne 1b
3604
3605
/* XOR with keystream; bytes at and above position rbx now hold raw keystream and are masked off below. */
3606 pxor %xmm0,%xmm15
3607
3608
3609 movq %rbx,%rcx
3610 movdqu %xmm15,%xmm0
36112:
/* Store the rbx output bytes one at a time from the copy in xmm0. */
3612 pextrb $0,%xmm0,(%rdi)
3613 psrldq $1,%xmm0
3614 addq $1,%rdi
3615 subq $1,%rcx
3616 jnz 2b
3617
/* Keep only the low rbx bytes: entry (rbx-1) of .and_masks sits at -16(%r13,%r8). */
3618 pand -16(%r13,%r8), %xmm15
/* Hand-encoded movq %xmm15,%r13 (low 64 bits); pextrq fetches the high 64 bits. */
3619.byte 102,77,15,126,253
3620 pextrq $1,%xmm15,%r14
/* acc += masked block + 2^128, then multiply by 0(%rbp)/8(%rbp) and reduce mod 2^130-5. */
3621 addq %r13,%r10
3622 adcq %r14,%r11
3623 adcq $1,%r12
3624 movq 0+0(%rbp),%rax
3625 movq %rax,%r15
3626 mulq %r10
3627 movq %rax,%r13
3628 movq %rdx,%r14
3629 movq 0+0(%rbp),%rax
3630 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003631 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003632 addq %rax,%r14
3633 adcq %rdx,%r15
3634 movq 8+0(%rbp),%rax
3635 movq %rax,%r9
3636 mulq %r10
3637 addq %rax,%r14
3638 adcq $0,%rdx
3639 movq %rdx,%r10
3640 movq 8+0(%rbp),%rax
3641 mulq %r11
3642 addq %rax,%r15
3643 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003644 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003645 addq %r10,%r15
3646 adcq %rdx,%r9
3647 movq %r13,%r10
3648 movq %r14,%r11
3649 movq %r15,%r12
3650 andq $3,%r12
3651 movq %r15,%r13
3652 andq $-4,%r13
3653 movq %r9,%r14
3654 shrdq $2,%r9,%r15
3655 shrq $2,%r9
3656 addq %r13,%r10
3657 adcq %r14,%r11
3658 adcq $0,%r12
3659 addq %r15,%r10
3660 adcq %r9,%r11
3661 adcq $0,%r12
3662
/*
 * seal_sse_finalize: absorb one last 16-byte block from 32(%rbp), finish
 * the Poly1305 computation, store the 16-byte result and return.
 *
 * NOTE(review): 32(%rbp) is presumably the AD/ciphertext length block;
 * it is written outside this chunk - confirm against the prologue.
 * The result is written through the pointer popped into r9, which is
 * presumably pushed by the function prologue (not visible here).
 */
3663seal_sse_finalize:
/* acc += block at 32(%rbp) + 2^128, multiply by 0(%rbp)/8(%rbp), reduce mod 2^130-5. */
3664 addq 32(%rbp),%r10
3665 adcq 8+32(%rbp),%r11
3666 adcq $1,%r12
3667 movq 0+0(%rbp),%rax
3668 movq %rax,%r15
3669 mulq %r10
3670 movq %rax,%r13
3671 movq %rdx,%r14
3672 movq 0+0(%rbp),%rax
3673 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003674 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003675 addq %rax,%r14
3676 adcq %rdx,%r15
3677 movq 8+0(%rbp),%rax
3678 movq %rax,%r9
3679 mulq %r10
3680 addq %rax,%r14
3681 adcq $0,%rdx
3682 movq %rdx,%r10
3683 movq 8+0(%rbp),%rax
3684 mulq %r11
3685 addq %rax,%r15
3686 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003687 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003688 addq %r10,%r15
3689 adcq %rdx,%r9
3690 movq %r13,%r10
3691 movq %r14,%r11
3692 movq %r15,%r12
3693 andq $3,%r12
3694 movq %r15,%r13
3695 andq $-4,%r13
3696 movq %r9,%r14
3697 shrdq $2,%r9,%r15
3698 shrq $2,%r9
3699 addq %r13,%r10
3700 adcq %r14,%r11
3701 adcq $0,%r12
3702 addq %r15,%r10
3703 adcq %r9,%r11
3704 adcq $0,%r12
3705
3706
/* Final reduction: compute acc - (2^130 - 5), whose limbs are -5, -1, 3; if that borrows (acc was already below the modulus) cmovc restores the saved value. */
3707 movq %r10,%r13
3708 movq %r11,%r14
3709 movq %r12,%r15
3710 subq $-5,%r10
3711 sbbq $-1,%r11
3712 sbbq $3,%r12
3713 cmovcq %r13,%r10
3714 cmovcq %r14,%r11
3715 cmovcq %r15,%r12
3716
/* Add the 128-bit value at 16(%rbp); the carry out of r11 is dropped, i.e. the sum is taken mod 2^128. */
3717 addq 0+16(%rbp),%r10
3718 adcq 8+16(%rbp),%r11
3719
/* Release the 288+32 byte frame, pop the output pointer and store the 16-byte result through it. */
3720 addq $288 + 32,%rsp
3721.cfi_adjust_cfa_offset -(288 + 32)
3722 popq %r9
3723.cfi_adjust_cfa_offset -8
3724 movq %r10,0(%r9)
3725 movq %r11,8(%r9)
3726
/* Restore callee-saved registers and return; .byte 0xf3,0xc3 encodes "rep ret". */
3727 popq %r15
3728.cfi_adjust_cfa_offset -8
3729 popq %r14
3730.cfi_adjust_cfa_offset -8
3731 popq %r13
3732.cfi_adjust_cfa_offset -8
3733 popq %r12
3734.cfi_adjust_cfa_offset -8
3735 popq %rbx
3736.cfi_adjust_cfa_offset -8
3737 popq %rbp
3738.cfi_adjust_cfa_offset -8
3739 .byte 0xf3,0xc3
/* Re-add the six 8-byte pops plus the frame to the CFA offset for the code placed after the return, which is reached by jump with the frame still allocated. */
3740.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
3741
/*
 * seal_sse_128: set-up path that computes three ChaCha20 blocks at once
 * and keeps all keystream in registers.
 *
 * In:  r9 points at 48 bytes loaded as state rows 1..3 (0(%r9), 16(%r9),
 *      32(%r9)). NOTE(review): presumably key followed by counter/nonce;
 *      the layout is defined by the caller - confirm.
 * State A = xmm0/4/8/12   (row 3 = loaded row + 1 * .sse_inc)
 * State B = xmm1/5/9/13   (row 3 = loaded row + 2 * .sse_inc)
 * State C = xmm2/6/10/14  (row 3 = loaded row as-is)
 * xmm7, xmm11, xmm15 keep copies of rows 1, 2 and state A row 3 for the
 * feed-forward; xmm3 is rotation scratch; r10 counts 10 double rounds.
 * After the rounds, state C row 0 ANDed with .clamp is stored at
 * 0(%rbp) and state C row 1 at 16(%rbp); only rows 0 and 1 of state C
 * are finalized. States A and B become the keystream queue for
 * seal_sse_128_seal after poly_hash_ad_internal has been called.
 */
3742seal_sse_128:
3743 movdqu .chacha20_consts(%rip),%xmm0
3744 movdqa %xmm0,%xmm1
3745 movdqa %xmm0,%xmm2
3746 movdqu 0(%r9),%xmm4
3747 movdqa %xmm4,%xmm5
3748 movdqa %xmm4,%xmm6
3749 movdqu 16(%r9),%xmm8
3750 movdqa %xmm8,%xmm9
3751 movdqa %xmm8,%xmm10
3752 movdqu 32(%r9),%xmm14
3753 movdqa %xmm14,%xmm12
3754 paddd .sse_inc(%rip),%xmm12
3755 movdqa %xmm12,%xmm13
3756 paddd .sse_inc(%rip),%xmm13
3757 movdqa %xmm4,%xmm7
3758 movdqa %xmm8,%xmm11
3759 movdqa %xmm12,%xmm15
3760 movq $10,%r10
37611:
/* Column round, state A (rotations 16, 12, 8, 7), then hand-encoded palignr $4/$8/$12 on xmm4/xmm8/xmm12. */
3762 paddd %xmm4,%xmm0
3763 pxor %xmm0,%xmm12
3764 pshufb .rol16(%rip),%xmm12
3765 paddd %xmm12,%xmm8
3766 pxor %xmm8,%xmm4
3767 movdqa %xmm4,%xmm3
3768 pslld $12,%xmm3
3769 psrld $20,%xmm4
3770 pxor %xmm3,%xmm4
3771 paddd %xmm4,%xmm0
3772 pxor %xmm0,%xmm12
3773 pshufb .rol8(%rip),%xmm12
3774 paddd %xmm12,%xmm8
3775 pxor %xmm8,%xmm4
3776 movdqa %xmm4,%xmm3
3777 pslld $7,%xmm3
3778 psrld $25,%xmm4
3779 pxor %xmm3,%xmm4
3780.byte 102,15,58,15,228,4
3781.byte 102,69,15,58,15,192,8
3782.byte 102,69,15,58,15,228,12
/* Column round, state B. */
3783 paddd %xmm5,%xmm1
3784 pxor %xmm1,%xmm13
3785 pshufb .rol16(%rip),%xmm13
3786 paddd %xmm13,%xmm9
3787 pxor %xmm9,%xmm5
3788 movdqa %xmm5,%xmm3
3789 pslld $12,%xmm3
3790 psrld $20,%xmm5
3791 pxor %xmm3,%xmm5
3792 paddd %xmm5,%xmm1
3793 pxor %xmm1,%xmm13
3794 pshufb .rol8(%rip),%xmm13
3795 paddd %xmm13,%xmm9
3796 pxor %xmm9,%xmm5
3797 movdqa %xmm5,%xmm3
3798 pslld $7,%xmm3
3799 psrld $25,%xmm5
3800 pxor %xmm3,%xmm5
3801.byte 102,15,58,15,237,4
3802.byte 102,69,15,58,15,201,8
3803.byte 102,69,15,58,15,237,12
/* Column round, state C. */
3804 paddd %xmm6,%xmm2
3805 pxor %xmm2,%xmm14
3806 pshufb .rol16(%rip),%xmm14
3807 paddd %xmm14,%xmm10
3808 pxor %xmm10,%xmm6
3809 movdqa %xmm6,%xmm3
3810 pslld $12,%xmm3
3811 psrld $20,%xmm6
3812 pxor %xmm3,%xmm6
3813 paddd %xmm6,%xmm2
3814 pxor %xmm2,%xmm14
3815 pshufb .rol8(%rip),%xmm14
3816 paddd %xmm14,%xmm10
3817 pxor %xmm10,%xmm6
3818 movdqa %xmm6,%xmm3
3819 pslld $7,%xmm3
3820 psrld $25,%xmm6
3821 pxor %xmm3,%xmm6
3822.byte 102,15,58,15,246,4
3823.byte 102,69,15,58,15,210,8
3824.byte 102,69,15,58,15,246,12
/* Diagonal round, state A, then palignr $12/$8/$4 to undo the row rotation. */
3825 paddd %xmm4,%xmm0
3826 pxor %xmm0,%xmm12
3827 pshufb .rol16(%rip),%xmm12
3828 paddd %xmm12,%xmm8
3829 pxor %xmm8,%xmm4
3830 movdqa %xmm4,%xmm3
3831 pslld $12,%xmm3
3832 psrld $20,%xmm4
3833 pxor %xmm3,%xmm4
3834 paddd %xmm4,%xmm0
3835 pxor %xmm0,%xmm12
3836 pshufb .rol8(%rip),%xmm12
3837 paddd %xmm12,%xmm8
3838 pxor %xmm8,%xmm4
3839 movdqa %xmm4,%xmm3
3840 pslld $7,%xmm3
3841 psrld $25,%xmm4
3842 pxor %xmm3,%xmm4
3843.byte 102,15,58,15,228,12
3844.byte 102,69,15,58,15,192,8
3845.byte 102,69,15,58,15,228,4
/* Diagonal round, state B. */
3846 paddd %xmm5,%xmm1
3847 pxor %xmm1,%xmm13
3848 pshufb .rol16(%rip),%xmm13
3849 paddd %xmm13,%xmm9
3850 pxor %xmm9,%xmm5
3851 movdqa %xmm5,%xmm3
3852 pslld $12,%xmm3
3853 psrld $20,%xmm5
3854 pxor %xmm3,%xmm5
3855 paddd %xmm5,%xmm1
3856 pxor %xmm1,%xmm13
3857 pshufb .rol8(%rip),%xmm13
3858 paddd %xmm13,%xmm9
3859 pxor %xmm9,%xmm5
3860 movdqa %xmm5,%xmm3
3861 pslld $7,%xmm3
3862 psrld $25,%xmm5
3863 pxor %xmm3,%xmm5
3864.byte 102,15,58,15,237,12
3865.byte 102,69,15,58,15,201,8
3866.byte 102,69,15,58,15,237,4
/* Diagonal round, state C. */
3867 paddd %xmm6,%xmm2
3868 pxor %xmm2,%xmm14
3869 pshufb .rol16(%rip),%xmm14
3870 paddd %xmm14,%xmm10
3871 pxor %xmm10,%xmm6
3872 movdqa %xmm6,%xmm3
3873 pslld $12,%xmm3
3874 psrld $20,%xmm6
3875 pxor %xmm3,%xmm6
3876 paddd %xmm6,%xmm2
3877 pxor %xmm2,%xmm14
3878 pshufb .rol8(%rip),%xmm14
3879 paddd %xmm14,%xmm10
3880 pxor %xmm10,%xmm6
3881 movdqa %xmm6,%xmm3
3882 pslld $7,%xmm3
3883 psrld $25,%xmm6
3884 pxor %xmm3,%xmm6
3885.byte 102,15,58,15,246,12
3886.byte 102,69,15,58,15,210,8
3887.byte 102,69,15,58,15,246,4
3888
3889 decq %r10
3890 jnz 1b
/* Feed-forward. Rows 2 and 3 of state C (xmm10, xmm14) are not finalized because they are not used afterwards. xmm15 is bumped by .sse_inc so that it matches state B's initial row 3. */
3891 paddd .chacha20_consts(%rip),%xmm0
3892 paddd .chacha20_consts(%rip),%xmm1
3893 paddd .chacha20_consts(%rip),%xmm2
3894 paddd %xmm7,%xmm4
3895 paddd %xmm7,%xmm5
3896 paddd %xmm7,%xmm6
3897 paddd %xmm11,%xmm8
3898 paddd %xmm11,%xmm9
3899 paddd %xmm15,%xmm12
3900 paddd .sse_inc(%rip),%xmm15
3901 paddd %xmm15,%xmm13
3902
/* Save state C row 0 masked with .clamp at 0(%rbp) (the multiplier used by the hash steps) and row 1 at 16(%rbp) (added in seal_sse_finalize). */
3903 pand .clamp(%rip),%xmm2
3904 movdqa %xmm2,0(%rbp)
3905 movdqa %xmm6,16(%rbp)
3906
/* Register-to-itself move: no architectural effect. */
3907 movq %r8,%r8
3908 call poly_hash_ad_internal
3909 jmp seal_sse_128_seal
3910.size chacha20_poly1305_seal, .-chacha20_poly1305_seal
3911
3912
3913.type chacha20_poly1305_open_avx2,@function
3914.align 64
3915chacha20_poly1305_open_avx2:
3916 vzeroupper
3917 vmovdqa .chacha20_consts(%rip),%ymm0
3918 vbroadcasti128 0(%r9),%ymm4
3919 vbroadcasti128 16(%r9),%ymm8
3920 vbroadcasti128 32(%r9),%ymm12
3921 vpaddd .avx2_init(%rip),%ymm12,%ymm12
3922 cmpq $192,%rbx
3923 jbe open_avx2_192
3924 cmpq $320,%rbx
3925 jbe open_avx2_320
3926
3927 vmovdqa %ymm4,64(%rbp)
3928 vmovdqa %ymm8,96(%rbp)
3929 vmovdqa %ymm12,160(%rbp)
3930 movq $10,%r10
39311:
3932 vpaddd %ymm4,%ymm0,%ymm0
3933 vpxor %ymm0,%ymm12,%ymm12
3934 vpshufb .rol16(%rip),%ymm12,%ymm12
3935 vpaddd %ymm12,%ymm8,%ymm8
3936 vpxor %ymm8,%ymm4,%ymm4
3937 vpsrld $20,%ymm4,%ymm3
3938 vpslld $12,%ymm4,%ymm4
3939 vpxor %ymm3,%ymm4,%ymm4
3940 vpaddd %ymm4,%ymm0,%ymm0
3941 vpxor %ymm0,%ymm12,%ymm12
3942 vpshufb .rol8(%rip),%ymm12,%ymm12
3943 vpaddd %ymm12,%ymm8,%ymm8
3944 vpxor %ymm8,%ymm4,%ymm4
3945 vpslld $7,%ymm4,%ymm3
3946 vpsrld $25,%ymm4,%ymm4
3947 vpxor %ymm3,%ymm4,%ymm4
3948 vpalignr $12,%ymm12,%ymm12,%ymm12
3949 vpalignr $8,%ymm8,%ymm8,%ymm8
3950 vpalignr $4,%ymm4,%ymm4,%ymm4
3951 vpaddd %ymm4,%ymm0,%ymm0
3952 vpxor %ymm0,%ymm12,%ymm12
3953 vpshufb .rol16(%rip),%ymm12,%ymm12
3954 vpaddd %ymm12,%ymm8,%ymm8
3955 vpxor %ymm8,%ymm4,%ymm4
3956 vpsrld $20,%ymm4,%ymm3
3957 vpslld $12,%ymm4,%ymm4
3958 vpxor %ymm3,%ymm4,%ymm4
3959 vpaddd %ymm4,%ymm0,%ymm0
3960 vpxor %ymm0,%ymm12,%ymm12
3961 vpshufb .rol8(%rip),%ymm12,%ymm12
3962 vpaddd %ymm12,%ymm8,%ymm8
3963 vpxor %ymm8,%ymm4,%ymm4
3964 vpslld $7,%ymm4,%ymm3
3965 vpsrld $25,%ymm4,%ymm4
3966 vpxor %ymm3,%ymm4,%ymm4
3967 vpalignr $4,%ymm12,%ymm12,%ymm12
3968 vpalignr $8,%ymm8,%ymm8,%ymm8
3969 vpalignr $12,%ymm4,%ymm4,%ymm4
3970
3971 decq %r10
3972 jne 1b
3973 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
3974 vpaddd 64(%rbp),%ymm4,%ymm4
3975 vpaddd 96(%rbp),%ymm8,%ymm8
3976 vpaddd 160(%rbp),%ymm12,%ymm12
3977
3978 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
3979
3980 vpand .clamp(%rip),%ymm3,%ymm3
3981 vmovdqa %ymm3,0(%rbp)
3982
3983 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
3984 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
3985
3986 movq %r8,%r8
3987 call poly_hash_ad_internal
3988 xorq %rcx,%rcx
3989
39901:
3991 addq 0(%rsi,%rcx), %r10
3992 adcq 8+0(%rsi,%rcx), %r11
3993 adcq $1,%r12
3994 movq 0+0(%rbp),%rax
3995 movq %rax,%r15
3996 mulq %r10
3997 movq %rax,%r13
3998 movq %rdx,%r14
3999 movq 0+0(%rbp),%rax
4000 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004001 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004002 addq %rax,%r14
4003 adcq %rdx,%r15
4004 movq 8+0(%rbp),%rax
4005 movq %rax,%r9
4006 mulq %r10
4007 addq %rax,%r14
4008 adcq $0,%rdx
4009 movq %rdx,%r10
4010 movq 8+0(%rbp),%rax
4011 mulq %r11
4012 addq %rax,%r15
4013 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004014 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004015 addq %r10,%r15
4016 adcq %rdx,%r9
4017 movq %r13,%r10
4018 movq %r14,%r11
4019 movq %r15,%r12
4020 andq $3,%r12
4021 movq %r15,%r13
4022 andq $-4,%r13
4023 movq %r9,%r14
4024 shrdq $2,%r9,%r15
4025 shrq $2,%r9
4026 addq %r13,%r10
4027 adcq %r14,%r11
4028 adcq $0,%r12
4029 addq %r15,%r10
4030 adcq %r9,%r11
4031 adcq $0,%r12
4032
4033 addq $16,%rcx
4034 cmpq $64,%rcx
4035 jne 1b
4036
4037 vpxor 0(%rsi),%ymm0,%ymm0
4038 vpxor 32(%rsi),%ymm4,%ymm4
4039 vmovdqu %ymm0,0(%rdi)
4040 vmovdqu %ymm4,32(%rdi)
4041 leaq 64(%rsi),%rsi
4042 leaq 64(%rdi),%rdi
4043 subq $64,%rbx
40441:
4045
4046 cmpq $512,%rbx
4047 jb 3f
4048 vmovdqa .chacha20_consts(%rip),%ymm0
4049 vmovdqa 64(%rbp),%ymm4
4050 vmovdqa 96(%rbp),%ymm8
4051 vmovdqa %ymm0,%ymm1
4052 vmovdqa %ymm4,%ymm5
4053 vmovdqa %ymm8,%ymm9
4054 vmovdqa %ymm0,%ymm2
4055 vmovdqa %ymm4,%ymm6
4056 vmovdqa %ymm8,%ymm10
4057 vmovdqa %ymm0,%ymm3
4058 vmovdqa %ymm4,%ymm7
4059 vmovdqa %ymm8,%ymm11
4060 vmovdqa .avx2_inc(%rip),%ymm12
4061 vpaddd 160(%rbp),%ymm12,%ymm15
4062 vpaddd %ymm15,%ymm12,%ymm14
4063 vpaddd %ymm14,%ymm12,%ymm13
4064 vpaddd %ymm13,%ymm12,%ymm12
4065 vmovdqa %ymm15,256(%rbp)
4066 vmovdqa %ymm14,224(%rbp)
4067 vmovdqa %ymm13,192(%rbp)
4068 vmovdqa %ymm12,160(%rbp)
4069
4070 xorq %rcx,%rcx
40712:
4072 addq 0*8(%rsi,%rcx), %r10
4073 adcq 8+0*8(%rsi,%rcx), %r11
4074 adcq $1,%r12
4075 vmovdqa %ymm8,128(%rbp)
4076 vmovdqa .rol16(%rip),%ymm8
4077 vpaddd %ymm7,%ymm3,%ymm3
4078 vpaddd %ymm6,%ymm2,%ymm2
4079 vpaddd %ymm5,%ymm1,%ymm1
4080 vpaddd %ymm4,%ymm0,%ymm0
4081 vpxor %ymm3,%ymm15,%ymm15
4082 vpxor %ymm2,%ymm14,%ymm14
4083 vpxor %ymm1,%ymm13,%ymm13
4084 vpxor %ymm0,%ymm12,%ymm12
4085 movq 0+0(%rbp),%rdx
4086 movq %rdx,%r15
4087 mulxq %r10,%r13,%r14
4088 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004089 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004090 addq %rax,%r14
4091 adcq %rdx,%r15
4092 vpshufb %ymm8,%ymm15,%ymm15
4093 vpshufb %ymm8,%ymm14,%ymm14
4094 vpshufb %ymm8,%ymm13,%ymm13
4095 vpshufb %ymm8,%ymm12,%ymm12
4096 vmovdqa 128(%rbp),%ymm8
4097 vpaddd %ymm15,%ymm11,%ymm11
4098 vpaddd %ymm14,%ymm10,%ymm10
4099 vpaddd %ymm13,%ymm9,%ymm9
4100 vpaddd %ymm12,%ymm8,%ymm8
4101 movq 8+0(%rbp),%rdx
4102 mulxq %r10,%r10,%rax
4103 addq %r10,%r14
4104 mulxq %r11,%r11,%r9
4105 adcq %r11,%r15
4106 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004107 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004108 vpxor %ymm11,%ymm7,%ymm7
4109 vpxor %ymm10,%ymm6,%ymm6
4110 vpxor %ymm9,%ymm5,%ymm5
4111 vpxor %ymm8,%ymm4,%ymm4
4112 vmovdqa %ymm8,128(%rbp)
4113 vpsrld $20,%ymm7,%ymm8
4114 vpslld $32-20,%ymm7,%ymm7
4115 vpxor %ymm8,%ymm7,%ymm7
4116 vpsrld $20,%ymm6,%ymm8
4117 vpslld $32-20,%ymm6,%ymm6
4118 vpxor %ymm8,%ymm6,%ymm6
4119 vpsrld $20,%ymm5,%ymm8
4120 addq %rax,%r15
4121 adcq %rdx,%r9
4122 vpslld $32-20,%ymm5,%ymm5
4123 vpxor %ymm8,%ymm5,%ymm5
4124 vpsrld $20,%ymm4,%ymm8
4125 vpslld $32-20,%ymm4,%ymm4
4126 vpxor %ymm8,%ymm4,%ymm4
4127 vmovdqa .rol8(%rip),%ymm8
4128 vpaddd %ymm7,%ymm3,%ymm3
4129 vpaddd %ymm6,%ymm2,%ymm2
4130 vpaddd %ymm5,%ymm1,%ymm1
4131 vpaddd %ymm4,%ymm0,%ymm0
4132 movq %r13,%r10
4133 movq %r14,%r11
4134 movq %r15,%r12
4135 andq $3,%r12
4136 movq %r15,%r13
4137 andq $-4,%r13
4138 movq %r9,%r14
4139 shrdq $2,%r9,%r15
4140 shrq $2,%r9
4141 addq %r13,%r10
4142 adcq %r14,%r11
4143 adcq $0,%r12
4144 addq %r15,%r10
4145 adcq %r9,%r11
4146 adcq $0,%r12
4147 vpxor %ymm3,%ymm15,%ymm15
4148 vpxor %ymm2,%ymm14,%ymm14
4149 vpxor %ymm1,%ymm13,%ymm13
4150 vpxor %ymm0,%ymm12,%ymm12
4151 vpshufb %ymm8,%ymm15,%ymm15
4152 vpshufb %ymm8,%ymm14,%ymm14
4153 vpshufb %ymm8,%ymm13,%ymm13
4154 vpshufb %ymm8,%ymm12,%ymm12
4155 vmovdqa 128(%rbp),%ymm8
4156 addq 2*8(%rsi,%rcx), %r10
4157 adcq 8+2*8(%rsi,%rcx), %r11
4158 adcq $1,%r12
4159 vpaddd %ymm15,%ymm11,%ymm11
4160 vpaddd %ymm14,%ymm10,%ymm10
4161 vpaddd %ymm13,%ymm9,%ymm9
4162 vpaddd %ymm12,%ymm8,%ymm8
4163 vpxor %ymm11,%ymm7,%ymm7
4164 vpxor %ymm10,%ymm6,%ymm6
4165 vpxor %ymm9,%ymm5,%ymm5
4166 vpxor %ymm8,%ymm4,%ymm4
4167 movq 0+0(%rbp),%rdx
4168 movq %rdx,%r15
4169 mulxq %r10,%r13,%r14
4170 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004171 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004172 addq %rax,%r14
4173 adcq %rdx,%r15
4174 vmovdqa %ymm8,128(%rbp)
4175 vpsrld $25,%ymm7,%ymm8
4176 vpslld $32-25,%ymm7,%ymm7
4177 vpxor %ymm8,%ymm7,%ymm7
4178 vpsrld $25,%ymm6,%ymm8
4179 vpslld $32-25,%ymm6,%ymm6
4180 vpxor %ymm8,%ymm6,%ymm6
4181 vpsrld $25,%ymm5,%ymm8
4182 vpslld $32-25,%ymm5,%ymm5
4183 vpxor %ymm8,%ymm5,%ymm5
4184 vpsrld $25,%ymm4,%ymm8
4185 vpslld $32-25,%ymm4,%ymm4
4186 vpxor %ymm8,%ymm4,%ymm4
4187 vmovdqa 128(%rbp),%ymm8
4188 vpalignr $4,%ymm7,%ymm7,%ymm7
4189 vpalignr $8,%ymm11,%ymm11,%ymm11
4190 vpalignr $12,%ymm15,%ymm15,%ymm15
4191 vpalignr $4,%ymm6,%ymm6,%ymm6
4192 movq 8+0(%rbp),%rdx
4193 mulxq %r10,%r10,%rax
4194 addq %r10,%r14
4195 mulxq %r11,%r11,%r9
4196 adcq %r11,%r15
4197 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004198 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004199 vpalignr $8,%ymm10,%ymm10,%ymm10
4200 vpalignr $12,%ymm14,%ymm14,%ymm14
4201 vpalignr $4,%ymm5,%ymm5,%ymm5
4202 vpalignr $8,%ymm9,%ymm9,%ymm9
4203 vpalignr $12,%ymm13,%ymm13,%ymm13
4204 vpalignr $4,%ymm4,%ymm4,%ymm4
4205 vpalignr $8,%ymm8,%ymm8,%ymm8
4206 vpalignr $12,%ymm12,%ymm12,%ymm12
4207 vmovdqa %ymm8,128(%rbp)
4208 vmovdqa .rol16(%rip),%ymm8
4209 vpaddd %ymm7,%ymm3,%ymm3
4210 vpaddd %ymm6,%ymm2,%ymm2
4211 vpaddd %ymm5,%ymm1,%ymm1
4212 vpaddd %ymm4,%ymm0,%ymm0
4213 vpxor %ymm3,%ymm15,%ymm15
4214 vpxor %ymm2,%ymm14,%ymm14
4215 vpxor %ymm1,%ymm13,%ymm13
4216 vpxor %ymm0,%ymm12,%ymm12
4217 addq %rax,%r15
4218 adcq %rdx,%r9
4219 vpshufb %ymm8,%ymm15,%ymm15
4220 vpshufb %ymm8,%ymm14,%ymm14
4221 vpshufb %ymm8,%ymm13,%ymm13
4222 vpshufb %ymm8,%ymm12,%ymm12
4223 vmovdqa 128(%rbp),%ymm8
4224 vpaddd %ymm15,%ymm11,%ymm11
4225 vpaddd %ymm14,%ymm10,%ymm10
4226 vpaddd %ymm13,%ymm9,%ymm9
4227 vpaddd %ymm12,%ymm8,%ymm8
4228 movq %r13,%r10
4229 movq %r14,%r11
4230 movq %r15,%r12
4231 andq $3,%r12
4232 movq %r15,%r13
4233 andq $-4,%r13
4234 movq %r9,%r14
4235 shrdq $2,%r9,%r15
4236 shrq $2,%r9
4237 addq %r13,%r10
4238 adcq %r14,%r11
4239 adcq $0,%r12
4240 addq %r15,%r10
4241 adcq %r9,%r11
4242 adcq $0,%r12
4243 vpxor %ymm11,%ymm7,%ymm7
4244 vpxor %ymm10,%ymm6,%ymm6
4245 vpxor %ymm9,%ymm5,%ymm5
4246 vpxor %ymm8,%ymm4,%ymm4
4247 vmovdqa %ymm8,128(%rbp)
4248 vpsrld $20,%ymm7,%ymm8
4249 vpslld $32-20,%ymm7,%ymm7
4250 vpxor %ymm8,%ymm7,%ymm7
4251 addq 4*8(%rsi,%rcx), %r10
4252 adcq 8+4*8(%rsi,%rcx), %r11
4253 adcq $1,%r12
4254
4255 leaq 48(%rcx),%rcx
4256 vpsrld $20,%ymm6,%ymm8
4257 vpslld $32-20,%ymm6,%ymm6
4258 vpxor %ymm8,%ymm6,%ymm6
4259 vpsrld $20,%ymm5,%ymm8
4260 vpslld $32-20,%ymm5,%ymm5
4261 vpxor %ymm8,%ymm5,%ymm5
4262 vpsrld $20,%ymm4,%ymm8
4263 vpslld $32-20,%ymm4,%ymm4
4264 vpxor %ymm8,%ymm4,%ymm4
4265 vmovdqa .rol8(%rip),%ymm8
4266 vpaddd %ymm7,%ymm3,%ymm3
4267 vpaddd %ymm6,%ymm2,%ymm2
4268 vpaddd %ymm5,%ymm1,%ymm1
4269 vpaddd %ymm4,%ymm0,%ymm0
4270 vpxor %ymm3,%ymm15,%ymm15
4271 vpxor %ymm2,%ymm14,%ymm14
4272 vpxor %ymm1,%ymm13,%ymm13
4273 vpxor %ymm0,%ymm12,%ymm12
4274 movq 0+0(%rbp),%rdx
4275 movq %rdx,%r15
4276 mulxq %r10,%r13,%r14
4277 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004278 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004279 addq %rax,%r14
4280 adcq %rdx,%r15
4281 vpshufb %ymm8,%ymm15,%ymm15
4282 vpshufb %ymm8,%ymm14,%ymm14
4283 vpshufb %ymm8,%ymm13,%ymm13
4284 vpshufb %ymm8,%ymm12,%ymm12
4285 vmovdqa 128(%rbp),%ymm8
4286 vpaddd %ymm15,%ymm11,%ymm11
4287 vpaddd %ymm14,%ymm10,%ymm10
4288 vpaddd %ymm13,%ymm9,%ymm9
4289 movq 8+0(%rbp),%rdx
4290 mulxq %r10,%r10,%rax
4291 addq %r10,%r14
4292 mulxq %r11,%r11,%r9
4293 adcq %r11,%r15
4294 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004295 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004296 vpaddd %ymm12,%ymm8,%ymm8
4297 vpxor %ymm11,%ymm7,%ymm7
4298 vpxor %ymm10,%ymm6,%ymm6
4299 vpxor %ymm9,%ymm5,%ymm5
4300 vpxor %ymm8,%ymm4,%ymm4
4301 vmovdqa %ymm8,128(%rbp)
4302 vpsrld $25,%ymm7,%ymm8
4303 vpslld $32-25,%ymm7,%ymm7
4304 addq %rax,%r15
4305 adcq %rdx,%r9
4306 vpxor %ymm8,%ymm7,%ymm7
4307 vpsrld $25,%ymm6,%ymm8
4308 vpslld $32-25,%ymm6,%ymm6
4309 vpxor %ymm8,%ymm6,%ymm6
4310 vpsrld $25,%ymm5,%ymm8
4311 vpslld $32-25,%ymm5,%ymm5
4312 vpxor %ymm8,%ymm5,%ymm5
4313 vpsrld $25,%ymm4,%ymm8
4314 vpslld $32-25,%ymm4,%ymm4
4315 vpxor %ymm8,%ymm4,%ymm4
4316 vmovdqa 128(%rbp),%ymm8
4317 vpalignr $12,%ymm7,%ymm7,%ymm7
4318 vpalignr $8,%ymm11,%ymm11,%ymm11
4319 vpalignr $4,%ymm15,%ymm15,%ymm15
4320 vpalignr $12,%ymm6,%ymm6,%ymm6
4321 vpalignr $8,%ymm10,%ymm10,%ymm10
4322 vpalignr $4,%ymm14,%ymm14,%ymm14
4323 vpalignr $12,%ymm5,%ymm5,%ymm5
4324 movq %r13,%r10
4325 movq %r14,%r11
4326 movq %r15,%r12
4327 andq $3,%r12
4328 movq %r15,%r13
4329 andq $-4,%r13
4330 movq %r9,%r14
4331 shrdq $2,%r9,%r15
4332 shrq $2,%r9
4333 addq %r13,%r10
4334 adcq %r14,%r11
4335 adcq $0,%r12
4336 addq %r15,%r10
4337 adcq %r9,%r11
4338 adcq $0,%r12
4339 vpalignr $8,%ymm9,%ymm9,%ymm9
4340 vpalignr $4,%ymm13,%ymm13,%ymm13
4341 vpalignr $12,%ymm4,%ymm4,%ymm4
4342 vpalignr $8,%ymm8,%ymm8,%ymm8
4343 vpalignr $4,%ymm12,%ymm12,%ymm12
4344
4345 cmpq $60*8,%rcx
4346 jne 2b
4347 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
4348 vpaddd 64(%rbp),%ymm7,%ymm7
4349 vpaddd 96(%rbp),%ymm11,%ymm11
4350 vpaddd 256(%rbp),%ymm15,%ymm15
4351 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
4352 vpaddd 64(%rbp),%ymm6,%ymm6
4353 vpaddd 96(%rbp),%ymm10,%ymm10
4354 vpaddd 224(%rbp),%ymm14,%ymm14
4355 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
4356 vpaddd 64(%rbp),%ymm5,%ymm5
4357 vpaddd 96(%rbp),%ymm9,%ymm9
4358 vpaddd 192(%rbp),%ymm13,%ymm13
4359 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4360 vpaddd 64(%rbp),%ymm4,%ymm4
4361 vpaddd 96(%rbp),%ymm8,%ymm8
4362 vpaddd 160(%rbp),%ymm12,%ymm12
4363
4364 vmovdqa %ymm0,128(%rbp)
4365 addq 60*8(%rsi),%r10
4366 adcq 8+60*8(%rsi),%r11
4367 adcq $1,%r12
4368 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
4369 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
4370 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
4371 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
4372 vpxor 0+0(%rsi),%ymm0,%ymm0
4373 vpxor 32+0(%rsi),%ymm3,%ymm3
4374 vpxor 64+0(%rsi),%ymm7,%ymm7
4375 vpxor 96+0(%rsi),%ymm11,%ymm11
4376 vmovdqu %ymm0,0+0(%rdi)
4377 vmovdqu %ymm3,32+0(%rdi)
4378 vmovdqu %ymm7,64+0(%rdi)
4379 vmovdqu %ymm11,96+0(%rdi)
4380
4381 vmovdqa 128(%rbp),%ymm0
4382 movq 0+0(%rbp),%rax
4383 movq %rax,%r15
4384 mulq %r10
4385 movq %rax,%r13
4386 movq %rdx,%r14
4387 movq 0+0(%rbp),%rax
4388 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004389 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004390 addq %rax,%r14
4391 adcq %rdx,%r15
4392 movq 8+0(%rbp),%rax
4393 movq %rax,%r9
4394 mulq %r10
4395 addq %rax,%r14
4396 adcq $0,%rdx
4397 movq %rdx,%r10
4398 movq 8+0(%rbp),%rax
4399 mulq %r11
4400 addq %rax,%r15
4401 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004402 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004403 addq %r10,%r15
4404 adcq %rdx,%r9
4405 movq %r13,%r10
4406 movq %r14,%r11
4407 movq %r15,%r12
4408 andq $3,%r12
4409 movq %r15,%r13
4410 andq $-4,%r13
4411 movq %r9,%r14
4412 shrdq $2,%r9,%r15
4413 shrq $2,%r9
4414 addq %r13,%r10
4415 adcq %r14,%r11
4416 adcq $0,%r12
4417 addq %r15,%r10
4418 adcq %r9,%r11
4419 adcq $0,%r12
4420 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
4421 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
4422 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
4423 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
4424 vpxor 0+128(%rsi),%ymm3,%ymm3
4425 vpxor 32+128(%rsi),%ymm2,%ymm2
4426 vpxor 64+128(%rsi),%ymm6,%ymm6
4427 vpxor 96+128(%rsi),%ymm10,%ymm10
4428 vmovdqu %ymm3,0+128(%rdi)
4429 vmovdqu %ymm2,32+128(%rdi)
4430 vmovdqu %ymm6,64+128(%rdi)
4431 vmovdqu %ymm10,96+128(%rdi)
4432 addq 60*8+16(%rsi),%r10
4433 adcq 8+60*8+16(%rsi),%r11
4434 adcq $1,%r12
4435 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
4436 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
4437 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
4438 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
4439 vpxor 0+256(%rsi),%ymm3,%ymm3
4440 vpxor 32+256(%rsi),%ymm1,%ymm1
4441 vpxor 64+256(%rsi),%ymm5,%ymm5
4442 vpxor 96+256(%rsi),%ymm9,%ymm9
4443 vmovdqu %ymm3,0+256(%rdi)
4444 vmovdqu %ymm1,32+256(%rdi)
4445 vmovdqu %ymm5,64+256(%rdi)
4446 vmovdqu %ymm9,96+256(%rdi)
4447 movq 0+0(%rbp),%rax
4448 movq %rax,%r15
4449 mulq %r10
4450 movq %rax,%r13
4451 movq %rdx,%r14
4452 movq 0+0(%rbp),%rax
4453 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004454 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004455 addq %rax,%r14
4456 adcq %rdx,%r15
4457 movq 8+0(%rbp),%rax
4458 movq %rax,%r9
4459 mulq %r10
4460 addq %rax,%r14
4461 adcq $0,%rdx
4462 movq %rdx,%r10
4463 movq 8+0(%rbp),%rax
4464 mulq %r11
4465 addq %rax,%r15
4466 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004467 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004468 addq %r10,%r15
4469 adcq %rdx,%r9
4470 movq %r13,%r10
4471 movq %r14,%r11
4472 movq %r15,%r12
4473 andq $3,%r12
4474 movq %r15,%r13
4475 andq $-4,%r13
4476 movq %r9,%r14
4477 shrdq $2,%r9,%r15
4478 shrq $2,%r9
4479 addq %r13,%r10
4480 adcq %r14,%r11
4481 adcq $0,%r12
4482 addq %r15,%r10
4483 adcq %r9,%r11
4484 adcq $0,%r12
4485 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
4486 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
4487 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
4488 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
4489 vpxor 0+384(%rsi),%ymm3,%ymm3
4490 vpxor 32+384(%rsi),%ymm0,%ymm0
4491 vpxor 64+384(%rsi),%ymm4,%ymm4
4492 vpxor 96+384(%rsi),%ymm8,%ymm8
4493 vmovdqu %ymm3,0+384(%rdi)
4494 vmovdqu %ymm0,32+384(%rdi)
4495 vmovdqu %ymm4,64+384(%rdi)
4496 vmovdqu %ymm8,96+384(%rdi)
4497
4498 leaq 512(%rsi),%rsi
4499 leaq 512(%rdi),%rdi
4500 subq $512,%rbx
4501 jmp 1b
45023:
4503 testq %rbx,%rbx
4504 vzeroupper
4505 je open_sse_finalize
45063:
4507 cmpq $128,%rbx
4508 ja 3f
4509 vmovdqa .chacha20_consts(%rip),%ymm0
4510 vmovdqa 64(%rbp),%ymm4
4511 vmovdqa 96(%rbp),%ymm8
4512 vmovdqa .avx2_inc(%rip),%ymm12
4513 vpaddd 160(%rbp),%ymm12,%ymm12
4514 vmovdqa %ymm12,160(%rbp)
4515
4516 xorq %r8,%r8
4517 movq %rbx,%rcx
4518 andq $-16,%rcx
4519 testq %rcx,%rcx
4520 je 2f
45211:
4522 addq 0*8(%rsi,%r8), %r10
4523 adcq 8+0*8(%rsi,%r8), %r11
4524 adcq $1,%r12
4525 movq 0+0(%rbp),%rax
4526 movq %rax,%r15
4527 mulq %r10
4528 movq %rax,%r13
4529 movq %rdx,%r14
4530 movq 0+0(%rbp),%rax
4531 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004532 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004533 addq %rax,%r14
4534 adcq %rdx,%r15
4535 movq 8+0(%rbp),%rax
4536 movq %rax,%r9
4537 mulq %r10
4538 addq %rax,%r14
4539 adcq $0,%rdx
4540 movq %rdx,%r10
4541 movq 8+0(%rbp),%rax
4542 mulq %r11
4543 addq %rax,%r15
4544 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004545 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004546 addq %r10,%r15
4547 adcq %rdx,%r9
4548 movq %r13,%r10
4549 movq %r14,%r11
4550 movq %r15,%r12
4551 andq $3,%r12
4552 movq %r15,%r13
4553 andq $-4,%r13
4554 movq %r9,%r14
4555 shrdq $2,%r9,%r15
4556 shrq $2,%r9
4557 addq %r13,%r10
4558 adcq %r14,%r11
4559 adcq $0,%r12
4560 addq %r15,%r10
4561 adcq %r9,%r11
4562 adcq $0,%r12
4563
45642:
4565 addq $16,%r8
4566 vpaddd %ymm4,%ymm0,%ymm0
4567 vpxor %ymm0,%ymm12,%ymm12
4568 vpshufb .rol16(%rip),%ymm12,%ymm12
4569 vpaddd %ymm12,%ymm8,%ymm8
4570 vpxor %ymm8,%ymm4,%ymm4
4571 vpsrld $20,%ymm4,%ymm3
4572 vpslld $12,%ymm4,%ymm4
4573 vpxor %ymm3,%ymm4,%ymm4
4574 vpaddd %ymm4,%ymm0,%ymm0
4575 vpxor %ymm0,%ymm12,%ymm12
4576 vpshufb .rol8(%rip),%ymm12,%ymm12
4577 vpaddd %ymm12,%ymm8,%ymm8
4578 vpxor %ymm8,%ymm4,%ymm4
4579 vpslld $7,%ymm4,%ymm3
4580 vpsrld $25,%ymm4,%ymm4
4581 vpxor %ymm3,%ymm4,%ymm4
4582 vpalignr $12,%ymm12,%ymm12,%ymm12
4583 vpalignr $8,%ymm8,%ymm8,%ymm8
4584 vpalignr $4,%ymm4,%ymm4,%ymm4
4585 vpaddd %ymm4,%ymm0,%ymm0
4586 vpxor %ymm0,%ymm12,%ymm12
4587 vpshufb .rol16(%rip),%ymm12,%ymm12
4588 vpaddd %ymm12,%ymm8,%ymm8
4589 vpxor %ymm8,%ymm4,%ymm4
4590 vpsrld $20,%ymm4,%ymm3
4591 vpslld $12,%ymm4,%ymm4
4592 vpxor %ymm3,%ymm4,%ymm4
4593 vpaddd %ymm4,%ymm0,%ymm0
4594 vpxor %ymm0,%ymm12,%ymm12
4595 vpshufb .rol8(%rip),%ymm12,%ymm12
4596 vpaddd %ymm12,%ymm8,%ymm8
4597 vpxor %ymm8,%ymm4,%ymm4
4598 vpslld $7,%ymm4,%ymm3
4599 vpsrld $25,%ymm4,%ymm4
4600 vpxor %ymm3,%ymm4,%ymm4
4601 vpalignr $4,%ymm12,%ymm12,%ymm12
4602 vpalignr $8,%ymm8,%ymm8,%ymm8
4603 vpalignr $12,%ymm4,%ymm4,%ymm4
4604
4605 cmpq %rcx,%r8
4606 jb 1b
4607 cmpq $160,%r8
4608 jne 2b
4609 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4610 vpaddd 64(%rbp),%ymm4,%ymm4
4611 vpaddd 96(%rbp),%ymm8,%ymm8
4612 vpaddd 160(%rbp),%ymm12,%ymm12
4613 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
4614 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
4615 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
4616 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
4617 vmovdqa %ymm3,%ymm8
4618
4619 jmp open_avx2_tail_loop
46203:
4621 cmpq $256,%rbx
4622 ja 3f
4623 vmovdqa .chacha20_consts(%rip),%ymm0
4624 vmovdqa 64(%rbp),%ymm4
4625 vmovdqa 96(%rbp),%ymm8
4626 vmovdqa %ymm0,%ymm1
4627 vmovdqa %ymm4,%ymm5
4628 vmovdqa %ymm8,%ymm9
4629 vmovdqa .avx2_inc(%rip),%ymm12
4630 vpaddd 160(%rbp),%ymm12,%ymm13
4631 vpaddd %ymm13,%ymm12,%ymm12
4632 vmovdqa %ymm12,160(%rbp)
4633 vmovdqa %ymm13,192(%rbp)
4634
4635 movq %rbx,128(%rbp)
4636 movq %rbx,%rcx
4637 subq $128,%rcx
4638 shrq $4,%rcx
4639 movq $10,%r8
4640 cmpq $10,%rcx
4641 cmovgq %r8,%rcx
4642 movq %rsi,%rbx
4643 xorq %r8,%r8
46441:
4645 addq 0(%rbx),%r10
4646 adcq 8+0(%rbx),%r11
4647 adcq $1,%r12
4648 movq 0+0(%rbp),%rdx
4649 movq %rdx,%r15
4650 mulxq %r10,%r13,%r14
4651 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004652 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004653 addq %rax,%r14
4654 adcq %rdx,%r15
4655 movq 8+0(%rbp),%rdx
4656 mulxq %r10,%r10,%rax
4657 addq %r10,%r14
4658 mulxq %r11,%r11,%r9
4659 adcq %r11,%r15
4660 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004661 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004662 addq %rax,%r15
4663 adcq %rdx,%r9
4664 movq %r13,%r10
4665 movq %r14,%r11
4666 movq %r15,%r12
4667 andq $3,%r12
4668 movq %r15,%r13
4669 andq $-4,%r13
4670 movq %r9,%r14
4671 shrdq $2,%r9,%r15
4672 shrq $2,%r9
4673 addq %r13,%r10
4674 adcq %r14,%r11
4675 adcq $0,%r12
4676 addq %r15,%r10
4677 adcq %r9,%r11
4678 adcq $0,%r12
4679
4680 leaq 16(%rbx),%rbx
46812:
4682 vpaddd %ymm4,%ymm0,%ymm0
4683 vpxor %ymm0,%ymm12,%ymm12
4684 vpshufb .rol16(%rip),%ymm12,%ymm12
4685 vpaddd %ymm12,%ymm8,%ymm8
4686 vpxor %ymm8,%ymm4,%ymm4
4687 vpsrld $20,%ymm4,%ymm3
4688 vpslld $12,%ymm4,%ymm4
4689 vpxor %ymm3,%ymm4,%ymm4
4690 vpaddd %ymm4,%ymm0,%ymm0
4691 vpxor %ymm0,%ymm12,%ymm12
4692 vpshufb .rol8(%rip),%ymm12,%ymm12
4693 vpaddd %ymm12,%ymm8,%ymm8
4694 vpxor %ymm8,%ymm4,%ymm4
4695 vpslld $7,%ymm4,%ymm3
4696 vpsrld $25,%ymm4,%ymm4
4697 vpxor %ymm3,%ymm4,%ymm4
4698 vpalignr $12,%ymm12,%ymm12,%ymm12
4699 vpalignr $8,%ymm8,%ymm8,%ymm8
4700 vpalignr $4,%ymm4,%ymm4,%ymm4
4701 vpaddd %ymm5,%ymm1,%ymm1
4702 vpxor %ymm1,%ymm13,%ymm13
4703 vpshufb .rol16(%rip),%ymm13,%ymm13
4704 vpaddd %ymm13,%ymm9,%ymm9
4705 vpxor %ymm9,%ymm5,%ymm5
4706 vpsrld $20,%ymm5,%ymm3
4707 vpslld $12,%ymm5,%ymm5
4708 vpxor %ymm3,%ymm5,%ymm5
4709 vpaddd %ymm5,%ymm1,%ymm1
4710 vpxor %ymm1,%ymm13,%ymm13
4711 vpshufb .rol8(%rip),%ymm13,%ymm13
4712 vpaddd %ymm13,%ymm9,%ymm9
4713 vpxor %ymm9,%ymm5,%ymm5
4714 vpslld $7,%ymm5,%ymm3
4715 vpsrld $25,%ymm5,%ymm5
4716 vpxor %ymm3,%ymm5,%ymm5
4717 vpalignr $12,%ymm13,%ymm13,%ymm13
4718 vpalignr $8,%ymm9,%ymm9,%ymm9
4719 vpalignr $4,%ymm5,%ymm5,%ymm5
4720
4721 incq %r8
4722 vpaddd %ymm4,%ymm0,%ymm0
4723 vpxor %ymm0,%ymm12,%ymm12
4724 vpshufb .rol16(%rip),%ymm12,%ymm12
4725 vpaddd %ymm12,%ymm8,%ymm8
4726 vpxor %ymm8,%ymm4,%ymm4
4727 vpsrld $20,%ymm4,%ymm3
4728 vpslld $12,%ymm4,%ymm4
4729 vpxor %ymm3,%ymm4,%ymm4
4730 vpaddd %ymm4,%ymm0,%ymm0
4731 vpxor %ymm0,%ymm12,%ymm12
4732 vpshufb .rol8(%rip),%ymm12,%ymm12
4733 vpaddd %ymm12,%ymm8,%ymm8
4734 vpxor %ymm8,%ymm4,%ymm4
4735 vpslld $7,%ymm4,%ymm3
4736 vpsrld $25,%ymm4,%ymm4
4737 vpxor %ymm3,%ymm4,%ymm4
4738 vpalignr $4,%ymm12,%ymm12,%ymm12
4739 vpalignr $8,%ymm8,%ymm8,%ymm8
4740 vpalignr $12,%ymm4,%ymm4,%ymm4
4741 vpaddd %ymm5,%ymm1,%ymm1
4742 vpxor %ymm1,%ymm13,%ymm13
4743 vpshufb .rol16(%rip),%ymm13,%ymm13
4744 vpaddd %ymm13,%ymm9,%ymm9
4745 vpxor %ymm9,%ymm5,%ymm5
4746 vpsrld $20,%ymm5,%ymm3
4747 vpslld $12,%ymm5,%ymm5
4748 vpxor %ymm3,%ymm5,%ymm5
4749 vpaddd %ymm5,%ymm1,%ymm1
4750 vpxor %ymm1,%ymm13,%ymm13
4751 vpshufb .rol8(%rip),%ymm13,%ymm13
4752 vpaddd %ymm13,%ymm9,%ymm9
4753 vpxor %ymm9,%ymm5,%ymm5
4754 vpslld $7,%ymm5,%ymm3
4755 vpsrld $25,%ymm5,%ymm5
4756 vpxor %ymm3,%ymm5,%ymm5
4757 vpalignr $4,%ymm13,%ymm13,%ymm13
4758 vpalignr $8,%ymm9,%ymm9,%ymm9
4759 vpalignr $12,%ymm5,%ymm5,%ymm5
4760 vpaddd %ymm6,%ymm2,%ymm2
4761 vpxor %ymm2,%ymm14,%ymm14
4762 vpshufb .rol16(%rip),%ymm14,%ymm14
4763 vpaddd %ymm14,%ymm10,%ymm10
4764 vpxor %ymm10,%ymm6,%ymm6
4765 vpsrld $20,%ymm6,%ymm3
4766 vpslld $12,%ymm6,%ymm6
4767 vpxor %ymm3,%ymm6,%ymm6
4768 vpaddd %ymm6,%ymm2,%ymm2
4769 vpxor %ymm2,%ymm14,%ymm14
4770 vpshufb .rol8(%rip),%ymm14,%ymm14
4771 vpaddd %ymm14,%ymm10,%ymm10
4772 vpxor %ymm10,%ymm6,%ymm6
4773 vpslld $7,%ymm6,%ymm3
4774 vpsrld $25,%ymm6,%ymm6
4775 vpxor %ymm3,%ymm6,%ymm6
4776 vpalignr $4,%ymm14,%ymm14,%ymm14
4777 vpalignr $8,%ymm10,%ymm10,%ymm10
4778 vpalignr $12,%ymm6,%ymm6,%ymm6
4779
4780 cmpq %rcx,%r8
4781 jb 1b
4782 cmpq $10,%r8
4783 jne 2b
4784 movq %rbx,%r8
4785 subq %rsi,%rbx
4786 movq %rbx,%rcx
4787 movq 128(%rbp),%rbx
47881:
4789 addq $16,%rcx
4790 cmpq %rbx,%rcx
4791 jg 1f
4792 addq 0(%r8),%r10
4793 adcq 8+0(%r8),%r11
4794 adcq $1,%r12
4795 movq 0+0(%rbp),%rdx
4796 movq %rdx,%r15
4797 mulxq %r10,%r13,%r14
4798 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004799 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004800 addq %rax,%r14
4801 adcq %rdx,%r15
4802 movq 8+0(%rbp),%rdx
4803 mulxq %r10,%r10,%rax
4804 addq %r10,%r14
4805 mulxq %r11,%r11,%r9
4806 adcq %r11,%r15
4807 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004808 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004809 addq %rax,%r15
4810 adcq %rdx,%r9
4811 movq %r13,%r10
4812 movq %r14,%r11
4813 movq %r15,%r12
4814 andq $3,%r12
4815 movq %r15,%r13
4816 andq $-4,%r13
4817 movq %r9,%r14
4818 shrdq $2,%r9,%r15
4819 shrq $2,%r9
4820 addq %r13,%r10
4821 adcq %r14,%r11
4822 adcq $0,%r12
4823 addq %r15,%r10
4824 adcq %r9,%r11
4825 adcq $0,%r12
4826
4827 leaq 16(%r8),%r8
4828 jmp 1b
48291:
4830 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
4831 vpaddd 64(%rbp),%ymm5,%ymm5
4832 vpaddd 96(%rbp),%ymm9,%ymm9
4833 vpaddd 192(%rbp),%ymm13,%ymm13
4834 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4835 vpaddd 64(%rbp),%ymm4,%ymm4
4836 vpaddd 96(%rbp),%ymm8,%ymm8
4837 vpaddd 160(%rbp),%ymm12,%ymm12
4838 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
4839 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
4840 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
4841 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
4842 vpxor 0+0(%rsi),%ymm3,%ymm3
4843 vpxor 32+0(%rsi),%ymm1,%ymm1
4844 vpxor 64+0(%rsi),%ymm5,%ymm5
4845 vpxor 96+0(%rsi),%ymm9,%ymm9
4846 vmovdqu %ymm3,0+0(%rdi)
4847 vmovdqu %ymm1,32+0(%rdi)
4848 vmovdqu %ymm5,64+0(%rdi)
4849 vmovdqu %ymm9,96+0(%rdi)
4850 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
4851 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
4852 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
4853 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
4854 vmovdqa %ymm3,%ymm8
4855
4856 leaq 128(%rsi),%rsi
4857 leaq 128(%rdi),%rdi
4858 subq $128,%rbx
4859 jmp open_avx2_tail_loop
48603:
4861 cmpq $384,%rbx
4862 ja 3f
4863 vmovdqa .chacha20_consts(%rip),%ymm0
4864 vmovdqa 64(%rbp),%ymm4
4865 vmovdqa 96(%rbp),%ymm8
4866 vmovdqa %ymm0,%ymm1
4867 vmovdqa %ymm4,%ymm5
4868 vmovdqa %ymm8,%ymm9
4869 vmovdqa %ymm0,%ymm2
4870 vmovdqa %ymm4,%ymm6
4871 vmovdqa %ymm8,%ymm10
4872 vmovdqa .avx2_inc(%rip),%ymm12
4873 vpaddd 160(%rbp),%ymm12,%ymm14
4874 vpaddd %ymm14,%ymm12,%ymm13
4875 vpaddd %ymm13,%ymm12,%ymm12
4876 vmovdqa %ymm12,160(%rbp)
4877 vmovdqa %ymm13,192(%rbp)
4878 vmovdqa %ymm14,224(%rbp)
4879
4880 movq %rbx,128(%rbp)
4881 movq %rbx,%rcx
4882 subq $256,%rcx
4883 shrq $4,%rcx
4884 addq $6,%rcx
4885 movq $10,%r8
4886 cmpq $10,%rcx
4887 cmovgq %r8,%rcx
4888 movq %rsi,%rbx
4889 xorq %r8,%r8
48901:
4891 addq 0(%rbx),%r10
4892 adcq 8+0(%rbx),%r11
4893 adcq $1,%r12
4894 movq 0+0(%rbp),%rdx
4895 movq %rdx,%r15
4896 mulxq %r10,%r13,%r14
4897 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004898 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004899 addq %rax,%r14
4900 adcq %rdx,%r15
4901 movq 8+0(%rbp),%rdx
4902 mulxq %r10,%r10,%rax
4903 addq %r10,%r14
4904 mulxq %r11,%r11,%r9
4905 adcq %r11,%r15
4906 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004907 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004908 addq %rax,%r15
4909 adcq %rdx,%r9
4910 movq %r13,%r10
4911 movq %r14,%r11
4912 movq %r15,%r12
4913 andq $3,%r12
4914 movq %r15,%r13
4915 andq $-4,%r13
4916 movq %r9,%r14
4917 shrdq $2,%r9,%r15
4918 shrq $2,%r9
4919 addq %r13,%r10
4920 adcq %r14,%r11
4921 adcq $0,%r12
4922 addq %r15,%r10
4923 adcq %r9,%r11
4924 adcq $0,%r12
4925
4926 leaq 16(%rbx),%rbx
49272:
4928 vpaddd %ymm6,%ymm2,%ymm2
4929 vpxor %ymm2,%ymm14,%ymm14
4930 vpshufb .rol16(%rip),%ymm14,%ymm14
4931 vpaddd %ymm14,%ymm10,%ymm10
4932 vpxor %ymm10,%ymm6,%ymm6
4933 vpsrld $20,%ymm6,%ymm3
4934 vpslld $12,%ymm6,%ymm6
4935 vpxor %ymm3,%ymm6,%ymm6
4936 vpaddd %ymm6,%ymm2,%ymm2
4937 vpxor %ymm2,%ymm14,%ymm14
4938 vpshufb .rol8(%rip),%ymm14,%ymm14
4939 vpaddd %ymm14,%ymm10,%ymm10
4940 vpxor %ymm10,%ymm6,%ymm6
4941 vpslld $7,%ymm6,%ymm3
4942 vpsrld $25,%ymm6,%ymm6
4943 vpxor %ymm3,%ymm6,%ymm6
4944 vpalignr $12,%ymm14,%ymm14,%ymm14
4945 vpalignr $8,%ymm10,%ymm10,%ymm10
4946 vpalignr $4,%ymm6,%ymm6,%ymm6
4947 vpaddd %ymm5,%ymm1,%ymm1
4948 vpxor %ymm1,%ymm13,%ymm13
4949 vpshufb .rol16(%rip),%ymm13,%ymm13
4950 vpaddd %ymm13,%ymm9,%ymm9
4951 vpxor %ymm9,%ymm5,%ymm5
4952 vpsrld $20,%ymm5,%ymm3
4953 vpslld $12,%ymm5,%ymm5
4954 vpxor %ymm3,%ymm5,%ymm5
4955 vpaddd %ymm5,%ymm1,%ymm1
4956 vpxor %ymm1,%ymm13,%ymm13
4957 vpshufb .rol8(%rip),%ymm13,%ymm13
4958 vpaddd %ymm13,%ymm9,%ymm9
4959 vpxor %ymm9,%ymm5,%ymm5
4960 vpslld $7,%ymm5,%ymm3
4961 vpsrld $25,%ymm5,%ymm5
4962 vpxor %ymm3,%ymm5,%ymm5
4963 vpalignr $12,%ymm13,%ymm13,%ymm13
4964 vpalignr $8,%ymm9,%ymm9,%ymm9
4965 vpalignr $4,%ymm5,%ymm5,%ymm5
4966 vpaddd %ymm4,%ymm0,%ymm0
4967 vpxor %ymm0,%ymm12,%ymm12
4968 vpshufb .rol16(%rip),%ymm12,%ymm12
4969 vpaddd %ymm12,%ymm8,%ymm8
4970 vpxor %ymm8,%ymm4,%ymm4
4971 vpsrld $20,%ymm4,%ymm3
4972 vpslld $12,%ymm4,%ymm4
4973 vpxor %ymm3,%ymm4,%ymm4
4974 vpaddd %ymm4,%ymm0,%ymm0
4975 vpxor %ymm0,%ymm12,%ymm12
4976 vpshufb .rol8(%rip),%ymm12,%ymm12
4977 vpaddd %ymm12,%ymm8,%ymm8
4978 vpxor %ymm8,%ymm4,%ymm4
4979 vpslld $7,%ymm4,%ymm3
4980 vpsrld $25,%ymm4,%ymm4
4981 vpxor %ymm3,%ymm4,%ymm4
4982 vpalignr $12,%ymm12,%ymm12,%ymm12
4983 vpalignr $8,%ymm8,%ymm8,%ymm8
4984 vpalignr $4,%ymm4,%ymm4,%ymm4
4985 addq 0(%rbx),%r10
4986 adcq 8+0(%rbx),%r11
4987 adcq $1,%r12
4988 movq 0+0(%rbp),%rax
4989 movq %rax,%r15
4990 mulq %r10
4991 movq %rax,%r13
4992 movq %rdx,%r14
4993 movq 0+0(%rbp),%rax
4994 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004995 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004996 addq %rax,%r14
4997 adcq %rdx,%r15
4998 movq 8+0(%rbp),%rax
4999 movq %rax,%r9
5000 mulq %r10
5001 addq %rax,%r14
5002 adcq $0,%rdx
5003 movq %rdx,%r10
5004 movq 8+0(%rbp),%rax
5005 mulq %r11
5006 addq %rax,%r15
5007 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005008 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005009 addq %r10,%r15
5010 adcq %rdx,%r9
5011 movq %r13,%r10
5012 movq %r14,%r11
5013 movq %r15,%r12
5014 andq $3,%r12
5015 movq %r15,%r13
5016 andq $-4,%r13
5017 movq %r9,%r14
5018 shrdq $2,%r9,%r15
5019 shrq $2,%r9
5020 addq %r13,%r10
5021 adcq %r14,%r11
5022 adcq $0,%r12
5023 addq %r15,%r10
5024 adcq %r9,%r11
5025 adcq $0,%r12
5026
5027 leaq 16(%rbx),%rbx
5028 incq %r8
5029 vpaddd %ymm6,%ymm2,%ymm2
5030 vpxor %ymm2,%ymm14,%ymm14
5031 vpshufb .rol16(%rip),%ymm14,%ymm14
5032 vpaddd %ymm14,%ymm10,%ymm10
5033 vpxor %ymm10,%ymm6,%ymm6
5034 vpsrld $20,%ymm6,%ymm3
5035 vpslld $12,%ymm6,%ymm6
5036 vpxor %ymm3,%ymm6,%ymm6
5037 vpaddd %ymm6,%ymm2,%ymm2
5038 vpxor %ymm2,%ymm14,%ymm14
5039 vpshufb .rol8(%rip),%ymm14,%ymm14
5040 vpaddd %ymm14,%ymm10,%ymm10
5041 vpxor %ymm10,%ymm6,%ymm6
5042 vpslld $7,%ymm6,%ymm3
5043 vpsrld $25,%ymm6,%ymm6
5044 vpxor %ymm3,%ymm6,%ymm6
5045 vpalignr $4,%ymm14,%ymm14,%ymm14
5046 vpalignr $8,%ymm10,%ymm10,%ymm10
5047 vpalignr $12,%ymm6,%ymm6,%ymm6
5048 vpaddd %ymm5,%ymm1,%ymm1
5049 vpxor %ymm1,%ymm13,%ymm13
5050 vpshufb .rol16(%rip),%ymm13,%ymm13
5051 vpaddd %ymm13,%ymm9,%ymm9
5052 vpxor %ymm9,%ymm5,%ymm5
5053 vpsrld $20,%ymm5,%ymm3
5054 vpslld $12,%ymm5,%ymm5
5055 vpxor %ymm3,%ymm5,%ymm5
5056 vpaddd %ymm5,%ymm1,%ymm1
5057 vpxor %ymm1,%ymm13,%ymm13
5058 vpshufb .rol8(%rip),%ymm13,%ymm13
5059 vpaddd %ymm13,%ymm9,%ymm9
5060 vpxor %ymm9,%ymm5,%ymm5
5061 vpslld $7,%ymm5,%ymm3
5062 vpsrld $25,%ymm5,%ymm5
5063 vpxor %ymm3,%ymm5,%ymm5
5064 vpalignr $4,%ymm13,%ymm13,%ymm13
5065 vpalignr $8,%ymm9,%ymm9,%ymm9
5066 vpalignr $12,%ymm5,%ymm5,%ymm5
5067 vpaddd %ymm4,%ymm0,%ymm0
5068 vpxor %ymm0,%ymm12,%ymm12
5069 vpshufb .rol16(%rip),%ymm12,%ymm12
5070 vpaddd %ymm12,%ymm8,%ymm8
5071 vpxor %ymm8,%ymm4,%ymm4
5072 vpsrld $20,%ymm4,%ymm3
5073 vpslld $12,%ymm4,%ymm4
5074 vpxor %ymm3,%ymm4,%ymm4
5075 vpaddd %ymm4,%ymm0,%ymm0
5076 vpxor %ymm0,%ymm12,%ymm12
5077 vpshufb .rol8(%rip),%ymm12,%ymm12
5078 vpaddd %ymm12,%ymm8,%ymm8
5079 vpxor %ymm8,%ymm4,%ymm4
5080 vpslld $7,%ymm4,%ymm3
5081 vpsrld $25,%ymm4,%ymm4
5082 vpxor %ymm3,%ymm4,%ymm4
5083 vpalignr $4,%ymm12,%ymm12,%ymm12
5084 vpalignr $8,%ymm8,%ymm8,%ymm8
5085 vpalignr $12,%ymm4,%ymm4,%ymm4
5086
5087 cmpq %rcx,%r8
5088 jb 1b
5089 cmpq $10,%r8
5090 jne 2b
5091 movq %rbx,%r8
5092 subq %rsi,%rbx
5093 movq %rbx,%rcx
5094 movq 128(%rbp),%rbx
50951:
5096 addq $16,%rcx
5097 cmpq %rbx,%rcx
5098 jg 1f
5099 addq 0(%r8),%r10
5100 adcq 8+0(%r8),%r11
5101 adcq $1,%r12
5102 movq 0+0(%rbp),%rdx
5103 movq %rdx,%r15
5104 mulxq %r10,%r13,%r14
5105 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005106 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005107 addq %rax,%r14
5108 adcq %rdx,%r15
5109 movq 8+0(%rbp),%rdx
5110 mulxq %r10,%r10,%rax
5111 addq %r10,%r14
5112 mulxq %r11,%r11,%r9
5113 adcq %r11,%r15
5114 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005115 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005116 addq %rax,%r15
5117 adcq %rdx,%r9
5118 movq %r13,%r10
5119 movq %r14,%r11
5120 movq %r15,%r12
5121 andq $3,%r12
5122 movq %r15,%r13
5123 andq $-4,%r13
5124 movq %r9,%r14
5125 shrdq $2,%r9,%r15
5126 shrq $2,%r9
5127 addq %r13,%r10
5128 adcq %r14,%r11
5129 adcq $0,%r12
5130 addq %r15,%r10
5131 adcq %r9,%r11
5132 adcq $0,%r12
5133
5134 leaq 16(%r8),%r8
5135 jmp 1b
51361:
5137 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
5138 vpaddd 64(%rbp),%ymm6,%ymm6
5139 vpaddd 96(%rbp),%ymm10,%ymm10
5140 vpaddd 224(%rbp),%ymm14,%ymm14
5141 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
5142 vpaddd 64(%rbp),%ymm5,%ymm5
5143 vpaddd 96(%rbp),%ymm9,%ymm9
5144 vpaddd 192(%rbp),%ymm13,%ymm13
5145 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
5146 vpaddd 64(%rbp),%ymm4,%ymm4
5147 vpaddd 96(%rbp),%ymm8,%ymm8
5148 vpaddd 160(%rbp),%ymm12,%ymm12
5149 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
5150 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
5151 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
5152 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
5153 vpxor 0+0(%rsi),%ymm3,%ymm3
5154 vpxor 32+0(%rsi),%ymm2,%ymm2
5155 vpxor 64+0(%rsi),%ymm6,%ymm6
5156 vpxor 96+0(%rsi),%ymm10,%ymm10
5157 vmovdqu %ymm3,0+0(%rdi)
5158 vmovdqu %ymm2,32+0(%rdi)
5159 vmovdqu %ymm6,64+0(%rdi)
5160 vmovdqu %ymm10,96+0(%rdi)
5161 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
5162 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
5163 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
5164 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
5165 vpxor 0+128(%rsi),%ymm3,%ymm3
5166 vpxor 32+128(%rsi),%ymm1,%ymm1
5167 vpxor 64+128(%rsi),%ymm5,%ymm5
5168 vpxor 96+128(%rsi),%ymm9,%ymm9
5169 vmovdqu %ymm3,0+128(%rdi)
5170 vmovdqu %ymm1,32+128(%rdi)
5171 vmovdqu %ymm5,64+128(%rdi)
5172 vmovdqu %ymm9,96+128(%rdi)
5173 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
5174 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
5175 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
5176 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
5177 vmovdqa %ymm3,%ymm8
5178
5179 leaq 256(%rsi),%rsi
5180 leaq 256(%rdi),%rdi
5181 subq $256,%rbx
5182 jmp open_avx2_tail_loop
51833:
5184 vmovdqa .chacha20_consts(%rip),%ymm0
5185 vmovdqa 64(%rbp),%ymm4
5186 vmovdqa 96(%rbp),%ymm8
5187 vmovdqa %ymm0,%ymm1
5188 vmovdqa %ymm4,%ymm5
5189 vmovdqa %ymm8,%ymm9
5190 vmovdqa %ymm0,%ymm2
5191 vmovdqa %ymm4,%ymm6
5192 vmovdqa %ymm8,%ymm10
5193 vmovdqa %ymm0,%ymm3
5194 vmovdqa %ymm4,%ymm7
5195 vmovdqa %ymm8,%ymm11
5196 vmovdqa .avx2_inc(%rip),%ymm12
5197 vpaddd 160(%rbp),%ymm12,%ymm15
5198 vpaddd %ymm15,%ymm12,%ymm14
5199 vpaddd %ymm14,%ymm12,%ymm13
5200 vpaddd %ymm13,%ymm12,%ymm12
5201 vmovdqa %ymm15,256(%rbp)
5202 vmovdqa %ymm14,224(%rbp)
5203 vmovdqa %ymm13,192(%rbp)
5204 vmovdqa %ymm12,160(%rbp)
5205
5206 xorq %rcx,%rcx
5207 movq %rsi,%r8
52081:
5209 addq 0(%r8),%r10
5210 adcq 8+0(%r8),%r11
5211 adcq $1,%r12
5212 movq 0+0(%rbp),%rax
5213 movq %rax,%r15
5214 mulq %r10
5215 movq %rax,%r13
5216 movq %rdx,%r14
5217 movq 0+0(%rbp),%rax
5218 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005219 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005220 addq %rax,%r14
5221 adcq %rdx,%r15
5222 movq 8+0(%rbp),%rax
5223 movq %rax,%r9
5224 mulq %r10
5225 addq %rax,%r14
5226 adcq $0,%rdx
5227 movq %rdx,%r10
5228 movq 8+0(%rbp),%rax
5229 mulq %r11
5230 addq %rax,%r15
5231 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005232 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005233 addq %r10,%r15
5234 adcq %rdx,%r9
5235 movq %r13,%r10
5236 movq %r14,%r11
5237 movq %r15,%r12
5238 andq $3,%r12
5239 movq %r15,%r13
5240 andq $-4,%r13
5241 movq %r9,%r14
5242 shrdq $2,%r9,%r15
5243 shrq $2,%r9
5244 addq %r13,%r10
5245 adcq %r14,%r11
5246 adcq $0,%r12
5247 addq %r15,%r10
5248 adcq %r9,%r11
5249 adcq $0,%r12
5250
5251 leaq 16(%r8),%r8
52522:
5253 vmovdqa %ymm8,128(%rbp)
5254 vmovdqa .rol16(%rip),%ymm8
5255 vpaddd %ymm7,%ymm3,%ymm3
5256 vpaddd %ymm6,%ymm2,%ymm2
5257 vpaddd %ymm5,%ymm1,%ymm1
5258 vpaddd %ymm4,%ymm0,%ymm0
5259 vpxor %ymm3,%ymm15,%ymm15
5260 vpxor %ymm2,%ymm14,%ymm14
5261 vpxor %ymm1,%ymm13,%ymm13
5262 vpxor %ymm0,%ymm12,%ymm12
5263 vpshufb %ymm8,%ymm15,%ymm15
5264 vpshufb %ymm8,%ymm14,%ymm14
5265 vpshufb %ymm8,%ymm13,%ymm13
5266 vpshufb %ymm8,%ymm12,%ymm12
5267 vmovdqa 128(%rbp),%ymm8
5268 vpaddd %ymm15,%ymm11,%ymm11
5269 vpaddd %ymm14,%ymm10,%ymm10
5270 vpaddd %ymm13,%ymm9,%ymm9
5271 vpaddd %ymm12,%ymm8,%ymm8
5272 vpxor %ymm11,%ymm7,%ymm7
5273 vpxor %ymm10,%ymm6,%ymm6
5274 vpxor %ymm9,%ymm5,%ymm5
5275 vpxor %ymm8,%ymm4,%ymm4
5276 vmovdqa %ymm8,128(%rbp)
5277 vpsrld $20,%ymm7,%ymm8
5278 vpslld $32-20,%ymm7,%ymm7
5279 vpxor %ymm8,%ymm7,%ymm7
5280 vpsrld $20,%ymm6,%ymm8
5281 vpslld $32-20,%ymm6,%ymm6
5282 vpxor %ymm8,%ymm6,%ymm6
5283 vpsrld $20,%ymm5,%ymm8
5284 vpslld $32-20,%ymm5,%ymm5
5285 vpxor %ymm8,%ymm5,%ymm5
5286 vpsrld $20,%ymm4,%ymm8
5287 vpslld $32-20,%ymm4,%ymm4
5288 vpxor %ymm8,%ymm4,%ymm4
5289 vmovdqa .rol8(%rip),%ymm8
5290 addq 0(%r8),%r10
5291 adcq 8+0(%r8),%r11
5292 adcq $1,%r12
5293 movq 0+0(%rbp),%rdx
5294 movq %rdx,%r15
5295 mulxq %r10,%r13,%r14
5296 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005297 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005298 addq %rax,%r14
5299 adcq %rdx,%r15
5300 movq 8+0(%rbp),%rdx
5301 mulxq %r10,%r10,%rax
5302 addq %r10,%r14
5303 mulxq %r11,%r11,%r9
5304 adcq %r11,%r15
5305 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005306 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005307 addq %rax,%r15
5308 adcq %rdx,%r9
5309 movq %r13,%r10
5310 movq %r14,%r11
5311 movq %r15,%r12
5312 andq $3,%r12
5313 movq %r15,%r13
5314 andq $-4,%r13
5315 movq %r9,%r14
5316 shrdq $2,%r9,%r15
5317 shrq $2,%r9
5318 addq %r13,%r10
5319 adcq %r14,%r11
5320 adcq $0,%r12
5321 addq %r15,%r10
5322 adcq %r9,%r11
5323 adcq $0,%r12
5324 vpaddd %ymm7,%ymm3,%ymm3
5325 vpaddd %ymm6,%ymm2,%ymm2
5326 vpaddd %ymm5,%ymm1,%ymm1
5327 vpaddd %ymm4,%ymm0,%ymm0
5328 vpxor %ymm3,%ymm15,%ymm15
5329 vpxor %ymm2,%ymm14,%ymm14
5330 vpxor %ymm1,%ymm13,%ymm13
5331 vpxor %ymm0,%ymm12,%ymm12
5332 vpshufb %ymm8,%ymm15,%ymm15
5333 vpshufb %ymm8,%ymm14,%ymm14
5334 vpshufb %ymm8,%ymm13,%ymm13
5335 vpshufb %ymm8,%ymm12,%ymm12
5336 vmovdqa 128(%rbp),%ymm8
5337 vpaddd %ymm15,%ymm11,%ymm11
5338 vpaddd %ymm14,%ymm10,%ymm10
5339 vpaddd %ymm13,%ymm9,%ymm9
5340 vpaddd %ymm12,%ymm8,%ymm8
5341 vpxor %ymm11,%ymm7,%ymm7
5342 vpxor %ymm10,%ymm6,%ymm6
5343 vpxor %ymm9,%ymm5,%ymm5
5344 vpxor %ymm8,%ymm4,%ymm4
5345 vmovdqa %ymm8,128(%rbp)
5346 vpsrld $25,%ymm7,%ymm8
5347 vpslld $32-25,%ymm7,%ymm7
5348 vpxor %ymm8,%ymm7,%ymm7
5349 vpsrld $25,%ymm6,%ymm8
5350 vpslld $32-25,%ymm6,%ymm6
5351 vpxor %ymm8,%ymm6,%ymm6
5352 vpsrld $25,%ymm5,%ymm8
5353 vpslld $32-25,%ymm5,%ymm5
5354 vpxor %ymm8,%ymm5,%ymm5
5355 vpsrld $25,%ymm4,%ymm8
5356 vpslld $32-25,%ymm4,%ymm4
5357 vpxor %ymm8,%ymm4,%ymm4
5358 vmovdqa 128(%rbp),%ymm8
5359 vpalignr $4,%ymm7,%ymm7,%ymm7
5360 vpalignr $8,%ymm11,%ymm11,%ymm11
5361 vpalignr $12,%ymm15,%ymm15,%ymm15
5362 vpalignr $4,%ymm6,%ymm6,%ymm6
5363 vpalignr $8,%ymm10,%ymm10,%ymm10
5364 vpalignr $12,%ymm14,%ymm14,%ymm14
5365 vpalignr $4,%ymm5,%ymm5,%ymm5
5366 vpalignr $8,%ymm9,%ymm9,%ymm9
5367 vpalignr $12,%ymm13,%ymm13,%ymm13
5368 vpalignr $4,%ymm4,%ymm4,%ymm4
5369 vpalignr $8,%ymm8,%ymm8,%ymm8
5370 vpalignr $12,%ymm12,%ymm12,%ymm12
5371 vmovdqa %ymm8,128(%rbp)
5372 addq 16(%r8),%r10
5373 adcq 8+16(%r8),%r11
5374 adcq $1,%r12
5375 movq 0+0(%rbp),%rdx
5376 movq %rdx,%r15
5377 mulxq %r10,%r13,%r14
5378 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005379 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005380 addq %rax,%r14
5381 adcq %rdx,%r15
5382 movq 8+0(%rbp),%rdx
5383 mulxq %r10,%r10,%rax
5384 addq %r10,%r14
5385 mulxq %r11,%r11,%r9
5386 adcq %r11,%r15
5387 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005388 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005389 addq %rax,%r15
5390 adcq %rdx,%r9
5391 movq %r13,%r10
5392 movq %r14,%r11
5393 movq %r15,%r12
5394 andq $3,%r12
5395 movq %r15,%r13
5396 andq $-4,%r13
5397 movq %r9,%r14
5398 shrdq $2,%r9,%r15
5399 shrq $2,%r9
5400 addq %r13,%r10
5401 adcq %r14,%r11
5402 adcq $0,%r12
5403 addq %r15,%r10
5404 adcq %r9,%r11
5405 adcq $0,%r12
5406
5407 leaq 32(%r8),%r8
5408 vmovdqa .rol16(%rip),%ymm8
5409 vpaddd %ymm7,%ymm3,%ymm3
5410 vpaddd %ymm6,%ymm2,%ymm2
5411 vpaddd %ymm5,%ymm1,%ymm1
5412 vpaddd %ymm4,%ymm0,%ymm0
5413 vpxor %ymm3,%ymm15,%ymm15
5414 vpxor %ymm2,%ymm14,%ymm14
5415 vpxor %ymm1,%ymm13,%ymm13
5416 vpxor %ymm0,%ymm12,%ymm12
5417 vpshufb %ymm8,%ymm15,%ymm15
5418 vpshufb %ymm8,%ymm14,%ymm14
5419 vpshufb %ymm8,%ymm13,%ymm13
5420 vpshufb %ymm8,%ymm12,%ymm12
5421 vmovdqa 128(%rbp),%ymm8
5422 vpaddd %ymm15,%ymm11,%ymm11
5423 vpaddd %ymm14,%ymm10,%ymm10
5424 vpaddd %ymm13,%ymm9,%ymm9
5425 vpaddd %ymm12,%ymm8,%ymm8
5426 vpxor %ymm11,%ymm7,%ymm7
5427 vpxor %ymm10,%ymm6,%ymm6
5428 vpxor %ymm9,%ymm5,%ymm5
5429 vpxor %ymm8,%ymm4,%ymm4
5430 vmovdqa %ymm8,128(%rbp)
5431 vpsrld $20,%ymm7,%ymm8
5432 vpslld $32-20,%ymm7,%ymm7
5433 vpxor %ymm8,%ymm7,%ymm7
5434 vpsrld $20,%ymm6,%ymm8
5435 vpslld $32-20,%ymm6,%ymm6
5436 vpxor %ymm8,%ymm6,%ymm6
5437 vpsrld $20,%ymm5,%ymm8
5438 vpslld $32-20,%ymm5,%ymm5
5439 vpxor %ymm8,%ymm5,%ymm5
5440 vpsrld $20,%ymm4,%ymm8
5441 vpslld $32-20,%ymm4,%ymm4
5442 vpxor %ymm8,%ymm4,%ymm4
5443 vmovdqa .rol8(%rip),%ymm8
5444 vpaddd %ymm7,%ymm3,%ymm3
5445 vpaddd %ymm6,%ymm2,%ymm2
5446 vpaddd %ymm5,%ymm1,%ymm1
5447 vpaddd %ymm4,%ymm0,%ymm0
5448 vpxor %ymm3,%ymm15,%ymm15
5449 vpxor %ymm2,%ymm14,%ymm14
5450 vpxor %ymm1,%ymm13,%ymm13
5451 vpxor %ymm0,%ymm12,%ymm12
5452 vpshufb %ymm8,%ymm15,%ymm15
5453 vpshufb %ymm8,%ymm14,%ymm14
5454 vpshufb %ymm8,%ymm13,%ymm13
5455 vpshufb %ymm8,%ymm12,%ymm12
5456 vmovdqa 128(%rbp),%ymm8
5457 vpaddd %ymm15,%ymm11,%ymm11
5458 vpaddd %ymm14,%ymm10,%ymm10
5459 vpaddd %ymm13,%ymm9,%ymm9
5460 vpaddd %ymm12,%ymm8,%ymm8
5461 vpxor %ymm11,%ymm7,%ymm7
5462 vpxor %ymm10,%ymm6,%ymm6
5463 vpxor %ymm9,%ymm5,%ymm5
5464 vpxor %ymm8,%ymm4,%ymm4
5465 vmovdqa %ymm8,128(%rbp)
5466 vpsrld $25,%ymm7,%ymm8
5467 vpslld $32-25,%ymm7,%ymm7
5468 vpxor %ymm8,%ymm7,%ymm7
5469 vpsrld $25,%ymm6,%ymm8
5470 vpslld $32-25,%ymm6,%ymm6
5471 vpxor %ymm8,%ymm6,%ymm6
5472 vpsrld $25,%ymm5,%ymm8
5473 vpslld $32-25,%ymm5,%ymm5
5474 vpxor %ymm8,%ymm5,%ymm5
5475 vpsrld $25,%ymm4,%ymm8
5476 vpslld $32-25,%ymm4,%ymm4
5477 vpxor %ymm8,%ymm4,%ymm4
5478 vmovdqa 128(%rbp),%ymm8
5479 vpalignr $12,%ymm7,%ymm7,%ymm7
5480 vpalignr $8,%ymm11,%ymm11,%ymm11
5481 vpalignr $4,%ymm15,%ymm15,%ymm15
5482 vpalignr $12,%ymm6,%ymm6,%ymm6
5483 vpalignr $8,%ymm10,%ymm10,%ymm10
5484 vpalignr $4,%ymm14,%ymm14,%ymm14
5485 vpalignr $12,%ymm5,%ymm5,%ymm5
5486 vpalignr $8,%ymm9,%ymm9,%ymm9
5487 vpalignr $4,%ymm13,%ymm13,%ymm13
5488 vpalignr $12,%ymm4,%ymm4,%ymm4
5489 vpalignr $8,%ymm8,%ymm8,%ymm8
5490 vpalignr $4,%ymm12,%ymm12,%ymm12
5491
5492 incq %rcx
5493 cmpq $4,%rcx
5494 jl 1b
5495 cmpq $10,%rcx
5496 jne 2b
5497 movq %rbx,%rcx
5498 subq $384,%rcx
5499 andq $-16,%rcx
55001:
5501 testq %rcx,%rcx
5502 je 1f
5503 addq 0(%r8),%r10
5504 adcq 8+0(%r8),%r11
5505 adcq $1,%r12
5506 movq 0+0(%rbp),%rdx
5507 movq %rdx,%r15
5508 mulxq %r10,%r13,%r14
5509 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005510 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005511 addq %rax,%r14
5512 adcq %rdx,%r15
5513 movq 8+0(%rbp),%rdx
5514 mulxq %r10,%r10,%rax
5515 addq %r10,%r14
5516 mulxq %r11,%r11,%r9
5517 adcq %r11,%r15
5518 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005519 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005520 addq %rax,%r15
5521 adcq %rdx,%r9
5522 movq %r13,%r10
5523 movq %r14,%r11
5524 movq %r15,%r12
5525 andq $3,%r12
5526 movq %r15,%r13
5527 andq $-4,%r13
5528 movq %r9,%r14
5529 shrdq $2,%r9,%r15
5530 shrq $2,%r9
5531 addq %r13,%r10
5532 adcq %r14,%r11
5533 adcq $0,%r12
5534 addq %r15,%r10
5535 adcq %r9,%r11
5536 adcq $0,%r12
5537
5538 leaq 16(%r8),%r8
5539 subq $16,%rcx
5540 jmp 1b
55411:
5542 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
5543 vpaddd 64(%rbp),%ymm7,%ymm7
5544 vpaddd 96(%rbp),%ymm11,%ymm11
5545 vpaddd 256(%rbp),%ymm15,%ymm15
5546 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
5547 vpaddd 64(%rbp),%ymm6,%ymm6
5548 vpaddd 96(%rbp),%ymm10,%ymm10
5549 vpaddd 224(%rbp),%ymm14,%ymm14
5550 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
5551 vpaddd 64(%rbp),%ymm5,%ymm5
5552 vpaddd 96(%rbp),%ymm9,%ymm9
5553 vpaddd 192(%rbp),%ymm13,%ymm13
5554 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
5555 vpaddd 64(%rbp),%ymm4,%ymm4
5556 vpaddd 96(%rbp),%ymm8,%ymm8
5557 vpaddd 160(%rbp),%ymm12,%ymm12
5558
5559 vmovdqa %ymm0,128(%rbp)
5560 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
5561 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
5562 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
5563 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
5564 vpxor 0+0(%rsi),%ymm0,%ymm0
5565 vpxor 32+0(%rsi),%ymm3,%ymm3
5566 vpxor 64+0(%rsi),%ymm7,%ymm7
5567 vpxor 96+0(%rsi),%ymm11,%ymm11
5568 vmovdqu %ymm0,0+0(%rdi)
5569 vmovdqu %ymm3,32+0(%rdi)
5570 vmovdqu %ymm7,64+0(%rdi)
5571 vmovdqu %ymm11,96+0(%rdi)
5572
5573 vmovdqa 128(%rbp),%ymm0
5574 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
5575 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
5576 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
5577 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
5578 vpxor 0+128(%rsi),%ymm3,%ymm3
5579 vpxor 32+128(%rsi),%ymm2,%ymm2
5580 vpxor 64+128(%rsi),%ymm6,%ymm6
5581 vpxor 96+128(%rsi),%ymm10,%ymm10
5582 vmovdqu %ymm3,0+128(%rdi)
5583 vmovdqu %ymm2,32+128(%rdi)
5584 vmovdqu %ymm6,64+128(%rdi)
5585 vmovdqu %ymm10,96+128(%rdi)
5586 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
5587 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
5588 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
5589 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
5590 vpxor 0+256(%rsi),%ymm3,%ymm3
5591 vpxor 32+256(%rsi),%ymm1,%ymm1
5592 vpxor 64+256(%rsi),%ymm5,%ymm5
5593 vpxor 96+256(%rsi),%ymm9,%ymm9
5594 vmovdqu %ymm3,0+256(%rdi)
5595 vmovdqu %ymm1,32+256(%rdi)
5596 vmovdqu %ymm5,64+256(%rdi)
5597 vmovdqu %ymm9,96+256(%rdi)
5598 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
5599 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
5600 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
5601 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
5602 vmovdqa %ymm3,%ymm8
5603
5604 leaq 384(%rsi),%rsi
5605 leaq 384(%rdi),%rdi
5606 subq $384,%rbx
/*
 * open_avx2_tail_loop: process the remaining input 32 bytes at a time.
 *   rbx = number of bytes still to process
 *   rsi = read pointer, rdi = write pointer
 *   ymm0, ymm4, ymm8, ymm12 = queue of 32-byte XOR masks, ymm0 is used first
 *   (presumably ChaCha20 keystream prepared by the code that jumps or falls
 *   through to this label - confirm against the callers).
 * Leaves the loop for open_avx2_tail once fewer than 32 bytes remain.
 */
5607open_avx2_tail_loop:
5608 cmpq $32,%rbx
5609 jb open_avx2_tail
5610 subq $32,%rbx
/* output[0..31] = input[0..31] XOR ymm0 */
5611 vpxor (%rsi),%ymm0,%ymm0
5612 vmovdqu %ymm0,(%rdi)
5613 leaq 32(%rsi),%rsi
5614 leaq 32(%rdi),%rdi
/* Shift the mask queue down by one register: ymm4 to ymm0, ymm8 to ymm4, ymm12 to ymm8. */
5615 vmovdqa %ymm4,%ymm0
5616 vmovdqa %ymm8,%ymm4
5617 vmovdqa %ymm12,%ymm8
5618 jmp open_avx2_tail_loop
/*
 * open_avx2_tail: fewer than 32 bytes remain (rbx < 32), ymm0 holds the next
 * 32 bytes of XOR mask. Handles one optional 16-byte chunk, then hands the
 * rest to open_sse_tail_16 with the next 16 mask bytes in xmm1.
 * open_sse_tail_16 is defined outside this view; presumably it finishes the
 * last rbx (< 16) bytes using xmm1 - confirm its register contract.
 */
5619open_avx2_tail:
5620 cmpq $16,%rbx
/* Default hand-off value: low 128 bits of ymm0. vmovdqa leaves the flags */
/* from the cmpq above untouched, so the jb below still tests rbx < 16. */
5621 vmovdqa %xmm0,%xmm1
5622 jb 1f
5623 subq $16,%rbx
5624
/* output[0..15] = input[0..15] XOR low lane of ymm0 */
5625 vpxor (%rsi),%xmm0,%xmm1
5626 vmovdqu %xmm1,(%rdi)
5627 leaq 16(%rsi),%rsi
5628 leaq 16(%rdi),%rdi
/* imm 0x11 copies the high 128-bit lane of ymm0 into both lanes, so the */
/* unused upper half of the mask becomes the new xmm0, then xmm1. */
5629 vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
5630 vmovdqa %xmm0,%xmm1
56311:
/* Clear the upper halves of the ymm registers before jumping to the SSE-named tail routine. */
5632 vzeroupper
5633 jmp open_sse_tail_16
5634
/*
 * open_avx2_192: short-input path that runs two ChaCha20 states side by side
 * and then falls through into open_avx2_short.
 * Presumably taken for inputs of at most 192 bytes; the dispatch is outside
 * this view - confirm.
 *
 * On entry ymm0, ymm4, ymm8, ymm12 are expected to hold state rows 0..3
 * (set up before this view - confirm). Each ymm register carries one row
 * for two blocks, one block per 128-bit lane.
 *
 * Register use inside this block:
 *   state A rows: ymm0, ymm4, ymm8, ymm12
 *   state B rows: ymm1, ymm5, ymm9, ymm13 (row 3 = A row 3 + .avx2_inc)
 *   saved initial rows: ymm2 (row 0), ymm6 (row 1), ymm10 (row 2),
 *                       ymm11 (A row 3), ymm15 (B row 3)
 *   ymm3 = scratch for the shift-based rotations
 *   r10  = loop counter, 10 iterations
 *
 * On fall-through:
 *   0(%rbp)..31(%rbp) = rows 0 and 1 of the low-lane block of state A, with
 *                       .clamp applied (used as the Poly1305 multiplier by
 *                       the hashing code that reads 0(%rbp) and 8(%rbp))
 *   ymm0, ymm4, ymm8, ymm12, ymm1, ymm5 = 192 bytes of XOR mask in the order
 *                       consumed by open_avx2_hash_and_xor_loop
 */
5635open_avx2_192:
/* Duplicate rows 0..2 for state B and keep pristine copies for the final add. */
5636 vmovdqa %ymm0,%ymm1
5637 vmovdqa %ymm0,%ymm2
5638 vmovdqa %ymm4,%ymm5
5639 vmovdqa %ymm4,%ymm6
5640 vmovdqa %ymm8,%ymm9
5641 vmovdqa %ymm8,%ymm10
/* .avx2_inc adds 2 to dword 0 of each 128-bit lane, giving state B its own row 3. */
5642 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
5643 vmovdqa %ymm12,%ymm11
5644 vmovdqa %ymm13,%ymm15
5645 movq $10,%r10
/* Each pass below is one double round applied to both states. */
56461:
/* State A, first quarter-round set: */
/* a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12; */
/* a += b; d ^= a; d <<<= 8;  c += d; b ^= c; b <<<= 7   */
/* with a = ymm0, b = ymm4, c = ymm8, d = ymm12. */
5647 vpaddd %ymm4,%ymm0,%ymm0
5648 vpxor %ymm0,%ymm12,%ymm12
5649 vpshufb .rol16(%rip),%ymm12,%ymm12
5650 vpaddd %ymm12,%ymm8,%ymm8
5651 vpxor %ymm8,%ymm4,%ymm4
5652 vpsrld $20,%ymm4,%ymm3
5653 vpslld $12,%ymm4,%ymm4
5654 vpxor %ymm3,%ymm4,%ymm4
5655 vpaddd %ymm4,%ymm0,%ymm0
5656 vpxor %ymm0,%ymm12,%ymm12
5657 vpshufb .rol8(%rip),%ymm12,%ymm12
5658 vpaddd %ymm12,%ymm8,%ymm8
5659 vpxor %ymm8,%ymm4,%ymm4
5660 vpslld $7,%ymm4,%ymm3
5661 vpsrld $25,%ymm4,%ymm4
5662 vpxor %ymm3,%ymm4,%ymm4
/* Rotate rows d, c, b by 12, 8, 4 bytes within each lane for the second set. */
5663 vpalignr $12,%ymm12,%ymm12,%ymm12
5664 vpalignr $8,%ymm8,%ymm8,%ymm8
5665 vpalignr $4,%ymm4,%ymm4,%ymm4
/* State B, first quarter-round set (a = ymm1, b = ymm5, c = ymm9, d = ymm13). */
5666 vpaddd %ymm5,%ymm1,%ymm1
5667 vpxor %ymm1,%ymm13,%ymm13
5668 vpshufb .rol16(%rip),%ymm13,%ymm13
5669 vpaddd %ymm13,%ymm9,%ymm9
5670 vpxor %ymm9,%ymm5,%ymm5
5671 vpsrld $20,%ymm5,%ymm3
5672 vpslld $12,%ymm5,%ymm5
5673 vpxor %ymm3,%ymm5,%ymm5
5674 vpaddd %ymm5,%ymm1,%ymm1
5675 vpxor %ymm1,%ymm13,%ymm13
5676 vpshufb .rol8(%rip),%ymm13,%ymm13
5677 vpaddd %ymm13,%ymm9,%ymm9
5678 vpxor %ymm9,%ymm5,%ymm5
5679 vpslld $7,%ymm5,%ymm3
5680 vpsrld $25,%ymm5,%ymm5
5681 vpxor %ymm3,%ymm5,%ymm5
5682 vpalignr $12,%ymm13,%ymm13,%ymm13
5683 vpalignr $8,%ymm9,%ymm9,%ymm9
5684 vpalignr $4,%ymm5,%ymm5,%ymm5
/* State A, second quarter-round set on the rotated rows. */
5685 vpaddd %ymm4,%ymm0,%ymm0
5686 vpxor %ymm0,%ymm12,%ymm12
5687 vpshufb .rol16(%rip),%ymm12,%ymm12
5688 vpaddd %ymm12,%ymm8,%ymm8
5689 vpxor %ymm8,%ymm4,%ymm4
5690 vpsrld $20,%ymm4,%ymm3
5691 vpslld $12,%ymm4,%ymm4
5692 vpxor %ymm3,%ymm4,%ymm4
5693 vpaddd %ymm4,%ymm0,%ymm0
5694 vpxor %ymm0,%ymm12,%ymm12
5695 vpshufb .rol8(%rip),%ymm12,%ymm12
5696 vpaddd %ymm12,%ymm8,%ymm8
5697 vpxor %ymm8,%ymm4,%ymm4
5698 vpslld $7,%ymm4,%ymm3
5699 vpsrld $25,%ymm4,%ymm4
5700 vpxor %ymm3,%ymm4,%ymm4
/* Undo the lane rotation: d, c, b by 4, 8, 12 bytes. */
5701 vpalignr $4,%ymm12,%ymm12,%ymm12
5702 vpalignr $8,%ymm8,%ymm8,%ymm8
5703 vpalignr $12,%ymm4,%ymm4,%ymm4
/* State B, second quarter-round set, then undo its rotation. */
5704 vpaddd %ymm5,%ymm1,%ymm1
5705 vpxor %ymm1,%ymm13,%ymm13
5706 vpshufb .rol16(%rip),%ymm13,%ymm13
5707 vpaddd %ymm13,%ymm9,%ymm9
5708 vpxor %ymm9,%ymm5,%ymm5
5709 vpsrld $20,%ymm5,%ymm3
5710 vpslld $12,%ymm5,%ymm5
5711 vpxor %ymm3,%ymm5,%ymm5
5712 vpaddd %ymm5,%ymm1,%ymm1
5713 vpxor %ymm1,%ymm13,%ymm13
5714 vpshufb .rol8(%rip),%ymm13,%ymm13
5715 vpaddd %ymm13,%ymm9,%ymm9
5716 vpxor %ymm9,%ymm5,%ymm5
5717 vpslld $7,%ymm5,%ymm3
5718 vpsrld $25,%ymm5,%ymm5
5719 vpxor %ymm3,%ymm5,%ymm5
5720 vpalignr $4,%ymm13,%ymm13,%ymm13
5721 vpalignr $8,%ymm9,%ymm9,%ymm9
5722 vpalignr $12,%ymm5,%ymm5,%ymm5
5723
5724 decq %r10
5725 jne 1b
/* Add the saved initial rows back into both states. */
5726 vpaddd %ymm2,%ymm0,%ymm0
5727 vpaddd %ymm2,%ymm1,%ymm1
5728 vpaddd %ymm6,%ymm4,%ymm4
5729 vpaddd %ymm6,%ymm5,%ymm5
5730 vpaddd %ymm10,%ymm8,%ymm8
5731 vpaddd %ymm10,%ymm9,%ymm9
5732 vpaddd %ymm11,%ymm12,%ymm12
5733 vpaddd %ymm15,%ymm13,%ymm13
/* ymm3 = { low lane of ymm0, low lane of ymm4 }: rows 0 and 1 of the low-lane block of A. */
5734 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
5735
/* Mask with .clamp and store at 0(%rbp); the second 16 bytes pass through */
/* unchanged because the upper half of .clamp is all ones. */
5736 vpand .clamp(%rip),%ymm3,%ymm3
5737 vmovdqa %ymm3,0(%rbp)
5738
/* Regroup lanes into contiguous 32-byte pieces. imm 0x13 pairs the high */
/* lanes of the two sources, imm 0x02 pairs the low lanes. The low lanes of */
/* ymm8 and ymm12 (rows 2 and 3 of the block used above) are not copied out. */
5739 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
5740 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
5741 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
5742 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
5743 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
5744 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
/*
 * open_avx2_short: common continuation for the short-input paths. It is
 * reached by fall-through from open_avx2_192 and by jmp from open_avx2_320,
 * both of which have stored the clamped multiplier at 0(%rbp) and queued
 * the XOR mask in ymm registers.
 * Calls poly_hash_ad_internal, which is defined outside this view; by its
 * name it absorbs the additional data into the hash state - confirm its
 * inputs and clobbers. Execution then continues into the hash-and-xor loop.
 */
5745open_avx2_short:
/* Source and destination are the same register, so this move changes nothing. */
/* NOTE(review): looks like a generator artifact for a register-to-register argument move - confirm. */
5746 movq %r8,%r8
5747 call poly_hash_ad_internal
/*
 * open_avx2_hash_and_xor_loop: while at least 32 bytes remain, absorb the
 * next 32 input bytes into the hash as two 16-byte blocks, then XOR those
 * same 32 bytes with ymm0 and write the result.
 *   rbx = bytes remaining, rsi = read pointer, rdi = write pointer
 *   r10, r11, r12 = hash accumulator limbs (low, middle, high)
 *   0(%rbp), 8(%rbp) = the two 64-bit multiplier limbs
 *   r9, r13, r14, r15, rax, rdx = scratch for the multiply and reduction
 *   XOR mask queue, next-used first:
 *     ymm0, ymm4, ymm8, ymm12, ymm1, ymm5, ymm9, ymm13, ymm2, ymm6
 * The input is hashed before it is XORed, i.e. the hash covers the bytes
 * read from rsi, not the bytes written to rdi.
 */
5748open_avx2_hash_and_xor_loop:
5749 cmpq $32,%rbx
5750 jb open_avx2_short_tail_32
5751 subq $32,%rbx
/* First block: acc += 16 bytes at 0(%rsi), plus 1 added into the limb at weight 2^128. */
5752 addq 0(%rsi),%r10
5753 adcq 8+0(%rsi),%r11
5754 adcq $1,%r12
/* acc * multiplier, schoolbook style with mulq/imulq; product limbs end up in r13, r14, r15, r9. */
5755 movq 0+0(%rbp),%rax
5756 movq %rax,%r15
5757 mulq %r10
5758 movq %rax,%r13
5759 movq %rdx,%r14
5760 movq 0+0(%rbp),%rax
5761 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005762 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005763 addq %rax,%r14
5764 adcq %rdx,%r15
5765 movq 8+0(%rbp),%rax
5766 movq %rax,%r9
5767 mulq %r10
5768 addq %rax,%r14
5769 adcq $0,%rdx
5770 movq %rdx,%r10
5771 movq 8+0(%rbp),%rax
5772 mulq %r11
5773 addq %rax,%r15
5774 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005775 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005776 addq %r10,%r15
5777 adcq %rdx,%r9
/* Reduction: keep the low 130 bits (r13, r14, r15 & 3) as the new accumulator, */
/* then add the part above bit 130 twice: once as (hi << 2) via r15 & -4 and r9, */
/* and once as hi via the 2-bit right shift. Together that adds 5 * hi. */
5778 movq %r13,%r10
5779 movq %r14,%r11
5780 movq %r15,%r12
5781 andq $3,%r12
5782 movq %r15,%r13
5783 andq $-4,%r13
5784 movq %r9,%r14
5785 shrdq $2,%r9,%r15
5786 shrq $2,%r9
5787 addq %r13,%r10
5788 adcq %r14,%r11
5789 adcq $0,%r12
5790 addq %r15,%r10
5791 adcq %r9,%r11
5792 adcq $0,%r12
/* Second block: same update for the 16 bytes at 16(%rsi). */
5793 addq 16(%rsi),%r10
5794 adcq 8+16(%rsi),%r11
5795 adcq $1,%r12
5796 movq 0+0(%rbp),%rax
5797 movq %rax,%r15
5798 mulq %r10
5799 movq %rax,%r13
5800 movq %rdx,%r14
5801 movq 0+0(%rbp),%rax
5802 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005803 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005804 addq %rax,%r14
5805 adcq %rdx,%r15
5806 movq 8+0(%rbp),%rax
5807 movq %rax,%r9
5808 mulq %r10
5809 addq %rax,%r14
5810 adcq $0,%rdx
5811 movq %rdx,%r10
5812 movq 8+0(%rbp),%rax
5813 mulq %r11
5814 addq %rax,%r15
5815 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005816 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005817 addq %r10,%r15
5818 adcq %rdx,%r9
5819 movq %r13,%r10
5820 movq %r14,%r11
5821 movq %r15,%r12
5822 andq $3,%r12
5823 movq %r15,%r13
5824 andq $-4,%r13
5825 movq %r9,%r14
5826 shrdq $2,%r9,%r15
5827 shrq $2,%r9
5828 addq %r13,%r10
5829 adcq %r14,%r11
5830 adcq $0,%r12
5831 addq %r15,%r10
5832 adcq %r9,%r11
5833 adcq $0,%r12
5834
5835
/* output[0..31] = input[0..31] XOR ymm0, then advance both pointers. */
5836 vpxor (%rsi),%ymm0,%ymm0
5837 vmovdqu %ymm0,(%rdi)
5838 leaq 32(%rsi),%rsi
5839 leaq 32(%rdi),%rdi
5840
/* Shift the mask queue down by one 32-byte register. */
5841 vmovdqa %ymm4,%ymm0
5842 vmovdqa %ymm8,%ymm4
5843 vmovdqa %ymm12,%ymm8
5844 vmovdqa %ymm1,%ymm12
5845 vmovdqa %ymm5,%ymm1
5846 vmovdqa %ymm9,%ymm5
5847 vmovdqa %ymm13,%ymm9
5848 vmovdqa %ymm2,%ymm13
5849 vmovdqa %ymm6,%ymm2
5850 jmp open_avx2_hash_and_xor_loop
/*
 * open_avx2_short_tail_32: fewer than 32 bytes remain (rbx < 32) and ymm0
 * holds the next 32 bytes of XOR mask. If at least 16 bytes remain, hash
 * and XOR one more 16-byte block. Either way, leave the next 16 mask bytes
 * in xmm1 and jump to open_sse_tail_16.
 * open_sse_tail_16 is defined outside this view; presumably it handles the
 * final rbx (< 16) bytes - confirm its register contract.
 *   r10, r11, r12 = hash accumulator limbs; 0(%rbp), 8(%rbp) = multiplier limbs
 */
5851open_avx2_short_tail_32:
5852 cmpq $16,%rbx
/* Default hand-off value is the low lane of ymm0; vmovdqa does not modify */
/* the flags, so the jb below still tests rbx < 16. */
5853 vmovdqa %xmm0,%xmm1
5854 jb 1f
5855 subq $16,%rbx
/* acc += 16 bytes at 0(%rsi), plus 1 added into the limb at weight 2^128. */
5856 addq 0(%rsi),%r10
5857 adcq 8+0(%rsi),%r11
5858 adcq $1,%r12
/* acc * multiplier; product limbs end up in r13, r14, r15, r9. */
5859 movq 0+0(%rbp),%rax
5860 movq %rax,%r15
5861 mulq %r10
5862 movq %rax,%r13
5863 movq %rdx,%r14
5864 movq 0+0(%rbp),%rax
5865 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005866 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005867 addq %rax,%r14
5868 adcq %rdx,%r15
5869 movq 8+0(%rbp),%rax
5870 movq %rax,%r9
5871 mulq %r10
5872 addq %rax,%r14
5873 adcq $0,%rdx
5874 movq %rdx,%r10
5875 movq 8+0(%rbp),%rax
5876 mulq %r11
5877 addq %rax,%r15
5878 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005879 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005880 addq %r10,%r15
5881 adcq %rdx,%r9
/* Reduction: keep the low 130 bits and add back 5 times the part above */
/* bit 130 (added once shifted left by 2 and once unshifted). */
5882 movq %r13,%r10
5883 movq %r14,%r11
5884 movq %r15,%r12
5885 andq $3,%r12
5886 movq %r15,%r13
5887 andq $-4,%r13
5888 movq %r9,%r14
5889 shrdq $2,%r9,%r15
5890 shrq $2,%r9
5891 addq %r13,%r10
5892 adcq %r14,%r11
5893 adcq $0,%r12
5894 addq %r15,%r10
5895 adcq %r9,%r11
5896 adcq $0,%r12
5897
/* output[0..15] = input[0..15] XOR low lane of ymm0 (xmm3 is scratch). */
5898 vpxor (%rsi),%xmm0,%xmm3
5899 vmovdqu %xmm3,(%rdi)
5900 leaq 16(%rsi),%rsi
5901 leaq 16(%rdi),%rdi
/* The high lane of ymm0 becomes the mask handed on in xmm1. */
5902 vextracti128 $1,%ymm0,%xmm1
59031:
/* Clear the upper halves of the ymm registers before jumping to the SSE-named tail routine. */
5904 vzeroupper
5905 jmp open_sse_tail_16
5906
/*
 * open_avx2_320: short-input path that runs three ChaCha20 states side by
 * side and then jumps to open_avx2_short.
 * Presumably taken for inputs of at most 320 bytes; the dispatch is outside
 * this view - confirm.
 *
 * On entry ymm0, ymm4, ymm8, ymm12 are expected to hold state rows 0..3
 * (set up before this view - confirm). Each ymm register carries one row
 * for two blocks, one block per 128-bit lane.
 *
 * Register and stack use inside this block:
 *   state A rows: ymm0, ymm4, ymm8,  ymm12
 *   state B rows: ymm1, ymm5, ymm9,  ymm13 (row 3 = A row 3 + .avx2_inc)
 *   state C rows: ymm2, ymm6, ymm10, ymm14 (row 3 = B row 3 + .avx2_inc)
 *   saved initial rows: ymm7 (row 1), ymm11 (row 2); row 0 is re-added
 *     from .chacha20_consts; the three row-3 values are saved at
 *     160(%rbp), 192(%rbp), 224(%rbp)
 *   ymm3 = scratch for the shift-based rotations
 *   r10  = loop counter, 10 iterations
 *
 * On exit:
 *   0(%rbp)..31(%rbp) = rows 0 and 1 of the low-lane block of state A, with
 *                       .clamp applied
 *   ymm0, ymm4, ymm8, ymm12, ymm1, ymm5, ymm9, ymm13, ymm2, ymm6 = 320 bytes
 *                       of XOR mask in the order consumed by
 *                       open_avx2_hash_and_xor_loop
 */
5907open_avx2_320:
5908 vmovdqa %ymm0,%ymm1
5909 vmovdqa %ymm0,%ymm2
5910 vmovdqa %ymm4,%ymm5
5911 vmovdqa %ymm4,%ymm6
5912 vmovdqa %ymm8,%ymm9
5913 vmovdqa %ymm8,%ymm10
/* .avx2_inc adds 2 to dword 0 of each 128-bit lane. */
5914 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
5915 vpaddd .avx2_inc(%rip),%ymm13,%ymm14
5916 vmovdqa %ymm4,%ymm7
5917 vmovdqa %ymm8,%ymm11
5918 vmovdqa %ymm12,160(%rbp)
5919 vmovdqa %ymm13,192(%rbp)
5920 vmovdqa %ymm14,224(%rbp)
5921 movq $10,%r10
/* Each pass below is one double round applied to all three states. */
59221:
/* State A, first quarter-round set: */
/* a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12; */
/* a += b; d ^= a; d <<<= 8;  c += d; b ^= c; b <<<= 7   */
/* with a = ymm0, b = ymm4, c = ymm8, d = ymm12. */
5923 vpaddd %ymm4,%ymm0,%ymm0
5924 vpxor %ymm0,%ymm12,%ymm12
5925 vpshufb .rol16(%rip),%ymm12,%ymm12
5926 vpaddd %ymm12,%ymm8,%ymm8
5927 vpxor %ymm8,%ymm4,%ymm4
5928 vpsrld $20,%ymm4,%ymm3
5929 vpslld $12,%ymm4,%ymm4
5930 vpxor %ymm3,%ymm4,%ymm4
5931 vpaddd %ymm4,%ymm0,%ymm0
5932 vpxor %ymm0,%ymm12,%ymm12
5933 vpshufb .rol8(%rip),%ymm12,%ymm12
5934 vpaddd %ymm12,%ymm8,%ymm8
5935 vpxor %ymm8,%ymm4,%ymm4
5936 vpslld $7,%ymm4,%ymm3
5937 vpsrld $25,%ymm4,%ymm4
5938 vpxor %ymm3,%ymm4,%ymm4
/* Rotate rows d, c, b by 12, 8, 4 bytes within each lane for the second set. */
5939 vpalignr $12,%ymm12,%ymm12,%ymm12
5940 vpalignr $8,%ymm8,%ymm8,%ymm8
5941 vpalignr $4,%ymm4,%ymm4,%ymm4
/* State B, first quarter-round set (a = ymm1, b = ymm5, c = ymm9, d = ymm13). */
5942 vpaddd %ymm5,%ymm1,%ymm1
5943 vpxor %ymm1,%ymm13,%ymm13
5944 vpshufb .rol16(%rip),%ymm13,%ymm13
5945 vpaddd %ymm13,%ymm9,%ymm9
5946 vpxor %ymm9,%ymm5,%ymm5
5947 vpsrld $20,%ymm5,%ymm3
5948 vpslld $12,%ymm5,%ymm5
5949 vpxor %ymm3,%ymm5,%ymm5
5950 vpaddd %ymm5,%ymm1,%ymm1
5951 vpxor %ymm1,%ymm13,%ymm13
5952 vpshufb .rol8(%rip),%ymm13,%ymm13
5953 vpaddd %ymm13,%ymm9,%ymm9
5954 vpxor %ymm9,%ymm5,%ymm5
5955 vpslld $7,%ymm5,%ymm3
5956 vpsrld $25,%ymm5,%ymm5
5957 vpxor %ymm3,%ymm5,%ymm5
5958 vpalignr $12,%ymm13,%ymm13,%ymm13
5959 vpalignr $8,%ymm9,%ymm9,%ymm9
5960 vpalignr $4,%ymm5,%ymm5,%ymm5
/* State C, first quarter-round set (a = ymm2, b = ymm6, c = ymm10, d = ymm14). */
5961 vpaddd %ymm6,%ymm2,%ymm2
5962 vpxor %ymm2,%ymm14,%ymm14
5963 vpshufb .rol16(%rip),%ymm14,%ymm14
5964 vpaddd %ymm14,%ymm10,%ymm10
5965 vpxor %ymm10,%ymm6,%ymm6
5966 vpsrld $20,%ymm6,%ymm3
5967 vpslld $12,%ymm6,%ymm6
5968 vpxor %ymm3,%ymm6,%ymm6
5969 vpaddd %ymm6,%ymm2,%ymm2
5970 vpxor %ymm2,%ymm14,%ymm14
5971 vpshufb .rol8(%rip),%ymm14,%ymm14
5972 vpaddd %ymm14,%ymm10,%ymm10
5973 vpxor %ymm10,%ymm6,%ymm6
5974 vpslld $7,%ymm6,%ymm3
5975 vpsrld $25,%ymm6,%ymm6
5976 vpxor %ymm3,%ymm6,%ymm6
5977 vpalignr $12,%ymm14,%ymm14,%ymm14
5978 vpalignr $8,%ymm10,%ymm10,%ymm10
5979 vpalignr $4,%ymm6,%ymm6,%ymm6
/* State A, second quarter-round set on the rotated rows, then undo the rotation. */
5980 vpaddd %ymm4,%ymm0,%ymm0
5981 vpxor %ymm0,%ymm12,%ymm12
5982 vpshufb .rol16(%rip),%ymm12,%ymm12
5983 vpaddd %ymm12,%ymm8,%ymm8
5984 vpxor %ymm8,%ymm4,%ymm4
5985 vpsrld $20,%ymm4,%ymm3
5986 vpslld $12,%ymm4,%ymm4
5987 vpxor %ymm3,%ymm4,%ymm4
5988 vpaddd %ymm4,%ymm0,%ymm0
5989 vpxor %ymm0,%ymm12,%ymm12
5990 vpshufb .rol8(%rip),%ymm12,%ymm12
5991 vpaddd %ymm12,%ymm8,%ymm8
5992 vpxor %ymm8,%ymm4,%ymm4
5993 vpslld $7,%ymm4,%ymm3
5994 vpsrld $25,%ymm4,%ymm4
5995 vpxor %ymm3,%ymm4,%ymm4
5996 vpalignr $4,%ymm12,%ymm12,%ymm12
5997 vpalignr $8,%ymm8,%ymm8,%ymm8
5998 vpalignr $12,%ymm4,%ymm4,%ymm4
/* State B, second quarter-round set, then undo the rotation. */
5999 vpaddd %ymm5,%ymm1,%ymm1
6000 vpxor %ymm1,%ymm13,%ymm13
6001 vpshufb .rol16(%rip),%ymm13,%ymm13
6002 vpaddd %ymm13,%ymm9,%ymm9
6003 vpxor %ymm9,%ymm5,%ymm5
6004 vpsrld $20,%ymm5,%ymm3
6005 vpslld $12,%ymm5,%ymm5
6006 vpxor %ymm3,%ymm5,%ymm5
6007 vpaddd %ymm5,%ymm1,%ymm1
6008 vpxor %ymm1,%ymm13,%ymm13
6009 vpshufb .rol8(%rip),%ymm13,%ymm13
6010 vpaddd %ymm13,%ymm9,%ymm9
6011 vpxor %ymm9,%ymm5,%ymm5
6012 vpslld $7,%ymm5,%ymm3
6013 vpsrld $25,%ymm5,%ymm5
6014 vpxor %ymm3,%ymm5,%ymm5
6015 vpalignr $4,%ymm13,%ymm13,%ymm13
6016 vpalignr $8,%ymm9,%ymm9,%ymm9
6017 vpalignr $12,%ymm5,%ymm5,%ymm5
/* State C, second quarter-round set, then undo the rotation. */
6018 vpaddd %ymm6,%ymm2,%ymm2
6019 vpxor %ymm2,%ymm14,%ymm14
6020 vpshufb .rol16(%rip),%ymm14,%ymm14
6021 vpaddd %ymm14,%ymm10,%ymm10
6022 vpxor %ymm10,%ymm6,%ymm6
6023 vpsrld $20,%ymm6,%ymm3
6024 vpslld $12,%ymm6,%ymm6
6025 vpxor %ymm3,%ymm6,%ymm6
6026 vpaddd %ymm6,%ymm2,%ymm2
6027 vpxor %ymm2,%ymm14,%ymm14
6028 vpshufb .rol8(%rip),%ymm14,%ymm14
6029 vpaddd %ymm14,%ymm10,%ymm10
6030 vpxor %ymm10,%ymm6,%ymm6
6031 vpslld $7,%ymm6,%ymm3
6032 vpsrld $25,%ymm6,%ymm6
6033 vpxor %ymm3,%ymm6,%ymm6
6034 vpalignr $4,%ymm14,%ymm14,%ymm14
6035 vpalignr $8,%ymm10,%ymm10,%ymm10
6036 vpalignr $12,%ymm6,%ymm6,%ymm6
6037
6038 decq %r10
6039 jne 1b
/* Add the initial state back: row 0 from .chacha20_consts, rows 1 and 2 from */
/* ymm7 and ymm11, and the per-state row 3 values from the stack slots. */
6040 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6041 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6042 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6043 vpaddd %ymm7,%ymm4,%ymm4
6044 vpaddd %ymm7,%ymm5,%ymm5
6045 vpaddd %ymm7,%ymm6,%ymm6
6046 vpaddd %ymm11,%ymm8,%ymm8
6047 vpaddd %ymm11,%ymm9,%ymm9
6048 vpaddd %ymm11,%ymm10,%ymm10
6049 vpaddd 160(%rbp),%ymm12,%ymm12
6050 vpaddd 192(%rbp),%ymm13,%ymm13
6051 vpaddd 224(%rbp),%ymm14,%ymm14
/* ymm3 = { low lane of ymm0, low lane of ymm4 }: rows 0 and 1 of the low-lane block of A. */
6052 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
6053
/* Mask with .clamp and store at 0(%rbp); the second 16 bytes pass through */
/* unchanged because the upper half of .clamp is all ones. */
6054 vpand .clamp(%rip),%ymm3,%ymm3
6055 vmovdqa %ymm3,0(%rbp)
6056
/* Regroup lanes into contiguous 32-byte pieces. imm 0x13 pairs the high */
/* lanes of the two sources, imm 0x02 pairs the low lanes. The low lanes of */
/* ymm8 and ymm12 (rows 2 and 3 of the block used above) are not copied out. */
6057 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
6058 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
6059 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
6060 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
6061 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
6062 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
6063 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
6064 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
6065 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
6066 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
6067 jmp open_avx2_short
6068.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
6069
6070
6071.type chacha20_poly1305_seal_avx2,@function
6072.align 64
6073chacha20_poly1305_seal_avx2:
6074 vzeroupper
6075 vmovdqa .chacha20_consts(%rip),%ymm0
6076 vbroadcasti128 0(%r9),%ymm4
6077 vbroadcasti128 16(%r9),%ymm8
6078 vbroadcasti128 32(%r9),%ymm12
6079 vpaddd .avx2_init(%rip),%ymm12,%ymm12
6080 cmpq $192,%rbx
6081 jbe seal_avx2_192
6082 cmpq $320,%rbx
6083 jbe seal_avx2_320
6084 vmovdqa %ymm0,%ymm1
6085 vmovdqa %ymm0,%ymm2
6086 vmovdqa %ymm0,%ymm3
6087 vmovdqa %ymm4,%ymm5
6088 vmovdqa %ymm4,%ymm6
6089 vmovdqa %ymm4,%ymm7
6090 vmovdqa %ymm4,64(%rbp)
6091 vmovdqa %ymm8,%ymm9
6092 vmovdqa %ymm8,%ymm10
6093 vmovdqa %ymm8,%ymm11
6094 vmovdqa %ymm8,96(%rbp)
6095 vmovdqa %ymm12,%ymm15
6096 vpaddd .avx2_inc(%rip),%ymm15,%ymm14
6097 vpaddd .avx2_inc(%rip),%ymm14,%ymm13
6098 vpaddd .avx2_inc(%rip),%ymm13,%ymm12
6099 vmovdqa %ymm12,160(%rbp)
6100 vmovdqa %ymm13,192(%rbp)
6101 vmovdqa %ymm14,224(%rbp)
6102 vmovdqa %ymm15,256(%rbp)
6103 movq $10,%r10
61041:
6105 vmovdqa %ymm8,128(%rbp)
6106 vmovdqa .rol16(%rip),%ymm8
6107 vpaddd %ymm7,%ymm3,%ymm3
6108 vpaddd %ymm6,%ymm2,%ymm2
6109 vpaddd %ymm5,%ymm1,%ymm1
6110 vpaddd %ymm4,%ymm0,%ymm0
6111 vpxor %ymm3,%ymm15,%ymm15
6112 vpxor %ymm2,%ymm14,%ymm14
6113 vpxor %ymm1,%ymm13,%ymm13
6114 vpxor %ymm0,%ymm12,%ymm12
6115 vpshufb %ymm8,%ymm15,%ymm15
6116 vpshufb %ymm8,%ymm14,%ymm14
6117 vpshufb %ymm8,%ymm13,%ymm13
6118 vpshufb %ymm8,%ymm12,%ymm12
6119 vmovdqa 128(%rbp),%ymm8
6120 vpaddd %ymm15,%ymm11,%ymm11
6121 vpaddd %ymm14,%ymm10,%ymm10
6122 vpaddd %ymm13,%ymm9,%ymm9
6123 vpaddd %ymm12,%ymm8,%ymm8
6124 vpxor %ymm11,%ymm7,%ymm7
6125 vpxor %ymm10,%ymm6,%ymm6
6126 vpxor %ymm9,%ymm5,%ymm5
6127 vpxor %ymm8,%ymm4,%ymm4
6128 vmovdqa %ymm8,128(%rbp)
6129 vpsrld $20,%ymm7,%ymm8
6130 vpslld $32-20,%ymm7,%ymm7
6131 vpxor %ymm8,%ymm7,%ymm7
6132 vpsrld $20,%ymm6,%ymm8
6133 vpslld $32-20,%ymm6,%ymm6
6134 vpxor %ymm8,%ymm6,%ymm6
6135 vpsrld $20,%ymm5,%ymm8
6136 vpslld $32-20,%ymm5,%ymm5
6137 vpxor %ymm8,%ymm5,%ymm5
6138 vpsrld $20,%ymm4,%ymm8
6139 vpslld $32-20,%ymm4,%ymm4
6140 vpxor %ymm8,%ymm4,%ymm4
6141 vmovdqa .rol8(%rip),%ymm8
6142 vpaddd %ymm7,%ymm3,%ymm3
6143 vpaddd %ymm6,%ymm2,%ymm2
6144 vpaddd %ymm5,%ymm1,%ymm1
6145 vpaddd %ymm4,%ymm0,%ymm0
6146 vpxor %ymm3,%ymm15,%ymm15
6147 vpxor %ymm2,%ymm14,%ymm14
6148 vpxor %ymm1,%ymm13,%ymm13
6149 vpxor %ymm0,%ymm12,%ymm12
6150 vpshufb %ymm8,%ymm15,%ymm15
6151 vpshufb %ymm8,%ymm14,%ymm14
6152 vpshufb %ymm8,%ymm13,%ymm13
6153 vpshufb %ymm8,%ymm12,%ymm12
6154 vmovdqa 128(%rbp),%ymm8
6155 vpaddd %ymm15,%ymm11,%ymm11
6156 vpaddd %ymm14,%ymm10,%ymm10
6157 vpaddd %ymm13,%ymm9,%ymm9
6158 vpaddd %ymm12,%ymm8,%ymm8
6159 vpxor %ymm11,%ymm7,%ymm7
6160 vpxor %ymm10,%ymm6,%ymm6
6161 vpxor %ymm9,%ymm5,%ymm5
6162 vpxor %ymm8,%ymm4,%ymm4
6163 vmovdqa %ymm8,128(%rbp)
6164 vpsrld $25,%ymm7,%ymm8
6165 vpslld $32-25,%ymm7,%ymm7
6166 vpxor %ymm8,%ymm7,%ymm7
6167 vpsrld $25,%ymm6,%ymm8
6168 vpslld $32-25,%ymm6,%ymm6
6169 vpxor %ymm8,%ymm6,%ymm6
6170 vpsrld $25,%ymm5,%ymm8
6171 vpslld $32-25,%ymm5,%ymm5
6172 vpxor %ymm8,%ymm5,%ymm5
6173 vpsrld $25,%ymm4,%ymm8
6174 vpslld $32-25,%ymm4,%ymm4
6175 vpxor %ymm8,%ymm4,%ymm4
6176 vmovdqa 128(%rbp),%ymm8
6177 vpalignr $4,%ymm7,%ymm7,%ymm7
6178 vpalignr $8,%ymm11,%ymm11,%ymm11
6179 vpalignr $12,%ymm15,%ymm15,%ymm15
6180 vpalignr $4,%ymm6,%ymm6,%ymm6
6181 vpalignr $8,%ymm10,%ymm10,%ymm10
6182 vpalignr $12,%ymm14,%ymm14,%ymm14
6183 vpalignr $4,%ymm5,%ymm5,%ymm5
6184 vpalignr $8,%ymm9,%ymm9,%ymm9
6185 vpalignr $12,%ymm13,%ymm13,%ymm13
6186 vpalignr $4,%ymm4,%ymm4,%ymm4
6187 vpalignr $8,%ymm8,%ymm8,%ymm8
6188 vpalignr $12,%ymm12,%ymm12,%ymm12
6189 vmovdqa %ymm8,128(%rbp)
6190 vmovdqa .rol16(%rip),%ymm8
6191 vpaddd %ymm7,%ymm3,%ymm3
6192 vpaddd %ymm6,%ymm2,%ymm2
6193 vpaddd %ymm5,%ymm1,%ymm1
6194 vpaddd %ymm4,%ymm0,%ymm0
6195 vpxor %ymm3,%ymm15,%ymm15
6196 vpxor %ymm2,%ymm14,%ymm14
6197 vpxor %ymm1,%ymm13,%ymm13
6198 vpxor %ymm0,%ymm12,%ymm12
6199 vpshufb %ymm8,%ymm15,%ymm15
6200 vpshufb %ymm8,%ymm14,%ymm14
6201 vpshufb %ymm8,%ymm13,%ymm13
6202 vpshufb %ymm8,%ymm12,%ymm12
6203 vmovdqa 128(%rbp),%ymm8
6204 vpaddd %ymm15,%ymm11,%ymm11
6205 vpaddd %ymm14,%ymm10,%ymm10
6206 vpaddd %ymm13,%ymm9,%ymm9
6207 vpaddd %ymm12,%ymm8,%ymm8
6208 vpxor %ymm11,%ymm7,%ymm7
6209 vpxor %ymm10,%ymm6,%ymm6
6210 vpxor %ymm9,%ymm5,%ymm5
6211 vpxor %ymm8,%ymm4,%ymm4
6212 vmovdqa %ymm8,128(%rbp)
6213 vpsrld $20,%ymm7,%ymm8
6214 vpslld $32-20,%ymm7,%ymm7
6215 vpxor %ymm8,%ymm7,%ymm7
6216 vpsrld $20,%ymm6,%ymm8
6217 vpslld $32-20,%ymm6,%ymm6
6218 vpxor %ymm8,%ymm6,%ymm6
6219 vpsrld $20,%ymm5,%ymm8
6220 vpslld $32-20,%ymm5,%ymm5
6221 vpxor %ymm8,%ymm5,%ymm5
6222 vpsrld $20,%ymm4,%ymm8
6223 vpslld $32-20,%ymm4,%ymm4
6224 vpxor %ymm8,%ymm4,%ymm4
6225 vmovdqa .rol8(%rip),%ymm8
6226 vpaddd %ymm7,%ymm3,%ymm3
6227 vpaddd %ymm6,%ymm2,%ymm2
6228 vpaddd %ymm5,%ymm1,%ymm1
6229 vpaddd %ymm4,%ymm0,%ymm0
6230 vpxor %ymm3,%ymm15,%ymm15
6231 vpxor %ymm2,%ymm14,%ymm14
6232 vpxor %ymm1,%ymm13,%ymm13
6233 vpxor %ymm0,%ymm12,%ymm12
6234 vpshufb %ymm8,%ymm15,%ymm15
6235 vpshufb %ymm8,%ymm14,%ymm14
6236 vpshufb %ymm8,%ymm13,%ymm13
6237 vpshufb %ymm8,%ymm12,%ymm12
6238 vmovdqa 128(%rbp),%ymm8
6239 vpaddd %ymm15,%ymm11,%ymm11
6240 vpaddd %ymm14,%ymm10,%ymm10
6241 vpaddd %ymm13,%ymm9,%ymm9
6242 vpaddd %ymm12,%ymm8,%ymm8
6243 vpxor %ymm11,%ymm7,%ymm7
6244 vpxor %ymm10,%ymm6,%ymm6
6245 vpxor %ymm9,%ymm5,%ymm5
6246 vpxor %ymm8,%ymm4,%ymm4
6247 vmovdqa %ymm8,128(%rbp)
6248 vpsrld $25,%ymm7,%ymm8
6249 vpslld $32-25,%ymm7,%ymm7
6250 vpxor %ymm8,%ymm7,%ymm7
6251 vpsrld $25,%ymm6,%ymm8
6252 vpslld $32-25,%ymm6,%ymm6
6253 vpxor %ymm8,%ymm6,%ymm6
6254 vpsrld $25,%ymm5,%ymm8
6255 vpslld $32-25,%ymm5,%ymm5
6256 vpxor %ymm8,%ymm5,%ymm5
6257 vpsrld $25,%ymm4,%ymm8
6258 vpslld $32-25,%ymm4,%ymm4
6259 vpxor %ymm8,%ymm4,%ymm4
6260 vmovdqa 128(%rbp),%ymm8
6261 vpalignr $12,%ymm7,%ymm7,%ymm7
6262 vpalignr $8,%ymm11,%ymm11,%ymm11
6263 vpalignr $4,%ymm15,%ymm15,%ymm15
6264 vpalignr $12,%ymm6,%ymm6,%ymm6
6265 vpalignr $8,%ymm10,%ymm10,%ymm10
6266 vpalignr $4,%ymm14,%ymm14,%ymm14
6267 vpalignr $12,%ymm5,%ymm5,%ymm5
6268 vpalignr $8,%ymm9,%ymm9,%ymm9
6269 vpalignr $4,%ymm13,%ymm13,%ymm13
6270 vpalignr $12,%ymm4,%ymm4,%ymm4
6271 vpalignr $8,%ymm8,%ymm8,%ymm8
6272 vpalignr $4,%ymm12,%ymm12,%ymm12
6273
6274 decq %r10
6275 jnz 1b
6276 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
6277 vpaddd 64(%rbp),%ymm7,%ymm7
6278 vpaddd 96(%rbp),%ymm11,%ymm11
6279 vpaddd 256(%rbp),%ymm15,%ymm15
6280 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6281 vpaddd 64(%rbp),%ymm6,%ymm6
6282 vpaddd 96(%rbp),%ymm10,%ymm10
6283 vpaddd 224(%rbp),%ymm14,%ymm14
6284 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6285 vpaddd 64(%rbp),%ymm5,%ymm5
6286 vpaddd 96(%rbp),%ymm9,%ymm9
6287 vpaddd 192(%rbp),%ymm13,%ymm13
6288 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6289 vpaddd 64(%rbp),%ymm4,%ymm4
6290 vpaddd 96(%rbp),%ymm8,%ymm8
6291 vpaddd 160(%rbp),%ymm12,%ymm12
6292
6293 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
6294 vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
6295 vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
6296 vpand .clamp(%rip),%ymm15,%ymm15
6297 vmovdqa %ymm15,0(%rbp)
6298 movq %r8,%r8
6299 call poly_hash_ad_internal
6300
6301 vpxor 0(%rsi),%ymm3,%ymm3
6302 vpxor 32(%rsi),%ymm11,%ymm11
6303 vmovdqu %ymm3,0(%rdi)
6304 vmovdqu %ymm11,32(%rdi)
6305 vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
6306 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
6307 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
6308 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
6309 vpxor 0+64(%rsi),%ymm15,%ymm15
6310 vpxor 32+64(%rsi),%ymm2,%ymm2
6311 vpxor 64+64(%rsi),%ymm6,%ymm6
6312 vpxor 96+64(%rsi),%ymm10,%ymm10
6313 vmovdqu %ymm15,0+64(%rdi)
6314 vmovdqu %ymm2,32+64(%rdi)
6315 vmovdqu %ymm6,64+64(%rdi)
6316 vmovdqu %ymm10,96+64(%rdi)
6317 vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
6318 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
6319 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
6320 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
6321 vpxor 0+192(%rsi),%ymm15,%ymm15
6322 vpxor 32+192(%rsi),%ymm1,%ymm1
6323 vpxor 64+192(%rsi),%ymm5,%ymm5
6324 vpxor 96+192(%rsi),%ymm9,%ymm9
6325 vmovdqu %ymm15,0+192(%rdi)
6326 vmovdqu %ymm1,32+192(%rdi)
6327 vmovdqu %ymm5,64+192(%rdi)
6328 vmovdqu %ymm9,96+192(%rdi)
6329 vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
6330 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
6331 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
6332 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
6333 vmovdqa %ymm15,%ymm8
6334
6335 leaq 320(%rsi),%rsi
6336 subq $320,%rbx
6337 movq $320,%rcx
6338 cmpq $128,%rbx
6339 jbe seal_avx2_hash
6340 vpxor 0(%rsi),%ymm0,%ymm0
6341 vpxor 32(%rsi),%ymm4,%ymm4
6342 vpxor 64(%rsi),%ymm8,%ymm8
6343 vpxor 96(%rsi),%ymm12,%ymm12
6344 vmovdqu %ymm0,320(%rdi)
6345 vmovdqu %ymm4,352(%rdi)
6346 vmovdqu %ymm8,384(%rdi)
6347 vmovdqu %ymm12,416(%rdi)
6348 leaq 128(%rsi),%rsi
6349 subq $128,%rbx
6350 movq $8,%rcx
6351 movq $2,%r8
6352 cmpq $128,%rbx
6353 jbe seal_avx2_tail_128
6354 cmpq $256,%rbx
6355 jbe seal_avx2_tail_256
6356 cmpq $384,%rbx
6357 jbe seal_avx2_tail_384
6358 cmpq $512,%rbx
6359 jbe seal_avx2_tail_512
6360 vmovdqa .chacha20_consts(%rip),%ymm0
6361 vmovdqa 64(%rbp),%ymm4
6362 vmovdqa 96(%rbp),%ymm8
6363 vmovdqa %ymm0,%ymm1
6364 vmovdqa %ymm4,%ymm5
6365 vmovdqa %ymm8,%ymm9
6366 vmovdqa %ymm0,%ymm2
6367 vmovdqa %ymm4,%ymm6
6368 vmovdqa %ymm8,%ymm10
6369 vmovdqa %ymm0,%ymm3
6370 vmovdqa %ymm4,%ymm7
6371 vmovdqa %ymm8,%ymm11
6372 vmovdqa .avx2_inc(%rip),%ymm12
6373 vpaddd 160(%rbp),%ymm12,%ymm15
6374 vpaddd %ymm15,%ymm12,%ymm14
6375 vpaddd %ymm14,%ymm12,%ymm13
6376 vpaddd %ymm13,%ymm12,%ymm12
6377 vmovdqa %ymm15,256(%rbp)
6378 vmovdqa %ymm14,224(%rbp)
6379 vmovdqa %ymm13,192(%rbp)
6380 vmovdqa %ymm12,160(%rbp)
6381 vmovdqa %ymm8,128(%rbp)
6382 vmovdqa .rol16(%rip),%ymm8
6383 vpaddd %ymm7,%ymm3,%ymm3
6384 vpaddd %ymm6,%ymm2,%ymm2
6385 vpaddd %ymm5,%ymm1,%ymm1
6386 vpaddd %ymm4,%ymm0,%ymm0
6387 vpxor %ymm3,%ymm15,%ymm15
6388 vpxor %ymm2,%ymm14,%ymm14
6389 vpxor %ymm1,%ymm13,%ymm13
6390 vpxor %ymm0,%ymm12,%ymm12
6391 vpshufb %ymm8,%ymm15,%ymm15
6392 vpshufb %ymm8,%ymm14,%ymm14
6393 vpshufb %ymm8,%ymm13,%ymm13
6394 vpshufb %ymm8,%ymm12,%ymm12
6395 vmovdqa 128(%rbp),%ymm8
6396 vpaddd %ymm15,%ymm11,%ymm11
6397 vpaddd %ymm14,%ymm10,%ymm10
6398 vpaddd %ymm13,%ymm9,%ymm9
6399 vpaddd %ymm12,%ymm8,%ymm8
6400 vpxor %ymm11,%ymm7,%ymm7
6401 vpxor %ymm10,%ymm6,%ymm6
6402 vpxor %ymm9,%ymm5,%ymm5
6403 vpxor %ymm8,%ymm4,%ymm4
6404 vmovdqa %ymm8,128(%rbp)
6405 vpsrld $20,%ymm7,%ymm8
6406 vpslld $32-20,%ymm7,%ymm7
6407 vpxor %ymm8,%ymm7,%ymm7
6408 vpsrld $20,%ymm6,%ymm8
6409 vpslld $32-20,%ymm6,%ymm6
6410 vpxor %ymm8,%ymm6,%ymm6
6411 vpsrld $20,%ymm5,%ymm8
6412 vpslld $32-20,%ymm5,%ymm5
6413 vpxor %ymm8,%ymm5,%ymm5
6414 vpsrld $20,%ymm4,%ymm8
6415 vpslld $32-20,%ymm4,%ymm4
6416 vpxor %ymm8,%ymm4,%ymm4
6417 vmovdqa .rol8(%rip),%ymm8
6418 vpaddd %ymm7,%ymm3,%ymm3
6419 vpaddd %ymm6,%ymm2,%ymm2
6420 vpaddd %ymm5,%ymm1,%ymm1
6421 vpaddd %ymm4,%ymm0,%ymm0
6422 vpxor %ymm3,%ymm15,%ymm15
6423 vpxor %ymm2,%ymm14,%ymm14
6424 vpxor %ymm1,%ymm13,%ymm13
6425 vpxor %ymm0,%ymm12,%ymm12
6426 vpshufb %ymm8,%ymm15,%ymm15
6427 vpshufb %ymm8,%ymm14,%ymm14
6428 vpshufb %ymm8,%ymm13,%ymm13
6429 vpshufb %ymm8,%ymm12,%ymm12
6430 vmovdqa 128(%rbp),%ymm8
6431 vpaddd %ymm15,%ymm11,%ymm11
6432 vpaddd %ymm14,%ymm10,%ymm10
6433 vpaddd %ymm13,%ymm9,%ymm9
6434 vpaddd %ymm12,%ymm8,%ymm8
6435 vpxor %ymm11,%ymm7,%ymm7
6436 vpxor %ymm10,%ymm6,%ymm6
6437 vpxor %ymm9,%ymm5,%ymm5
6438 vpxor %ymm8,%ymm4,%ymm4
6439 vmovdqa %ymm8,128(%rbp)
6440 vpsrld $25,%ymm7,%ymm8
6441 vpslld $32-25,%ymm7,%ymm7
6442 vpxor %ymm8,%ymm7,%ymm7
6443 vpsrld $25,%ymm6,%ymm8
6444 vpslld $32-25,%ymm6,%ymm6
6445 vpxor %ymm8,%ymm6,%ymm6
6446 vpsrld $25,%ymm5,%ymm8
6447 vpslld $32-25,%ymm5,%ymm5
6448 vpxor %ymm8,%ymm5,%ymm5
6449 vpsrld $25,%ymm4,%ymm8
6450 vpslld $32-25,%ymm4,%ymm4
6451 vpxor %ymm8,%ymm4,%ymm4
6452 vmovdqa 128(%rbp),%ymm8
6453 vpalignr $4,%ymm7,%ymm7,%ymm7
6454 vpalignr $8,%ymm11,%ymm11,%ymm11
6455 vpalignr $12,%ymm15,%ymm15,%ymm15
6456 vpalignr $4,%ymm6,%ymm6,%ymm6
6457 vpalignr $8,%ymm10,%ymm10,%ymm10
6458 vpalignr $12,%ymm14,%ymm14,%ymm14
6459 vpalignr $4,%ymm5,%ymm5,%ymm5
6460 vpalignr $8,%ymm9,%ymm9,%ymm9
6461 vpalignr $12,%ymm13,%ymm13,%ymm13
6462 vpalignr $4,%ymm4,%ymm4,%ymm4
6463 vpalignr $8,%ymm8,%ymm8,%ymm8
6464 vpalignr $12,%ymm12,%ymm12,%ymm12
6465 vmovdqa %ymm8,128(%rbp)
6466 vmovdqa .rol16(%rip),%ymm8
6467 vpaddd %ymm7,%ymm3,%ymm3
6468 vpaddd %ymm6,%ymm2,%ymm2
6469 vpaddd %ymm5,%ymm1,%ymm1
6470 vpaddd %ymm4,%ymm0,%ymm0
6471 vpxor %ymm3,%ymm15,%ymm15
6472 vpxor %ymm2,%ymm14,%ymm14
6473 vpxor %ymm1,%ymm13,%ymm13
6474 vpxor %ymm0,%ymm12,%ymm12
6475 vpshufb %ymm8,%ymm15,%ymm15
6476 vpshufb %ymm8,%ymm14,%ymm14
6477 vpshufb %ymm8,%ymm13,%ymm13
6478 vpshufb %ymm8,%ymm12,%ymm12
6479 vmovdqa 128(%rbp),%ymm8
6480 vpaddd %ymm15,%ymm11,%ymm11
6481 vpaddd %ymm14,%ymm10,%ymm10
6482 vpaddd %ymm13,%ymm9,%ymm9
6483 vpaddd %ymm12,%ymm8,%ymm8
6484 vpxor %ymm11,%ymm7,%ymm7
6485 vpxor %ymm10,%ymm6,%ymm6
6486 vpxor %ymm9,%ymm5,%ymm5
6487 vpxor %ymm8,%ymm4,%ymm4
6488 vmovdqa %ymm8,128(%rbp)
6489 vpsrld $20,%ymm7,%ymm8
6490 vpslld $32-20,%ymm7,%ymm7
6491 vpxor %ymm8,%ymm7,%ymm7
6492 vpsrld $20,%ymm6,%ymm8
6493 vpslld $32-20,%ymm6,%ymm6
6494 vpxor %ymm8,%ymm6,%ymm6
6495 vpsrld $20,%ymm5,%ymm8
6496 vpslld $32-20,%ymm5,%ymm5
6497 vpxor %ymm8,%ymm5,%ymm5
6498 vpsrld $20,%ymm4,%ymm8
6499 vpslld $32-20,%ymm4,%ymm4
6500 vpxor %ymm8,%ymm4,%ymm4
6501 vmovdqa .rol8(%rip),%ymm8
6502 vpaddd %ymm7,%ymm3,%ymm3
6503 vpaddd %ymm6,%ymm2,%ymm2
6504 vpaddd %ymm5,%ymm1,%ymm1
6505 vpaddd %ymm4,%ymm0,%ymm0
6506 vpxor %ymm3,%ymm15,%ymm15
6507 vpxor %ymm2,%ymm14,%ymm14
6508 vpxor %ymm1,%ymm13,%ymm13
6509 vpxor %ymm0,%ymm12,%ymm12
6510 vpshufb %ymm8,%ymm15,%ymm15
6511 vpshufb %ymm8,%ymm14,%ymm14
6512 vpshufb %ymm8,%ymm13,%ymm13
6513 vpshufb %ymm8,%ymm12,%ymm12
6514 vmovdqa 128(%rbp),%ymm8
6515 vpaddd %ymm15,%ymm11,%ymm11
6516 vpaddd %ymm14,%ymm10,%ymm10
6517 vpaddd %ymm13,%ymm9,%ymm9
6518 vpaddd %ymm12,%ymm8,%ymm8
6519 vpxor %ymm11,%ymm7,%ymm7
6520 vpxor %ymm10,%ymm6,%ymm6
6521 vpxor %ymm9,%ymm5,%ymm5
6522 vpxor %ymm8,%ymm4,%ymm4
6523 vmovdqa %ymm8,128(%rbp)
6524 vpsrld $25,%ymm7,%ymm8
6525 vpslld $32-25,%ymm7,%ymm7
6526 vpxor %ymm8,%ymm7,%ymm7
6527 vpsrld $25,%ymm6,%ymm8
6528 vpslld $32-25,%ymm6,%ymm6
6529 vpxor %ymm8,%ymm6,%ymm6
6530 vpsrld $25,%ymm5,%ymm8
6531 vpslld $32-25,%ymm5,%ymm5
6532 vpxor %ymm8,%ymm5,%ymm5
6533 vpsrld $25,%ymm4,%ymm8
6534 vpslld $32-25,%ymm4,%ymm4
6535 vpxor %ymm8,%ymm4,%ymm4
6536 vmovdqa 128(%rbp),%ymm8
6537 vpalignr $12,%ymm7,%ymm7,%ymm7
6538 vpalignr $8,%ymm11,%ymm11,%ymm11
6539 vpalignr $4,%ymm15,%ymm15,%ymm15
6540 vpalignr $12,%ymm6,%ymm6,%ymm6
6541 vpalignr $8,%ymm10,%ymm10,%ymm10
6542 vpalignr $4,%ymm14,%ymm14,%ymm14
6543 vpalignr $12,%ymm5,%ymm5,%ymm5
6544 vpalignr $8,%ymm9,%ymm9,%ymm9
6545 vpalignr $4,%ymm13,%ymm13,%ymm13
6546 vpalignr $12,%ymm4,%ymm4,%ymm4
6547 vpalignr $8,%ymm8,%ymm8,%ymm8
6548 vpalignr $4,%ymm12,%ymm12,%ymm12
6549 vmovdqa %ymm8,128(%rbp)
6550 vmovdqa .rol16(%rip),%ymm8
6551 vpaddd %ymm7,%ymm3,%ymm3
6552 vpaddd %ymm6,%ymm2,%ymm2
6553 vpaddd %ymm5,%ymm1,%ymm1
6554 vpaddd %ymm4,%ymm0,%ymm0
6555 vpxor %ymm3,%ymm15,%ymm15
6556 vpxor %ymm2,%ymm14,%ymm14
6557 vpxor %ymm1,%ymm13,%ymm13
6558 vpxor %ymm0,%ymm12,%ymm12
6559 vpshufb %ymm8,%ymm15,%ymm15
6560 vpshufb %ymm8,%ymm14,%ymm14
6561 vpshufb %ymm8,%ymm13,%ymm13
6562 vpshufb %ymm8,%ymm12,%ymm12
6563 vmovdqa 128(%rbp),%ymm8
6564 vpaddd %ymm15,%ymm11,%ymm11
6565 vpaddd %ymm14,%ymm10,%ymm10
6566 vpaddd %ymm13,%ymm9,%ymm9
6567 vpaddd %ymm12,%ymm8,%ymm8
6568 vpxor %ymm11,%ymm7,%ymm7
6569 vpxor %ymm10,%ymm6,%ymm6
6570 vpxor %ymm9,%ymm5,%ymm5
6571 vpxor %ymm8,%ymm4,%ymm4
6572 vmovdqa %ymm8,128(%rbp)
6573 vpsrld $20,%ymm7,%ymm8
6574 vpslld $32-20,%ymm7,%ymm7
6575 vpxor %ymm8,%ymm7,%ymm7
6576 vpsrld $20,%ymm6,%ymm8
6577 vpslld $32-20,%ymm6,%ymm6
6578 vpxor %ymm8,%ymm6,%ymm6
6579 vpsrld $20,%ymm5,%ymm8
6580 vpslld $32-20,%ymm5,%ymm5
6581 vpxor %ymm8,%ymm5,%ymm5
6582 vpsrld $20,%ymm4,%ymm8
6583 vpslld $32-20,%ymm4,%ymm4
6584 vpxor %ymm8,%ymm4,%ymm4
6585 vmovdqa .rol8(%rip),%ymm8
6586 vpaddd %ymm7,%ymm3,%ymm3
6587 vpaddd %ymm6,%ymm2,%ymm2
6588 vpaddd %ymm5,%ymm1,%ymm1
6589 vpaddd %ymm4,%ymm0,%ymm0
6590
6591 subq $16,%rdi
6592 movq $9,%rcx
6593 jmp 4f
# Top of the 512-byte main loop of the AVX2 seal path.  Rebuilds four
# ChaCha20 register sets from the state kept on the frame; set k (k = 0..3)
# uses ymm(k), ymm(4+k), ymm(8+k) and ymm(12+k) as rows 0..3.
65941:
# Rows 0..2 come from .chacha20_consts, 64(%rbp) and 96(%rbp) and are copied
# unchanged into all four sets.
6595 vmovdqa .chacha20_consts(%rip),%ymm0
6596 vmovdqa 64(%rbp),%ymm4
6597 vmovdqa 96(%rbp),%ymm8
6598 vmovdqa %ymm0,%ymm1
6599 vmovdqa %ymm4,%ymm5
6600 vmovdqa %ymm8,%ymm9
6601 vmovdqa %ymm0,%ymm2
6602 vmovdqa %ymm4,%ymm6
6603 vmovdqa %ymm8,%ymm10
6604 vmovdqa %ymm0,%ymm3
6605 vmovdqa %ymm4,%ymm7
6606 vmovdqa %ymm8,%ymm11
# Row 3: start from the row saved at 160(%rbp) and add .avx2_inc once more
# per set (ymm15 first, then ymm14, ymm13, ymm12).  The new rows are saved at
# 256/224/192/160(%rbp) because they are added back in after the rounds.
6607 vmovdqa .avx2_inc(%rip),%ymm12
6608 vpaddd 160(%rbp),%ymm12,%ymm15
6609 vpaddd %ymm15,%ymm12,%ymm14
6610 vpaddd %ymm14,%ymm12,%ymm13
6611 vpaddd %ymm13,%ymm12,%ymm12
6612 vmovdqa %ymm15,256(%rbp)
6613 vmovdqa %ymm14,224(%rbp)
6614 vmovdqa %ymm13,192(%rbp)
6615 vmovdqa %ymm12,160(%rbp)
6616
# Pass counter for the round loop at label 2 (decremented at its bottom).
6617 movq $10,%rcx
# Round loop, first part.  Scalar Poly1305 work is interleaved with the
# vector ChaCha20 work on the four register sets.
# Poly1305 accumulator: r12:r11:r10.  Multiplier: the two 64-bit words at
# 0(%rbp) and 8(%rbp).  Product limbs: r13 (lowest), r14, r15, r9 (highest).
# mulx does not write the flags, so it may sit between an add and the adc
# that consumes its carry.
66182:
# Absorb the 16 bytes at 0(%rdi); the $1 added into r12 is the 2^128 bit.
6619 addq 0(%rdi),%r10
6620 adcq 8+0(%rdi),%r11
6621 adcq $1,%r12
# ChaCha20 column round, step 1: a += b; d ^= a.  ymm8 is borrowed for the
# .rol16 shuffle mask, so row 2 of set 0 is parked at 128(%rbp) meanwhile.
6622 vmovdqa %ymm8,128(%rbp)
6623 vmovdqa .rol16(%rip),%ymm8
6624 vpaddd %ymm7,%ymm3,%ymm3
6625 vpaddd %ymm6,%ymm2,%ymm2
6626 vpaddd %ymm5,%ymm1,%ymm1
6627 vpaddd %ymm4,%ymm0,%ymm0
6628 vpxor %ymm3,%ymm15,%ymm15
6629 vpxor %ymm2,%ymm14,%ymm14
6630 vpxor %ymm1,%ymm13,%ymm13
6631 vpxor %ymm0,%ymm12,%ymm12
# Poly1305: accumulator times the low multiplier word -> r13, r14, r15.
6632 movq 0+0(%rbp),%rdx
6633 movq %rdx,%r15
6634 mulxq %r10,%r13,%r14
6635 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006636 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006637 addq %rax,%r14
6638 adcq %rdx,%r15
# ChaCha20: d <<<= 16 (byte shuffle), restore row 2 of set 0, c += d.
6639 vpshufb %ymm8,%ymm15,%ymm15
6640 vpshufb %ymm8,%ymm14,%ymm14
6641 vpshufb %ymm8,%ymm13,%ymm13
6642 vpshufb %ymm8,%ymm12,%ymm12
6643 vmovdqa 128(%rbp),%ymm8
6644 vpaddd %ymm15,%ymm11,%ymm11
6645 vpaddd %ymm14,%ymm10,%ymm10
6646 vpaddd %ymm13,%ymm9,%ymm9
6647 vpaddd %ymm12,%ymm8,%ymm8
# Poly1305: accumulator times the high multiplier word, added into r14, r15, r9.
6648 movq 8+0(%rbp),%rdx
6649 mulxq %r10,%r10,%rax
6650 addq %r10,%r14
6651 mulxq %r11,%r11,%r9
6652 adcq %r11,%r15
6653 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006654 imulq %r12,%rdx
# ChaCha20: b ^= c, then b <<<= 12 built from a right shift by 20, a left
# shift by 12 and an xor, with ymm8 as scratch (row 2 of set 0 parked again).
David Benjaminf31229b2017-01-25 14:08:15 -05006655 vpxor %ymm11,%ymm7,%ymm7
6656 vpxor %ymm10,%ymm6,%ymm6
6657 vpxor %ymm9,%ymm5,%ymm5
6658 vpxor %ymm8,%ymm4,%ymm4
6659 vmovdqa %ymm8,128(%rbp)
6660 vpsrld $20,%ymm7,%ymm8
6661 vpslld $32-20,%ymm7,%ymm7
6662 vpxor %ymm8,%ymm7,%ymm7
6663 vpsrld $20,%ymm6,%ymm8
6664 vpslld $32-20,%ymm6,%ymm6
6665 vpxor %ymm8,%ymm6,%ymm6
6666 vpsrld $20,%ymm5,%ymm8
6667 addq %rax,%r15
6668 adcq %rdx,%r9
6669 vpslld $32-20,%ymm5,%ymm5
6670 vpxor %ymm8,%ymm5,%ymm5
6671 vpsrld $20,%ymm4,%ymm8
6672 vpslld $32-20,%ymm4,%ymm4
6673 vpxor %ymm8,%ymm4,%ymm4
# ChaCha20 column round, step 2 begins: load the .rol8 mask and do a += b.
# The matching d ^= a and rotate are the first instructions at label 4.
6674 vmovdqa .rol8(%rip),%ymm8
6675 vpaddd %ymm7,%ymm3,%ymm3
6676 vpaddd %ymm6,%ymm2,%ymm2
6677 vpaddd %ymm5,%ymm1,%ymm1
6678 vpaddd %ymm4,%ymm0,%ymm0
# Poly1305 reduction: keep the low two bits of r15 as the new top limb, then
# add the remaining high part back twice - once in place (r13 with the low
# two bits cleared, and r14) and once shifted right by two (r15, r9).  The
# two additions together fold in five times the part above bit 130.
6679 movq %r13,%r10
6680 movq %r14,%r11
6681 movq %r15,%r12
6682 andq $3,%r12
6683 movq %r15,%r13
6684 andq $-4,%r13
6685 movq %r9,%r14
6686 shrdq $2,%r9,%r15
6687 shrq $2,%r9
6688 addq %r13,%r10
6689 adcq %r14,%r11
6690 adcq $0,%r12
6691 addq %r15,%r10
6692 adcq %r9,%r11
6693 adcq $0,%r12
6694
# Round loop, second part.  Reached by falling through from label 2 and also
# by the jmp 4f that precedes the main loop.  On entry ymm8 holds the .rol8
# shuffle mask, row 2 of set 0 is parked at 128(%rbp), and the a += b of the
# second quarter-round step is already done.
# Poly1305 registers as elsewhere in this function: accumulator r12:r11:r10,
# multiplier words at 0(%rbp) and 8(%rbp), product limbs r13, r14, r15, r9.
66954:
# ChaCha20 column round, step 2 continued: d ^= a; d <<<= 8 (byte shuffle).
6696 vpxor %ymm3,%ymm15,%ymm15
6697 vpxor %ymm2,%ymm14,%ymm14
6698 vpxor %ymm1,%ymm13,%ymm13
6699 vpxor %ymm0,%ymm12,%ymm12
6700 vpshufb %ymm8,%ymm15,%ymm15
6701 vpshufb %ymm8,%ymm14,%ymm14
6702 vpshufb %ymm8,%ymm13,%ymm13
6703 vpshufb %ymm8,%ymm12,%ymm12
6704 vmovdqa 128(%rbp),%ymm8
# Poly1305: absorb the 16 bytes at 16(%rdi), with the 2^128 bit.
6705 addq 16(%rdi),%r10
6706 adcq 8+16(%rdi),%r11
6707 adcq $1,%r12
# ChaCha20: c += d; b ^= c.
6708 vpaddd %ymm15,%ymm11,%ymm11
6709 vpaddd %ymm14,%ymm10,%ymm10
6710 vpaddd %ymm13,%ymm9,%ymm9
6711 vpaddd %ymm12,%ymm8,%ymm8
6712 vpxor %ymm11,%ymm7,%ymm7
6713 vpxor %ymm10,%ymm6,%ymm6
6714 vpxor %ymm9,%ymm5,%ymm5
6715 vpxor %ymm8,%ymm4,%ymm4
# Poly1305: accumulator times the low multiplier word.
6716 movq 0+0(%rbp),%rdx
6717 movq %rdx,%r15
6718 mulxq %r10,%r13,%r14
6719 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006720 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006721 addq %rax,%r14
6722 adcq %rdx,%r15
# ChaCha20: b <<<= 7 (right shift by 25, left shift by 7, xor; ymm8 scratch).
6723 vmovdqa %ymm8,128(%rbp)
6724 vpsrld $25,%ymm7,%ymm8
6725 vpslld $32-25,%ymm7,%ymm7
6726 vpxor %ymm8,%ymm7,%ymm7
6727 vpsrld $25,%ymm6,%ymm8
6728 vpslld $32-25,%ymm6,%ymm6
6729 vpxor %ymm8,%ymm6,%ymm6
6730 vpsrld $25,%ymm5,%ymm8
6731 vpslld $32-25,%ymm5,%ymm5
6732 vpxor %ymm8,%ymm5,%ymm5
6733 vpsrld $25,%ymm4,%ymm8
6734 vpslld $32-25,%ymm4,%ymm4
6735 vpxor %ymm8,%ymm4,%ymm4
6736 vmovdqa 128(%rbp),%ymm8
# Rotate rows 1, 2 and 3 of every set by 4, 8 and 12 bytes within each
# 128-bit lane so that the next quarter-round operates on the diagonals.
6737 vpalignr $4,%ymm7,%ymm7,%ymm7
6738 vpalignr $8,%ymm11,%ymm11,%ymm11
6739 vpalignr $12,%ymm15,%ymm15,%ymm15
6740 vpalignr $4,%ymm6,%ymm6,%ymm6
# Poly1305: accumulator times the high multiplier word (interleaved).
6741 movq 8+0(%rbp),%rdx
6742 mulxq %r10,%r10,%rax
6743 addq %r10,%r14
6744 mulxq %r11,%r11,%r9
6745 adcq %r11,%r15
6746 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006747 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05006748 vpalignr $8,%ymm10,%ymm10,%ymm10
6749 vpalignr $12,%ymm14,%ymm14,%ymm14
6750 vpalignr $4,%ymm5,%ymm5,%ymm5
6751 vpalignr $8,%ymm9,%ymm9,%ymm9
6752 vpalignr $12,%ymm13,%ymm13,%ymm13
6753 vpalignr $4,%ymm4,%ymm4,%ymm4
6754 vpalignr $8,%ymm8,%ymm8,%ymm8
6755 vpalignr $12,%ymm12,%ymm12,%ymm12
# ChaCha20 diagonal round, step 1: a += b; d ^= a; d <<<= 16; c += d.
6756 vmovdqa %ymm8,128(%rbp)
6757 vmovdqa .rol16(%rip),%ymm8
6758 vpaddd %ymm7,%ymm3,%ymm3
6759 vpaddd %ymm6,%ymm2,%ymm2
6760 vpaddd %ymm5,%ymm1,%ymm1
6761 vpaddd %ymm4,%ymm0,%ymm0
6762 vpxor %ymm3,%ymm15,%ymm15
6763 vpxor %ymm2,%ymm14,%ymm14
6764 vpxor %ymm1,%ymm13,%ymm13
6765 vpxor %ymm0,%ymm12,%ymm12
6766 addq %rax,%r15
6767 adcq %rdx,%r9
6768 vpshufb %ymm8,%ymm15,%ymm15
6769 vpshufb %ymm8,%ymm14,%ymm14
6770 vpshufb %ymm8,%ymm13,%ymm13
6771 vpshufb %ymm8,%ymm12,%ymm12
6772 vmovdqa 128(%rbp),%ymm8
6773 vpaddd %ymm15,%ymm11,%ymm11
6774 vpaddd %ymm14,%ymm10,%ymm10
6775 vpaddd %ymm13,%ymm9,%ymm9
6776 vpaddd %ymm12,%ymm8,%ymm8
# Poly1305 reduction: low two bits of r15 stay as the top limb; the rest is
# added back once in place and once shifted right by two (five times the
# part above bit 130 in total).
6777 movq %r13,%r10
6778 movq %r14,%r11
6779 movq %r15,%r12
6780 andq $3,%r12
6781 movq %r15,%r13
6782 andq $-4,%r13
6783 movq %r9,%r14
6784 shrdq $2,%r9,%r15
6785 shrq $2,%r9
6786 addq %r13,%r10
6787 adcq %r14,%r11
6788 adcq $0,%r12
6789 addq %r15,%r10
6790 adcq %r9,%r11
6791 adcq $0,%r12
# ChaCha20: b ^= c; b <<<= 12 (set 3 first, the other sets after the absorb).
6792 vpxor %ymm11,%ymm7,%ymm7
6793 vpxor %ymm10,%ymm6,%ymm6
6794 vpxor %ymm9,%ymm5,%ymm5
6795 vpxor %ymm8,%ymm4,%ymm4
6796 vmovdqa %ymm8,128(%rbp)
6797 vpsrld $20,%ymm7,%ymm8
6798 vpslld $32-20,%ymm7,%ymm7
6799 vpxor %ymm8,%ymm7,%ymm7
# Poly1305: absorb the 16 bytes at 32(%rdi), then advance %rdi by 48, the
# number of bytes hashed per full pass through labels 2 and 4.
6800 addq 32(%rdi),%r10
6801 adcq 8+32(%rdi),%r11
6802 adcq $1,%r12
6803
6804 leaq 48(%rdi),%rdi
6805 vpsrld $20,%ymm6,%ymm8
6806 vpslld $32-20,%ymm6,%ymm6
6807 vpxor %ymm8,%ymm6,%ymm6
6808 vpsrld $20,%ymm5,%ymm8
6809 vpslld $32-20,%ymm5,%ymm5
6810 vpxor %ymm8,%ymm5,%ymm5
6811 vpsrld $20,%ymm4,%ymm8
6812 vpslld $32-20,%ymm4,%ymm4
6813 vpxor %ymm8,%ymm4,%ymm4
# ChaCha20 diagonal round, step 2: a += b; d ^= a; d <<<= 8; c += d.
6814 vmovdqa .rol8(%rip),%ymm8
6815 vpaddd %ymm7,%ymm3,%ymm3
6816 vpaddd %ymm6,%ymm2,%ymm2
6817 vpaddd %ymm5,%ymm1,%ymm1
6818 vpaddd %ymm4,%ymm0,%ymm0
6819 vpxor %ymm3,%ymm15,%ymm15
6820 vpxor %ymm2,%ymm14,%ymm14
6821 vpxor %ymm1,%ymm13,%ymm13
6822 vpxor %ymm0,%ymm12,%ymm12
# Poly1305: accumulator times the low multiplier word.
6823 movq 0+0(%rbp),%rdx
6824 movq %rdx,%r15
6825 mulxq %r10,%r13,%r14
6826 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006827 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006828 addq %rax,%r14
6829 adcq %rdx,%r15
6830 vpshufb %ymm8,%ymm15,%ymm15
6831 vpshufb %ymm8,%ymm14,%ymm14
6832 vpshufb %ymm8,%ymm13,%ymm13
6833 vpshufb %ymm8,%ymm12,%ymm12
6834 vmovdqa 128(%rbp),%ymm8
6835 vpaddd %ymm15,%ymm11,%ymm11
6836 vpaddd %ymm14,%ymm10,%ymm10
6837 vpaddd %ymm13,%ymm9,%ymm9
# Poly1305: accumulator times the high multiplier word.
6838 movq 8+0(%rbp),%rdx
6839 mulxq %r10,%r10,%rax
6840 addq %r10,%r14
6841 mulxq %r11,%r11,%r9
6842 adcq %r11,%r15
6843 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006844 imulq %r12,%rdx
# ChaCha20: finish c += d for set 0, then b ^= c; b <<<= 7.
David Benjaminf31229b2017-01-25 14:08:15 -05006845 vpaddd %ymm12,%ymm8,%ymm8
6846 vpxor %ymm11,%ymm7,%ymm7
6847 vpxor %ymm10,%ymm6,%ymm6
6848 vpxor %ymm9,%ymm5,%ymm5
6849 vpxor %ymm8,%ymm4,%ymm4
6850 vmovdqa %ymm8,128(%rbp)
6851 vpsrld $25,%ymm7,%ymm8
6852 vpslld $32-25,%ymm7,%ymm7
6853 addq %rax,%r15
6854 adcq %rdx,%r9
6855 vpxor %ymm8,%ymm7,%ymm7
6856 vpsrld $25,%ymm6,%ymm8
6857 vpslld $32-25,%ymm6,%ymm6
6858 vpxor %ymm8,%ymm6,%ymm6
6859 vpsrld $25,%ymm5,%ymm8
6860 vpslld $32-25,%ymm5,%ymm5
6861 vpxor %ymm8,%ymm5,%ymm5
6862 vpsrld $25,%ymm4,%ymm8
6863 vpslld $32-25,%ymm4,%ymm4
6864 vpxor %ymm8,%ymm4,%ymm4
6865 vmovdqa 128(%rbp),%ymm8
# Undo the row rotation (12, 8 and 4 bytes for rows 1, 2 and 3) so the state
# is back in column order for the next pass.
6866 vpalignr $12,%ymm7,%ymm7,%ymm7
6867 vpalignr $8,%ymm11,%ymm11,%ymm11
6868 vpalignr $4,%ymm15,%ymm15,%ymm15
6869 vpalignr $12,%ymm6,%ymm6,%ymm6
6870 vpalignr $8,%ymm10,%ymm10,%ymm10
6871 vpalignr $4,%ymm14,%ymm14,%ymm14
6872 vpalignr $12,%ymm5,%ymm5,%ymm5
# Poly1305 reduction (same folding as above), interleaved with the rotation.
6873 movq %r13,%r10
6874 movq %r14,%r11
6875 movq %r15,%r12
6876 andq $3,%r12
6877 movq %r15,%r13
6878 andq $-4,%r13
6879 movq %r9,%r14
6880 shrdq $2,%r9,%r15
6881 shrq $2,%r9
6882 addq %r13,%r10
6883 adcq %r14,%r11
6884 adcq $0,%r12
6885 addq %r15,%r10
6886 adcq %r9,%r11
6887 adcq $0,%r12
6888 vpalignr $8,%ymm9,%ymm9,%ymm9
6889 vpalignr $4,%ymm13,%ymm13,%ymm13
6890 vpalignr $12,%ymm4,%ymm4,%ymm4
6891 vpalignr $8,%ymm8,%ymm8,%ymm8
6892 vpalignr $4,%ymm12,%ymm12,%ymm12
6893
# One column round plus one diagonal round done; repeat until rcx reaches 0.
6894 decq %rcx
6895 jne 2b
# After the round loop: turn the four register sets into keystream, XOR 512
# bytes of input at %rsi into the output at %rdi, and keep hashing 16-byte
# blocks that lie just below the region being written.
# Poly1305 registers: accumulator r12:r11:r10, multiplier words at 0(%rbp)
# and 8(%rbp), product limbs r13, r14, r15, r9.
#
# Add the pre-round state back into every set: .chacha20_consts, 64(%rbp),
# 96(%rbp) and the counter row saved for that set (256/224/192/160(%rbp)).
6896 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
6897 vpaddd 64(%rbp),%ymm7,%ymm7
6898 vpaddd 96(%rbp),%ymm11,%ymm11
6899 vpaddd 256(%rbp),%ymm15,%ymm15
6900 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6901 vpaddd 64(%rbp),%ymm6,%ymm6
6902 vpaddd 96(%rbp),%ymm10,%ymm10
6903 vpaddd 224(%rbp),%ymm14,%ymm14
6904 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6905 vpaddd 64(%rbp),%ymm5,%ymm5
6906 vpaddd 96(%rbp),%ymm9,%ymm9
6907 vpaddd 192(%rbp),%ymm13,%ymm13
6908 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6909 vpaddd 64(%rbp),%ymm4,%ymm4
6910 vpaddd 96(%rbp),%ymm8,%ymm8
6911 vpaddd 160(%rbp),%ymm12,%ymm12
6912
# Advance %rdi by 32; the two 16-byte blocks now at -32(%rdi) and -16(%rdi)
# are hashed below, and the new output is stored from 0(%rdi) upward.
# ymm0 is parked at 128(%rbp) so it can serve as a scratch register.
6913 leaq 32(%rdi),%rdi
6914 vmovdqa %ymm0,128(%rbp)
6915 addq -32(%rdi),%r10
6916 adcq 8+-32(%rdi),%r11
6917 adcq $1,%r12
# Set 3 (the first counter row in the chain) -> bytes 0..127.  vperm2i128
# with $0x02 pairs the low 128-bit lanes of two rows and with $0x13 the high
# lanes, so each result register holds 32 consecutive keystream bytes.
6918 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
6919 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
6920 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
6921 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
6922 vpxor 0+0(%rsi),%ymm0,%ymm0
6923 vpxor 32+0(%rsi),%ymm3,%ymm3
6924 vpxor 64+0(%rsi),%ymm7,%ymm7
6925 vpxor 96+0(%rsi),%ymm11,%ymm11
6926 vmovdqu %ymm0,0+0(%rdi)
6927 vmovdqu %ymm3,32+0(%rdi)
6928 vmovdqu %ymm7,64+0(%rdi)
6929 vmovdqu %ymm11,96+0(%rdi)
6930
6931 vmovdqa 128(%rbp),%ymm0
# Poly1305 multiply for the block absorbed above, using plain mul (rdx:rax)
# here rather than mulx, followed by the usual reduction.
6932 movq 0+0(%rbp),%rax
6933 movq %rax,%r15
6934 mulq %r10
6935 movq %rax,%r13
6936 movq %rdx,%r14
6937 movq 0+0(%rbp),%rax
6938 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08006939 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006940 addq %rax,%r14
6941 adcq %rdx,%r15
6942 movq 8+0(%rbp),%rax
6943 movq %rax,%r9
6944 mulq %r10
6945 addq %rax,%r14
6946 adcq $0,%rdx
6947 movq %rdx,%r10
6948 movq 8+0(%rbp),%rax
6949 mulq %r11
6950 addq %rax,%r15
6951 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006952 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05006953 addq %r10,%r15
6954 adcq %rdx,%r9
6955 movq %r13,%r10
6956 movq %r14,%r11
6957 movq %r15,%r12
6958 andq $3,%r12
6959 movq %r15,%r13
6960 andq $-4,%r13
6961 movq %r9,%r14
6962 shrdq $2,%r9,%r15
6963 shrq $2,%r9
6964 addq %r13,%r10
6965 adcq %r14,%r11
6966 adcq $0,%r12
6967 addq %r15,%r10
6968 adcq %r9,%r11
6969 adcq $0,%r12
# Set 2 -> bytes 128..255.
6970 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
6971 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
6972 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
6973 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
6974 vpxor 0+128(%rsi),%ymm3,%ymm3
6975 vpxor 32+128(%rsi),%ymm2,%ymm2
6976 vpxor 64+128(%rsi),%ymm6,%ymm6
6977 vpxor 96+128(%rsi),%ymm10,%ymm10
6978 vmovdqu %ymm3,0+128(%rdi)
6979 vmovdqu %ymm2,32+128(%rdi)
6980 vmovdqu %ymm6,64+128(%rdi)
6981 vmovdqu %ymm10,96+128(%rdi)
# Poly1305: absorb the 16 bytes at -16(%rdi).
6982 addq -16(%rdi),%r10
6983 adcq 8+-16(%rdi),%r11
6984 adcq $1,%r12
# Set 1 -> bytes 256..383.
6985 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
6986 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
6987 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
6988 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
6989 vpxor 0+256(%rsi),%ymm3,%ymm3
6990 vpxor 32+256(%rsi),%ymm1,%ymm1
6991 vpxor 64+256(%rsi),%ymm5,%ymm5
6992 vpxor 96+256(%rsi),%ymm9,%ymm9
6993 vmovdqu %ymm3,0+256(%rdi)
6994 vmovdqu %ymm1,32+256(%rdi)
6995 vmovdqu %ymm5,64+256(%rdi)
6996 vmovdqu %ymm9,96+256(%rdi)
# Poly1305 multiply and reduction for the block absorbed above.
6997 movq 0+0(%rbp),%rax
6998 movq %rax,%r15
6999 mulq %r10
7000 movq %rax,%r13
7001 movq %rdx,%r14
7002 movq 0+0(%rbp),%rax
7003 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007004 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007005 addq %rax,%r14
7006 adcq %rdx,%r15
7007 movq 8+0(%rbp),%rax
7008 movq %rax,%r9
7009 mulq %r10
7010 addq %rax,%r14
7011 adcq $0,%rdx
7012 movq %rdx,%r10
7013 movq 8+0(%rbp),%rax
7014 mulq %r11
7015 addq %rax,%r15
7016 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007017 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007018 addq %r10,%r15
7019 adcq %rdx,%r9
7020 movq %r13,%r10
7021 movq %r14,%r11
7022 movq %r15,%r12
7023 andq $3,%r12
7024 movq %r15,%r13
7025 andq $-4,%r13
7026 movq %r9,%r14
7027 shrdq $2,%r9,%r15
7028 shrq $2,%r9
7029 addq %r13,%r10
7030 adcq %r14,%r11
7031 adcq $0,%r12
7032 addq %r15,%r10
7033 adcq %r9,%r11
7034 adcq $0,%r12
# Set 0 -> bytes 384..511.
7035 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
7036 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
7037 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
7038 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
7039 vpxor 0+384(%rsi),%ymm3,%ymm3
7040 vpxor 32+384(%rsi),%ymm0,%ymm0
7041 vpxor 64+384(%rsi),%ymm4,%ymm4
7042 vpxor 96+384(%rsi),%ymm8,%ymm8
7043 vmovdqu %ymm3,0+384(%rdi)
7044 vmovdqu %ymm0,32+384(%rdi)
7045 vmovdqu %ymm4,64+384(%rdi)
7046 vmovdqu %ymm8,96+384(%rdi)
7047
# Consume 512 input bytes; %rdi is left pointing at the 512 bytes just
# written.  Loop again while more than 512 bytes remain in %rbx.
# NOTE(review): jg is a signed comparison, whereas the neighbouring length
# checks on %rbx use ja/jbe.  The result is the same for lengths below 2^63;
# ja would be consistent with the rest - confirm before changing.
7048 leaq 512(%rsi),%rsi
7049 subq $512,%rbx
7050 cmpq $512,%rbx
7051 jg 1b
# Main loop exit.  Hash two 16-byte blocks at 0(%rdi) and 16(%rdi) (each
# absorb adds the 2^128 bit, then multiplies by the words at 0(%rbp) and
# 8(%rbp) and reduces), advance %rdi by 32, and set up the tail counters.
# Poly1305 accumulator: r12:r11:r10; product limbs r13, r14, r15, r9.
7052 addq 0(%rdi),%r10
7053 adcq 8+0(%rdi),%r11
7054 adcq $1,%r12
7055 movq 0+0(%rbp),%rax
7056 movq %rax,%r15
7057 mulq %r10
7058 movq %rax,%r13
7059 movq %rdx,%r14
7060 movq 0+0(%rbp),%rax
7061 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007062 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007063 addq %rax,%r14
7064 adcq %rdx,%r15
7065 movq 8+0(%rbp),%rax
7066 movq %rax,%r9
7067 mulq %r10
7068 addq %rax,%r14
7069 adcq $0,%rdx
7070 movq %rdx,%r10
7071 movq 8+0(%rbp),%rax
7072 mulq %r11
7073 addq %rax,%r15
7074 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007075 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007076 addq %r10,%r15
7077 adcq %rdx,%r9
# Reduction: low two bits of r15 stay as the top limb; the rest is added
# back once in place and once shifted right by two.
7078 movq %r13,%r10
7079 movq %r14,%r11
7080 movq %r15,%r12
7081 andq $3,%r12
7082 movq %r15,%r13
7083 andq $-4,%r13
7084 movq %r9,%r14
7085 shrdq $2,%r9,%r15
7086 shrq $2,%r9
7087 addq %r13,%r10
7088 adcq %r14,%r11
7089 adcq $0,%r12
7090 addq %r15,%r10
7091 adcq %r9,%r11
7092 adcq $0,%r12
# Second block, at 16(%rdi): same absorb, multiply and reduce sequence.
7093 addq 16(%rdi),%r10
7094 adcq 8+16(%rdi),%r11
7095 adcq $1,%r12
7096 movq 0+0(%rbp),%rax
7097 movq %rax,%r15
7098 mulq %r10
7099 movq %rax,%r13
7100 movq %rdx,%r14
7101 movq 0+0(%rbp),%rax
7102 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007103 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007104 addq %rax,%r14
7105 adcq %rdx,%r15
7106 movq 8+0(%rbp),%rax
7107 movq %rax,%r9
7108 mulq %r10
7109 addq %rax,%r14
7110 adcq $0,%rdx
7111 movq %rdx,%r10
7112 movq 8+0(%rbp),%rax
7113 mulq %r11
7114 addq %rax,%r15
7115 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007116 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007117 addq %r10,%r15
7118 adcq %rdx,%r9
7119 movq %r13,%r10
7120 movq %r14,%r11
7121 movq %r15,%r12
7122 andq $3,%r12
7123 movq %r15,%r13
7124 andq $-4,%r13
7125 movq %r9,%r14
7126 shrdq $2,%r9,%r15
7127 shrq $2,%r9
7128 addq %r13,%r10
7129 adcq %r14,%r11
7130 adcq $0,%r12
7131 addq %r15,%r10
7132 adcq %r9,%r11
7133 adcq $0,%r12
7134
# Skip the 32 bytes just hashed and set the pass counters used by the tail
# code (rcx = 10, r8 = 0).  With more than 128 bytes left in %rbx, go to the
# length dispatch at the next 3: label; otherwise execution falls through
# into seal_avx2_tail_128.
7135 leaq 32(%rdi),%rdi
7136 movq $10,%rcx
7137 xorq %r8,%r8
7138 cmpq $128,%rbx
7139 ja 3f
7140
# seal_avx2_tail_128: runs a single ChaCha20 register set (rows in ymm0,
# ymm4, ymm8, ymm12; ymm3 is scratch) while hashing 16-byte blocks read
# from %rdi.
#
# Inputs read here: rcx and r8 (pass counters, see the loop below), %rdi
# (hash pointer), r12:r11:r10 (Poly1305 accumulator), the multiplier words at
# 0(%rbp) and 8(%rbp), and the saved state rows at 64/96/160(%rbp).
# Register use: r13, r14, r15, r9, rax and rdx are scratch for the multiply.
# Result: the keystream is left in ymm0, ymm4, ymm8, ymm12 in that order,
# %rdi is advanced past every block hashed, and control transfers to
# seal_avx2_short_loop.
#
# Loop shape: a pass that starts at label 1 hashes three blocks (48 bytes)
# and a pass that starts at label 2 hashes two (32 bytes); each pass does one
# column round and one diagonal round.  rcx counts the passes through label 1
# and r8 the extra passes through label 2.  The entry paths visible in this
# function use rcx/r8 = 8/2 or 10/0, which is ten passes either way.
7141seal_avx2_tail_128:
# Rows 0..2 from .chacha20_consts, 64(%rbp), 96(%rbp); row 3 is the saved
# counter row plus .avx2_inc, written back to 160(%rbp).
7142 vmovdqa .chacha20_consts(%rip),%ymm0
7143 vmovdqa 64(%rbp),%ymm4
7144 vmovdqa 96(%rbp),%ymm8
7145 vmovdqa .avx2_inc(%rip),%ymm12
7146 vpaddd 160(%rbp),%ymm12,%ymm12
7147 vmovdqa %ymm12,160(%rbp)
7148
71491:
# Extra block for passes entering at label 1: absorb 16 bytes at 0(%rdi)
# (with the 2^128 bit), multiply, reduce, then advance %rdi by 16.
7150 addq 0(%rdi),%r10
7151 adcq 8+0(%rdi),%r11
7152 adcq $1,%r12
7153 movq 0+0(%rbp),%rax
7154 movq %rax,%r15
7155 mulq %r10
7156 movq %rax,%r13
7157 movq %rdx,%r14
7158 movq 0+0(%rbp),%rax
7159 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007160 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007161 addq %rax,%r14
7162 adcq %rdx,%r15
7163 movq 8+0(%rbp),%rax
7164 movq %rax,%r9
7165 mulq %r10
7166 addq %rax,%r14
7167 adcq $0,%rdx
7168 movq %rdx,%r10
7169 movq 8+0(%rbp),%rax
7170 mulq %r11
7171 addq %rax,%r15
7172 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007173 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007174 addq %r10,%r15
7175 adcq %rdx,%r9
7176 movq %r13,%r10
7177 movq %r14,%r11
7178 movq %r15,%r12
7179 andq $3,%r12
7180 movq %r15,%r13
7181 andq $-4,%r13
7182 movq %r9,%r14
7183 shrdq $2,%r9,%r15
7184 shrq $2,%r9
7185 addq %r13,%r10
7186 adcq %r14,%r11
7187 adcq $0,%r12
7188 addq %r15,%r10
7189 adcq %r9,%r11
7190 adcq $0,%r12
7191
7192 leaq 16(%rdi),%rdi
71932:
# Column round: a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12;
# a += b; d ^= a; d <<<= 8; c += d; b ^= c; b <<<= 7.  The 16- and 8-bit
# rotates are byte shuffles, the others shift/shift/xor through ymm3.
7193 vpaddd %ymm4,%ymm0,%ymm0
7194 vpxor %ymm0,%ymm12,%ymm12
7195 vpshufb .rol16(%rip),%ymm12,%ymm12
7196 vpaddd %ymm12,%ymm8,%ymm8
7197 vpxor %ymm8,%ymm4,%ymm4
7198 vpsrld $20,%ymm4,%ymm3
7199 vpslld $12,%ymm4,%ymm4
7200 vpxor %ymm3,%ymm4,%ymm4
7201 vpaddd %ymm4,%ymm0,%ymm0
7202 vpxor %ymm0,%ymm12,%ymm12
7203 vpshufb .rol8(%rip),%ymm12,%ymm12
7204 vpaddd %ymm12,%ymm8,%ymm8
7205 vpxor %ymm8,%ymm4,%ymm4
7206 vpslld $7,%ymm4,%ymm3
7207 vpsrld $25,%ymm4,%ymm4
7208 vpxor %ymm3,%ymm4,%ymm4
# Rotate rows 3, 2, 1 by 12, 8, 4 bytes per lane: switch to diagonals.
7209 vpalignr $12,%ymm12,%ymm12,%ymm12
7210 vpalignr $8,%ymm8,%ymm8,%ymm8
7211 vpalignr $4,%ymm4,%ymm4,%ymm4
# Poly1305: absorb 16 bytes at 0(%rdi), multiply, reduce.
7212 addq 0(%rdi),%r10
7213 adcq 8+0(%rdi),%r11
7214 adcq $1,%r12
7215 movq 0+0(%rbp),%rax
7216 movq %rax,%r15
7217 mulq %r10
7218 movq %rax,%r13
7219 movq %rdx,%r14
7220 movq 0+0(%rbp),%rax
7221 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007222 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007223 addq %rax,%r14
7224 adcq %rdx,%r15
7225 movq 8+0(%rbp),%rax
7226 movq %rax,%r9
7227 mulq %r10
7228 addq %rax,%r14
7229 adcq $0,%rdx
7230 movq %rdx,%r10
7231 movq 8+0(%rbp),%rax
7232 mulq %r11
7233 addq %rax,%r15
7234 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007235 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007236 addq %r10,%r15
7237 adcq %rdx,%r9
7238 movq %r13,%r10
7239 movq %r14,%r11
7240 movq %r15,%r12
7241 andq $3,%r12
7242 movq %r15,%r13
7243 andq $-4,%r13
7244 movq %r9,%r14
7245 shrdq $2,%r9,%r15
7246 shrq $2,%r9
7247 addq %r13,%r10
7248 adcq %r14,%r11
7249 adcq $0,%r12
7250 addq %r15,%r10
7251 adcq %r9,%r11
7252 adcq $0,%r12
# Diagonal round: the same quarter-round on the rotated rows.
7253 vpaddd %ymm4,%ymm0,%ymm0
7254 vpxor %ymm0,%ymm12,%ymm12
7255 vpshufb .rol16(%rip),%ymm12,%ymm12
7256 vpaddd %ymm12,%ymm8,%ymm8
7257 vpxor %ymm8,%ymm4,%ymm4
7258 vpsrld $20,%ymm4,%ymm3
7259 vpslld $12,%ymm4,%ymm4
7260 vpxor %ymm3,%ymm4,%ymm4
7261 vpaddd %ymm4,%ymm0,%ymm0
7262 vpxor %ymm0,%ymm12,%ymm12
7263 vpshufb .rol8(%rip),%ymm12,%ymm12
7264 vpaddd %ymm12,%ymm8,%ymm8
7265 vpxor %ymm8,%ymm4,%ymm4
7266 vpslld $7,%ymm4,%ymm3
7267 vpsrld $25,%ymm4,%ymm4
7268 vpxor %ymm3,%ymm4,%ymm4
# Rotate rows 3, 2, 1 by 4, 8, 12 bytes per lane: back to column order.
7269 vpalignr $4,%ymm12,%ymm12,%ymm12
7270 vpalignr $8,%ymm8,%ymm8,%ymm8
7271 vpalignr $12,%ymm4,%ymm4,%ymm4
# Poly1305: absorb 16 bytes at 16(%rdi), multiply, reduce.
7272 addq 16(%rdi),%r10
7273 adcq 8+16(%rdi),%r11
7274 adcq $1,%r12
7275 movq 0+0(%rbp),%rax
7276 movq %rax,%r15
7277 mulq %r10
7278 movq %rax,%r13
7279 movq %rdx,%r14
7280 movq 0+0(%rbp),%rax
7281 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007282 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007283 addq %rax,%r14
7284 adcq %rdx,%r15
7285 movq 8+0(%rbp),%rax
7286 movq %rax,%r9
7287 mulq %r10
7288 addq %rax,%r14
7289 adcq $0,%rdx
7290 movq %rdx,%r10
7291 movq 8+0(%rbp),%rax
7292 mulq %r11
7293 addq %rax,%r15
7294 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007295 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007296 addq %r10,%r15
7297 adcq %rdx,%r9
7298 movq %r13,%r10
7299 movq %r14,%r11
7300 movq %r15,%r12
7301 andq $3,%r12
7302 movq %r15,%r13
7303 andq $-4,%r13
7304 movq %r9,%r14
7305 shrdq $2,%r9,%r15
7306 shrq $2,%r9
7307 addq %r13,%r10
7308 adcq %r14,%r11
7309 adcq $0,%r12
7310 addq %r15,%r10
7311 adcq %r9,%r11
7312 adcq $0,%r12
7313
# Skip the two blocks hashed at 0(%rdi) and 16(%rdi).  The signed jg/jge are
# intentional here: rcx keeps being decremented below zero during the r8
# passes, and r8 stops the loop once it goes negative.
7314 leaq 32(%rdi),%rdi
7315 decq %rcx
7316 jg 1b
7317 decq %r8
7318 jge 2b
# Add the pre-round state back in, then regroup lanes: ymm0 and ymm4 receive
# the low 128-bit lanes of rows 0/1 and 2/3, ymm8 (via ymm3) and ymm12 the
# high lanes, so the four registers hold 128 consecutive keystream bytes.
7319 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7320 vpaddd 64(%rbp),%ymm4,%ymm4
7321 vpaddd 96(%rbp),%ymm8,%ymm8
7322 vpaddd 160(%rbp),%ymm12,%ymm12
7323 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7324 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7325 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7326 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7327 vmovdqa %ymm3,%ymm8
7328
7329 jmp seal_avx2_short_loop
# Length dispatch, reached when more than 128 bytes remain in %rbx: above 256
# bytes continue at the next 3: label, otherwise fall through into
# seal_avx2_tail_256.
73313:
7332 cmpq $256,%rbx
7333 ja 3f
7334
7335seal_avx2_tail_256:
7336 vmovdqa .chacha20_consts(%rip),%ymm0
7337 vmovdqa 64(%rbp),%ymm4
7338 vmovdqa 96(%rbp),%ymm8
7339 vmovdqa %ymm0,%ymm1
7340 vmovdqa %ymm4,%ymm5
7341 vmovdqa %ymm8,%ymm9
7342 vmovdqa .avx2_inc(%rip),%ymm12
7343 vpaddd 160(%rbp),%ymm12,%ymm13
7344 vpaddd %ymm13,%ymm12,%ymm12
7345 vmovdqa %ymm12,160(%rbp)
7346 vmovdqa %ymm13,192(%rbp)
7347
73481:
7349 addq 0(%rdi),%r10
7350 adcq 8+0(%rdi),%r11
7351 adcq $1,%r12
7352 movq 0+0(%rbp),%rax
7353 movq %rax,%r15
7354 mulq %r10
7355 movq %rax,%r13
7356 movq %rdx,%r14
7357 movq 0+0(%rbp),%rax
7358 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007359 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007360 addq %rax,%r14
7361 adcq %rdx,%r15
7362 movq 8+0(%rbp),%rax
7363 movq %rax,%r9
7364 mulq %r10
7365 addq %rax,%r14
7366 adcq $0,%rdx
7367 movq %rdx,%r10
7368 movq 8+0(%rbp),%rax
7369 mulq %r11
7370 addq %rax,%r15
7371 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007372 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007373 addq %r10,%r15
7374 adcq %rdx,%r9
7375 movq %r13,%r10
7376 movq %r14,%r11
7377 movq %r15,%r12
7378 andq $3,%r12
7379 movq %r15,%r13
7380 andq $-4,%r13
7381 movq %r9,%r14
7382 shrdq $2,%r9,%r15
7383 shrq $2,%r9
7384 addq %r13,%r10
7385 adcq %r14,%r11
7386 adcq $0,%r12
7387 addq %r15,%r10
7388 adcq %r9,%r11
7389 adcq $0,%r12
7390
7391 leaq 16(%rdi),%rdi
73922:
7393 vpaddd %ymm4,%ymm0,%ymm0
7394 vpxor %ymm0,%ymm12,%ymm12
7395 vpshufb .rol16(%rip),%ymm12,%ymm12
7396 vpaddd %ymm12,%ymm8,%ymm8
7397 vpxor %ymm8,%ymm4,%ymm4
7398 vpsrld $20,%ymm4,%ymm3
7399 vpslld $12,%ymm4,%ymm4
7400 vpxor %ymm3,%ymm4,%ymm4
7401 vpaddd %ymm4,%ymm0,%ymm0
7402 vpxor %ymm0,%ymm12,%ymm12
7403 vpshufb .rol8(%rip),%ymm12,%ymm12
7404 vpaddd %ymm12,%ymm8,%ymm8
7405 vpxor %ymm8,%ymm4,%ymm4
7406 vpslld $7,%ymm4,%ymm3
7407 vpsrld $25,%ymm4,%ymm4
7408 vpxor %ymm3,%ymm4,%ymm4
7409 vpalignr $12,%ymm12,%ymm12,%ymm12
7410 vpalignr $8,%ymm8,%ymm8,%ymm8
7411 vpalignr $4,%ymm4,%ymm4,%ymm4
7412 vpaddd %ymm5,%ymm1,%ymm1
7413 vpxor %ymm1,%ymm13,%ymm13
7414 vpshufb .rol16(%rip),%ymm13,%ymm13
7415 vpaddd %ymm13,%ymm9,%ymm9
7416 vpxor %ymm9,%ymm5,%ymm5
7417 vpsrld $20,%ymm5,%ymm3
7418 vpslld $12,%ymm5,%ymm5
7419 vpxor %ymm3,%ymm5,%ymm5
7420 vpaddd %ymm5,%ymm1,%ymm1
7421 vpxor %ymm1,%ymm13,%ymm13
7422 vpshufb .rol8(%rip),%ymm13,%ymm13
7423 vpaddd %ymm13,%ymm9,%ymm9
7424 vpxor %ymm9,%ymm5,%ymm5
7425 vpslld $7,%ymm5,%ymm3
7426 vpsrld $25,%ymm5,%ymm5
7427 vpxor %ymm3,%ymm5,%ymm5
7428 vpalignr $12,%ymm13,%ymm13,%ymm13
7429 vpalignr $8,%ymm9,%ymm9,%ymm9
7430 vpalignr $4,%ymm5,%ymm5,%ymm5
7431 addq 0(%rdi),%r10
7432 adcq 8+0(%rdi),%r11
7433 adcq $1,%r12
7434 movq 0+0(%rbp),%rax
7435 movq %rax,%r15
7436 mulq %r10
7437 movq %rax,%r13
7438 movq %rdx,%r14
7439 movq 0+0(%rbp),%rax
7440 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007441 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007442 addq %rax,%r14
7443 adcq %rdx,%r15
7444 movq 8+0(%rbp),%rax
7445 movq %rax,%r9
7446 mulq %r10
7447 addq %rax,%r14
7448 adcq $0,%rdx
7449 movq %rdx,%r10
7450 movq 8+0(%rbp),%rax
7451 mulq %r11
7452 addq %rax,%r15
7453 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007454 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007455 addq %r10,%r15
7456 adcq %rdx,%r9
7457 movq %r13,%r10
7458 movq %r14,%r11
7459 movq %r15,%r12
7460 andq $3,%r12
7461 movq %r15,%r13
7462 andq $-4,%r13
7463 movq %r9,%r14
7464 shrdq $2,%r9,%r15
7465 shrq $2,%r9
7466 addq %r13,%r10
7467 adcq %r14,%r11
7468 adcq $0,%r12
7469 addq %r15,%r10
7470 adcq %r9,%r11
7471 adcq $0,%r12
7472 vpaddd %ymm4,%ymm0,%ymm0
7473 vpxor %ymm0,%ymm12,%ymm12
7474 vpshufb .rol16(%rip),%ymm12,%ymm12
7475 vpaddd %ymm12,%ymm8,%ymm8
7476 vpxor %ymm8,%ymm4,%ymm4
7477 vpsrld $20,%ymm4,%ymm3
7478 vpslld $12,%ymm4,%ymm4
7479 vpxor %ymm3,%ymm4,%ymm4
7480 vpaddd %ymm4,%ymm0,%ymm0
7481 vpxor %ymm0,%ymm12,%ymm12
7482 vpshufb .rol8(%rip),%ymm12,%ymm12
7483 vpaddd %ymm12,%ymm8,%ymm8
7484 vpxor %ymm8,%ymm4,%ymm4
7485 vpslld $7,%ymm4,%ymm3
7486 vpsrld $25,%ymm4,%ymm4
7487 vpxor %ymm3,%ymm4,%ymm4
7488 vpalignr $4,%ymm12,%ymm12,%ymm12
7489 vpalignr $8,%ymm8,%ymm8,%ymm8
7490 vpalignr $12,%ymm4,%ymm4,%ymm4
7491 vpaddd %ymm5,%ymm1,%ymm1
7492 vpxor %ymm1,%ymm13,%ymm13
7493 vpshufb .rol16(%rip),%ymm13,%ymm13
7494 vpaddd %ymm13,%ymm9,%ymm9
7495 vpxor %ymm9,%ymm5,%ymm5
7496 vpsrld $20,%ymm5,%ymm3
7497 vpslld $12,%ymm5,%ymm5
7498 vpxor %ymm3,%ymm5,%ymm5
7499 vpaddd %ymm5,%ymm1,%ymm1
7500 vpxor %ymm1,%ymm13,%ymm13
7501 vpshufb .rol8(%rip),%ymm13,%ymm13
7502 vpaddd %ymm13,%ymm9,%ymm9
7503 vpxor %ymm9,%ymm5,%ymm5
7504 vpslld $7,%ymm5,%ymm3
7505 vpsrld $25,%ymm5,%ymm5
7506 vpxor %ymm3,%ymm5,%ymm5
7507 vpalignr $4,%ymm13,%ymm13,%ymm13
7508 vpalignr $8,%ymm9,%ymm9,%ymm9
7509 vpalignr $12,%ymm5,%ymm5,%ymm5
7510 addq 16(%rdi),%r10
7511 adcq 8+16(%rdi),%r11
7512 adcq $1,%r12
7513 movq 0+0(%rbp),%rax
7514 movq %rax,%r15
7515 mulq %r10
7516 movq %rax,%r13
7517 movq %rdx,%r14
7518 movq 0+0(%rbp),%rax
7519 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007520 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007521 addq %rax,%r14
7522 adcq %rdx,%r15
7523 movq 8+0(%rbp),%rax
7524 movq %rax,%r9
7525 mulq %r10
7526 addq %rax,%r14
7527 adcq $0,%rdx
7528 movq %rdx,%r10
7529 movq 8+0(%rbp),%rax
7530 mulq %r11
7531 addq %rax,%r15
7532 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007533 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007534 addq %r10,%r15
7535 adcq %rdx,%r9
7536 movq %r13,%r10
7537 movq %r14,%r11
7538 movq %r15,%r12
7539 andq $3,%r12
7540 movq %r15,%r13
7541 andq $-4,%r13
7542 movq %r9,%r14
7543 shrdq $2,%r9,%r15
7544 shrq $2,%r9
7545 addq %r13,%r10
7546 adcq %r14,%r11
7547 adcq $0,%r12
7548 addq %r15,%r10
7549 adcq %r9,%r11
7550 adcq $0,%r12
7551
7552 leaq 32(%rdi),%rdi
7553 decq %rcx
7554 jg 1b
7555 decq %r8
7556 jge 2b
7557 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
7558 vpaddd 64(%rbp),%ymm5,%ymm5
7559 vpaddd 96(%rbp),%ymm9,%ymm9
7560 vpaddd 192(%rbp),%ymm13,%ymm13
7561 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7562 vpaddd 64(%rbp),%ymm4,%ymm4
7563 vpaddd 96(%rbp),%ymm8,%ymm8
7564 vpaddd 160(%rbp),%ymm12,%ymm12
7565 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
7566 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
7567 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
7568 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
7569 vpxor 0+0(%rsi),%ymm3,%ymm3
7570 vpxor 32+0(%rsi),%ymm1,%ymm1
7571 vpxor 64+0(%rsi),%ymm5,%ymm5
7572 vpxor 96+0(%rsi),%ymm9,%ymm9
7573 vmovdqu %ymm3,0+0(%rdi)
7574 vmovdqu %ymm1,32+0(%rdi)
7575 vmovdqu %ymm5,64+0(%rdi)
7576 vmovdqu %ymm9,96+0(%rdi)
7577 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7578 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7579 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7580 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7581 vmovdqa %ymm3,%ymm8
7582
7583 movq $128,%rcx
7584 leaq 128(%rsi),%rsi
7585 subq $128,%rbx
7586 jmp seal_avx2_hash
/* Size check: values of %rbx above 384 go to seal_avx2_tail_512; otherwise
   fall through into seal_avx2_tail_384.
   NOTE(review): %rbx looks like the remaining input byte count - confirm
   against the code that sets it (outside this view). */
75873:
7588 cmpq $384,%rbx
7589 ja seal_avx2_tail_512
7590
/* seal_avx2_tail_384
   Builds three ChaCha20 states (ymm0/4/8/12, ymm1/5/9/13, ymm2/6/10/14),
   runs double rounds on them interleaved with scalar Poly1305-style block
   updates over the data at (%rdi), then XORs 256 bytes from (%rsi) with the
   keystream of states 2 and 1 and stores them at (%rdi). The keystream of
   state 0 is left in ymm0/ymm4/ymm8/ymm12 and control goes to
   seal_avx2_hash with %rcx = 256.
   Inputs read here: 64(%rbp) and 96(%rbp) (state rows 1 and 2), 160(%rbp)
   (current counter row), 0(%rbp)/8(%rbp) (multiplier for the hash),
   %r10/%r11/%r12 (hash accumulator), %rcx and %r8 (loop counts, set outside
   this view - TODO confirm their meaning at the jump sites). */
7591seal_avx2_tail_384:
7592 vmovdqa .chacha20_consts(%rip),%ymm0
7593 vmovdqa 64(%rbp),%ymm4
7594 vmovdqa 96(%rbp),%ymm8
7595 vmovdqa %ymm0,%ymm1
7596 vmovdqa %ymm4,%ymm5
7597 vmovdqa %ymm8,%ymm9
7598 vmovdqa %ymm0,%ymm2
7599 vmovdqa %ymm4,%ymm6
7600 vmovdqa %ymm8,%ymm10
/* Counter rows: ymm14 = saved counter + .avx2_inc, ymm13 = ymm14 + inc,
   ymm12 = ymm13 + inc; all three are written back (160/192/224(%rbp)) so
   they can be re-added after the rounds. */
7601 vmovdqa .avx2_inc(%rip),%ymm12
7602 vpaddd 160(%rbp),%ymm12,%ymm14
7603 vpaddd %ymm14,%ymm12,%ymm13
7604 vpaddd %ymm13,%ymm12,%ymm12
7605 vmovdqa %ymm12,160(%rbp)
7606 vmovdqa %ymm13,192(%rbp)
7607 vmovdqa %ymm14,224(%rbp)
7608
/* Loop entry 1: one standalone hash block update, then fall into the rounds.
   Block update: add the 16 bytes at (%rdi) plus a 2^128 bit into the
   accumulator %r12:%r11:%r10, multiply by the 128-bit value held at
   0(%rbp)/8(%rbp) (product in %r9:%r15:%r14:%r13), keep the low 130 bits and
   add the part above bit 130 back in as 4*q + q (reduction modulo 2^130-5). */
76091:
7610 addq 0(%rdi),%r10
7611 adcq 8+0(%rdi),%r11
7612 adcq $1,%r12
7613 movq 0+0(%rbp),%rax
7614 movq %rax,%r15
7615 mulq %r10
7616 movq %rax,%r13
7617 movq %rdx,%r14
7618 movq 0+0(%rbp),%rax
7619 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007620 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007621 addq %rax,%r14
7622 adcq %rdx,%r15
7623 movq 8+0(%rbp),%rax
7624 movq %rax,%r9
7625 mulq %r10
7626 addq %rax,%r14
7627 adcq $0,%rdx
7628 movq %rdx,%r10
7629 movq 8+0(%rbp),%rax
7630 mulq %r11
7631 addq %rax,%r15
7632 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007633 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007634 addq %r10,%r15
7635 adcq %rdx,%r9
7636 movq %r13,%r10
7637 movq %r14,%r11
7638 movq %r15,%r12
7639 andq $3,%r12
7640 movq %r15,%r13
7641 andq $-4,%r13
7642 movq %r9,%r14
7643 shrdq $2,%r9,%r15
7644 shrq $2,%r9
7645 addq %r13,%r10
7646 adcq %r14,%r11
7647 adcq $0,%r12
7648 addq %r15,%r10
7649 adcq %r9,%r11
7650 adcq $0,%r12
7651
7652 leaq 16(%rdi),%rdi
/* Loop entry 2: one ChaCha20 double round on all three states.
   First half, state 0: a+=b, d^=a, d<<<16 (.rol16 shuffle), c+=d, b^=c,
   b<<<12, a+=b, d^=a, d<<<8 (.rol8 shuffle), c+=d, b^=c, b<<<7; ymm3 is the
   shift scratch register. The vpalignr trio then rotates rows d/c/b within
   each 128-bit lane for the second half. */
76532:
7654 vpaddd %ymm4,%ymm0,%ymm0
7655 vpxor %ymm0,%ymm12,%ymm12
7656 vpshufb .rol16(%rip),%ymm12,%ymm12
7657 vpaddd %ymm12,%ymm8,%ymm8
7658 vpxor %ymm8,%ymm4,%ymm4
7659 vpsrld $20,%ymm4,%ymm3
7660 vpslld $12,%ymm4,%ymm4
7661 vpxor %ymm3,%ymm4,%ymm4
7662 vpaddd %ymm4,%ymm0,%ymm0
7663 vpxor %ymm0,%ymm12,%ymm12
7664 vpshufb .rol8(%rip),%ymm12,%ymm12
7665 vpaddd %ymm12,%ymm8,%ymm8
7666 vpxor %ymm8,%ymm4,%ymm4
7667 vpslld $7,%ymm4,%ymm3
7668 vpsrld $25,%ymm4,%ymm4
7669 vpxor %ymm3,%ymm4,%ymm4
7670 vpalignr $12,%ymm12,%ymm12,%ymm12
7671 vpalignr $8,%ymm8,%ymm8,%ymm8
7672 vpalignr $4,%ymm4,%ymm4,%ymm4
/* First half, state 1 (ymm1/5/9/13). */
7673 vpaddd %ymm5,%ymm1,%ymm1
7674 vpxor %ymm1,%ymm13,%ymm13
7675 vpshufb .rol16(%rip),%ymm13,%ymm13
7676 vpaddd %ymm13,%ymm9,%ymm9
7677 vpxor %ymm9,%ymm5,%ymm5
7678 vpsrld $20,%ymm5,%ymm3
7679 vpslld $12,%ymm5,%ymm5
7680 vpxor %ymm3,%ymm5,%ymm5
7681 vpaddd %ymm5,%ymm1,%ymm1
7682 vpxor %ymm1,%ymm13,%ymm13
7683 vpshufb .rol8(%rip),%ymm13,%ymm13
7684 vpaddd %ymm13,%ymm9,%ymm9
7685 vpxor %ymm9,%ymm5,%ymm5
7686 vpslld $7,%ymm5,%ymm3
7687 vpsrld $25,%ymm5,%ymm5
7688 vpxor %ymm3,%ymm5,%ymm5
7689 vpalignr $12,%ymm13,%ymm13,%ymm13
7690 vpalignr $8,%ymm9,%ymm9,%ymm9
7691 vpalignr $4,%ymm5,%ymm5,%ymm5
/* Hash block update on the 16 bytes at 0(%rdi) (same add / multiply /
   reduce sequence as at loop entry 1). */
7692 addq 0(%rdi),%r10
7693 adcq 8+0(%rdi),%r11
7694 adcq $1,%r12
7695 movq 0+0(%rbp),%rax
7696 movq %rax,%r15
7697 mulq %r10
7698 movq %rax,%r13
7699 movq %rdx,%r14
7700 movq 0+0(%rbp),%rax
7701 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007702 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007703 addq %rax,%r14
7704 adcq %rdx,%r15
7705 movq 8+0(%rbp),%rax
7706 movq %rax,%r9
7707 mulq %r10
7708 addq %rax,%r14
7709 adcq $0,%rdx
7710 movq %rdx,%r10
7711 movq 8+0(%rbp),%rax
7712 mulq %r11
7713 addq %rax,%r15
7714 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007715 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007716 addq %r10,%r15
7717 adcq %rdx,%r9
7718 movq %r13,%r10
7719 movq %r14,%r11
7720 movq %r15,%r12
7721 andq $3,%r12
7722 movq %r15,%r13
7723 andq $-4,%r13
7724 movq %r9,%r14
7725 shrdq $2,%r9,%r15
7726 shrq $2,%r9
7727 addq %r13,%r10
7728 adcq %r14,%r11
7729 adcq $0,%r12
7730 addq %r15,%r10
7731 adcq %r9,%r11
7732 adcq $0,%r12
/* First half, state 2 (ymm2/6/10/14). */
7733 vpaddd %ymm6,%ymm2,%ymm2
7734 vpxor %ymm2,%ymm14,%ymm14
7735 vpshufb .rol16(%rip),%ymm14,%ymm14
7736 vpaddd %ymm14,%ymm10,%ymm10
7737 vpxor %ymm10,%ymm6,%ymm6
7738 vpsrld $20,%ymm6,%ymm3
7739 vpslld $12,%ymm6,%ymm6
7740 vpxor %ymm3,%ymm6,%ymm6
7741 vpaddd %ymm6,%ymm2,%ymm2
7742 vpxor %ymm2,%ymm14,%ymm14
7743 vpshufb .rol8(%rip),%ymm14,%ymm14
7744 vpaddd %ymm14,%ymm10,%ymm10
7745 vpxor %ymm10,%ymm6,%ymm6
7746 vpslld $7,%ymm6,%ymm3
7747 vpsrld $25,%ymm6,%ymm6
7748 vpxor %ymm3,%ymm6,%ymm6
7749 vpalignr $12,%ymm14,%ymm14,%ymm14
7750 vpalignr $8,%ymm10,%ymm10,%ymm10
7751 vpalignr $4,%ymm6,%ymm6,%ymm6
/* Second half, state 0: same quarter-round arithmetic on the rotated rows;
   the closing vpalignr trio ($4/$8/$12) undoes the earlier row rotation. */
7752 vpaddd %ymm4,%ymm0,%ymm0
7753 vpxor %ymm0,%ymm12,%ymm12
7754 vpshufb .rol16(%rip),%ymm12,%ymm12
7755 vpaddd %ymm12,%ymm8,%ymm8
7756 vpxor %ymm8,%ymm4,%ymm4
7757 vpsrld $20,%ymm4,%ymm3
7758 vpslld $12,%ymm4,%ymm4
7759 vpxor %ymm3,%ymm4,%ymm4
7760 vpaddd %ymm4,%ymm0,%ymm0
7761 vpxor %ymm0,%ymm12,%ymm12
7762 vpshufb .rol8(%rip),%ymm12,%ymm12
7763 vpaddd %ymm12,%ymm8,%ymm8
7764 vpxor %ymm8,%ymm4,%ymm4
7765 vpslld $7,%ymm4,%ymm3
7766 vpsrld $25,%ymm4,%ymm4
7767 vpxor %ymm3,%ymm4,%ymm4
7768 vpalignr $4,%ymm12,%ymm12,%ymm12
7769 vpalignr $8,%ymm8,%ymm8,%ymm8
7770 vpalignr $12,%ymm4,%ymm4,%ymm4
/* Hash block update on the 16 bytes at 16(%rdi). */
7771 addq 16(%rdi),%r10
7772 adcq 8+16(%rdi),%r11
7773 adcq $1,%r12
7774 movq 0+0(%rbp),%rax
7775 movq %rax,%r15
7776 mulq %r10
7777 movq %rax,%r13
7778 movq %rdx,%r14
7779 movq 0+0(%rbp),%rax
7780 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007781 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007782 addq %rax,%r14
7783 adcq %rdx,%r15
7784 movq 8+0(%rbp),%rax
7785 movq %rax,%r9
7786 mulq %r10
7787 addq %rax,%r14
7788 adcq $0,%rdx
7789 movq %rdx,%r10
7790 movq 8+0(%rbp),%rax
7791 mulq %r11
7792 addq %rax,%r15
7793 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007794 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007795 addq %r10,%r15
7796 adcq %rdx,%r9
7797 movq %r13,%r10
7798 movq %r14,%r11
7799 movq %r15,%r12
7800 andq $3,%r12
7801 movq %r15,%r13
7802 andq $-4,%r13
7803 movq %r9,%r14
7804 shrdq $2,%r9,%r15
7805 shrq $2,%r9
7806 addq %r13,%r10
7807 adcq %r14,%r11
7808 adcq $0,%r12
7809 addq %r15,%r10
7810 adcq %r9,%r11
7811 adcq $0,%r12
/* Second half, state 1. */
7812 vpaddd %ymm5,%ymm1,%ymm1
7813 vpxor %ymm1,%ymm13,%ymm13
7814 vpshufb .rol16(%rip),%ymm13,%ymm13
7815 vpaddd %ymm13,%ymm9,%ymm9
7816 vpxor %ymm9,%ymm5,%ymm5
7817 vpsrld $20,%ymm5,%ymm3
7818 vpslld $12,%ymm5,%ymm5
7819 vpxor %ymm3,%ymm5,%ymm5
7820 vpaddd %ymm5,%ymm1,%ymm1
7821 vpxor %ymm1,%ymm13,%ymm13
7822 vpshufb .rol8(%rip),%ymm13,%ymm13
7823 vpaddd %ymm13,%ymm9,%ymm9
7824 vpxor %ymm9,%ymm5,%ymm5
7825 vpslld $7,%ymm5,%ymm3
7826 vpsrld $25,%ymm5,%ymm5
7827 vpxor %ymm3,%ymm5,%ymm5
7828 vpalignr $4,%ymm13,%ymm13,%ymm13
7829 vpalignr $8,%ymm9,%ymm9,%ymm9
7830 vpalignr $12,%ymm5,%ymm5,%ymm5
/* Second half, state 2. */
7831 vpaddd %ymm6,%ymm2,%ymm2
7832 vpxor %ymm2,%ymm14,%ymm14
7833 vpshufb .rol16(%rip),%ymm14,%ymm14
7834 vpaddd %ymm14,%ymm10,%ymm10
7835 vpxor %ymm10,%ymm6,%ymm6
7836 vpsrld $20,%ymm6,%ymm3
7837 vpslld $12,%ymm6,%ymm6
7838 vpxor %ymm3,%ymm6,%ymm6
7839 vpaddd %ymm6,%ymm2,%ymm2
7840 vpxor %ymm2,%ymm14,%ymm14
7841 vpshufb .rol8(%rip),%ymm14,%ymm14
7842 vpaddd %ymm14,%ymm10,%ymm10
7843 vpxor %ymm10,%ymm6,%ymm6
7844 vpslld $7,%ymm6,%ymm3
7845 vpsrld $25,%ymm6,%ymm6
7846 vpxor %ymm3,%ymm6,%ymm6
7847 vpalignr $4,%ymm14,%ymm14,%ymm14
7848 vpalignr $8,%ymm10,%ymm10,%ymm10
7849 vpalignr $12,%ymm6,%ymm6,%ymm6
7850
/* 32 bytes were hashed inside the double round. Loop control: repeat from
   entry 1 while the decremented %rcx is still positive, then repeat from
   entry 2 while the decremented %r8 is still non-negative. */
7851 leaq 32(%rdi),%rdi
7852 decq %rcx
7853 jg 1b
7854 decq %r8
7855 jge 2b
/* Feed-forward: add each state's starting rows back in (constants,
   64(%rbp), 96(%rbp) and the counter row saved for that state). */
7856 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
7857 vpaddd 64(%rbp),%ymm6,%ymm6
7858 vpaddd 96(%rbp),%ymm10,%ymm10
7859 vpaddd 224(%rbp),%ymm14,%ymm14
7860 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
7861 vpaddd 64(%rbp),%ymm5,%ymm5
7862 vpaddd 96(%rbp),%ymm9,%ymm9
7863 vpaddd 192(%rbp),%ymm13,%ymm13
7864 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7865 vpaddd 64(%rbp),%ymm4,%ymm4
7866 vpaddd 96(%rbp),%ymm8,%ymm8
7867 vpaddd 160(%rbp),%ymm12,%ymm12
/* State 2 to output order (selector 0x02 pairs the low 128-bit lanes of the
   two sources, 0x13 pairs their high lanes), XOR with bytes 0..127 at
   (%rsi), store to (%rdi). */
7868 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
7869 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
7870 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
7871 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
7872 vpxor 0+0(%rsi),%ymm3,%ymm3
7873 vpxor 32+0(%rsi),%ymm2,%ymm2
7874 vpxor 64+0(%rsi),%ymm6,%ymm6
7875 vpxor 96+0(%rsi),%ymm10,%ymm10
7876 vmovdqu %ymm3,0+0(%rdi)
7877 vmovdqu %ymm2,32+0(%rdi)
7878 vmovdqu %ymm6,64+0(%rdi)
7879 vmovdqu %ymm10,96+0(%rdi)
/* State 1: same treatment for bytes 128..255. */
7880 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
7881 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
7882 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
7883 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
7884 vpxor 0+128(%rsi),%ymm3,%ymm3
7885 vpxor 32+128(%rsi),%ymm1,%ymm1
7886 vpxor 64+128(%rsi),%ymm5,%ymm5
7887 vpxor 96+128(%rsi),%ymm9,%ymm9
7888 vmovdqu %ymm3,0+128(%rdi)
7889 vmovdqu %ymm1,32+128(%rdi)
7890 vmovdqu %ymm5,64+128(%rdi)
7891 vmovdqu %ymm9,96+128(%rdi)
/* State 0 keystream is only rearranged, not consumed: afterwards it sits in
   ymm0, ymm4, ymm8, ymm12 (ymm3 is a temporary for the value moved to ymm8). */
7892 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7893 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7894 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7895 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7896 vmovdqa %ymm3,%ymm8
7897
/* Account for the 256 bytes just produced and continue at seal_avx2_hash.
   NOTE(review): %rcx = 256 is presumably the byte count that routine still
   has to hash - confirm there. */
7898 movq $256,%rcx
7899 leaq 256(%rsi),%rsi
7900 subq $256,%rbx
7901 jmp seal_avx2_hash
7902
/* seal_avx2_tail_512
   Builds four ChaCha20 states (ymm0/4/8/12, ymm1/5/9/13, ymm2/6/10/14,
   ymm3/7/11/15), runs double rounds on them software-interleaved with scalar
   Poly1305-style block updates over the data at (%rdi), then XORs 384 bytes
   from (%rsi) with the keystream of states 3, 2 and 1 and stores them at
   (%rdi). The keystream of state 0 is left in ymm0/ymm4/ymm8/ymm12 and
   control goes to seal_avx2_hash with %rcx = 384.
   All sixteen ymm registers hold state, so 128(%rbp) is used as a spill slot
   (for ymm8 inside the rounds, for ymm0 during the output shuffle).
   Inputs read here: 64(%rbp) and 96(%rbp) (state rows 1 and 2), 160(%rbp)
   (current counter row), 0(%rbp)/8(%rbp) (multiplier for the hash),
   %r10/%r11/%r12 (hash accumulator), %rcx and %r8 (loop counts, set outside
   this view - TODO confirm their meaning at the jump sites). */
7903seal_avx2_tail_512:
7904 vmovdqa .chacha20_consts(%rip),%ymm0
7905 vmovdqa 64(%rbp),%ymm4
7906 vmovdqa 96(%rbp),%ymm8
7907 vmovdqa %ymm0,%ymm1
7908 vmovdqa %ymm4,%ymm5
7909 vmovdqa %ymm8,%ymm9
7910 vmovdqa %ymm0,%ymm2
7911 vmovdqa %ymm4,%ymm6
7912 vmovdqa %ymm8,%ymm10
7913 vmovdqa %ymm0,%ymm3
7914 vmovdqa %ymm4,%ymm7
7915 vmovdqa %ymm8,%ymm11
/* Counter rows: ymm15 = saved counter + .avx2_inc, then ymm14, ymm13, ymm12
   each one further increment on; all four are written back
   (256/224/192/160(%rbp)) so they can be re-added after the rounds. */
7916 vmovdqa .avx2_inc(%rip),%ymm12
7917 vpaddd 160(%rbp),%ymm12,%ymm15
7918 vpaddd %ymm15,%ymm12,%ymm14
7919 vpaddd %ymm14,%ymm12,%ymm13
7920 vpaddd %ymm13,%ymm12,%ymm12
7921 vmovdqa %ymm15,256(%rbp)
7922 vmovdqa %ymm14,224(%rbp)
7923 vmovdqa %ymm13,192(%rbp)
7924 vmovdqa %ymm12,160(%rbp)
7925
/* Loop entry 1: one standalone hash block update, then fall into the rounds.
   Block update (mulx form, multiplicand taken from %rdx): add the 16 bytes
   at (%rdi) plus a 2^128 bit into the accumulator %r12:%r11:%r10, multiply
   by the 128-bit value held at 0(%rbp)/8(%rbp) (product in
   %r9:%r15:%r14:%r13), keep the low 130 bits and add the part above bit 130
   back in as 4*q + q (reduction modulo 2^130-5). */
79261:
7927 addq 0(%rdi),%r10
7928 adcq 8+0(%rdi),%r11
7929 adcq $1,%r12
7930 movq 0+0(%rbp),%rdx
7931 movq %rdx,%r15
7932 mulxq %r10,%r13,%r14
7933 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007934 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007935 addq %rax,%r14
7936 adcq %rdx,%r15
7937 movq 8+0(%rbp),%rdx
7938 mulxq %r10,%r10,%rax
7939 addq %r10,%r14
7940 mulxq %r11,%r11,%r9
7941 adcq %r11,%r15
7942 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08007943 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05007944 addq %rax,%r15
7945 adcq %rdx,%r9
7946 movq %r13,%r10
7947 movq %r14,%r11
7948 movq %r15,%r12
7949 andq $3,%r12
7950 movq %r15,%r13
7951 andq $-4,%r13
7952 movq %r9,%r14
7953 shrdq $2,%r9,%r15
7954 shrq $2,%r9
7955 addq %r13,%r10
7956 adcq %r14,%r11
7957 adcq $0,%r12
7958 addq %r15,%r10
7959 adcq %r9,%r11
7960 adcq $0,%r12
7961
7962 leaq 16(%rdi),%rdi
/* Loop entry 2: one ChaCha20 double round on all four states, processed
   step-by-step across the states (a+=b for all, d^=a for all, ...).
   ymm8 is parked in 128(%rbp) whenever a scratch register is needed: first
   to hold the .rol16 / .rol8 shuffle masks, then as the temporary for the
   shift-and-xor rotations by 12 and 7. Two hash block updates (16 bytes at
   0(%rdi) and at 16(%rdi)) are spread through the vector code in pieces. */
79632:
7964 vmovdqa %ymm8,128(%rbp)
7965 vmovdqa .rol16(%rip),%ymm8
7966 vpaddd %ymm7,%ymm3,%ymm3
7967 vpaddd %ymm6,%ymm2,%ymm2
7968 vpaddd %ymm5,%ymm1,%ymm1
7969 vpaddd %ymm4,%ymm0,%ymm0
7970 vpxor %ymm3,%ymm15,%ymm15
7971 vpxor %ymm2,%ymm14,%ymm14
7972 vpxor %ymm1,%ymm13,%ymm13
7973 vpxor %ymm0,%ymm12,%ymm12
7974 vpshufb %ymm8,%ymm15,%ymm15
7975 vpshufb %ymm8,%ymm14,%ymm14
7976 vpshufb %ymm8,%ymm13,%ymm13
7977 vpshufb %ymm8,%ymm12,%ymm12
7978 vmovdqa 128(%rbp),%ymm8
7979 vpaddd %ymm15,%ymm11,%ymm11
7980 vpaddd %ymm14,%ymm10,%ymm10
7981 vpaddd %ymm13,%ymm9,%ymm9
7982 vpaddd %ymm12,%ymm8,%ymm8
7983 vpxor %ymm11,%ymm7,%ymm7
/* hash: absorb the 16 bytes at 0(%rdi) plus the 2^128 bit */
7984 addq 0(%rdi),%r10
7985 adcq 8+0(%rdi),%r11
7986 adcq $1,%r12
7987 vpxor %ymm10,%ymm6,%ymm6
7988 vpxor %ymm9,%ymm5,%ymm5
7989 vpxor %ymm8,%ymm4,%ymm4
7990 vmovdqa %ymm8,128(%rbp)
7991 vpsrld $20,%ymm7,%ymm8
7992 vpslld $32-20,%ymm7,%ymm7
7993 vpxor %ymm8,%ymm7,%ymm7
7994 vpsrld $20,%ymm6,%ymm8
7995 vpslld $32-20,%ymm6,%ymm6
7996 vpxor %ymm8,%ymm6,%ymm6
7997 vpsrld $20,%ymm5,%ymm8
7998 vpslld $32-20,%ymm5,%ymm5
7999 vpxor %ymm8,%ymm5,%ymm5
8000 vpsrld $20,%ymm4,%ymm8
8001 vpslld $32-20,%ymm4,%ymm4
8002 vpxor %ymm8,%ymm4,%ymm4
8003 vmovdqa .rol8(%rip),%ymm8
8004 vpaddd %ymm7,%ymm3,%ymm3
8005 vpaddd %ymm6,%ymm2,%ymm2
8006 vpaddd %ymm5,%ymm1,%ymm1
/* hash: partial products with the word at 0(%rbp) */
8007 movq 0+0(%rbp),%rdx
8008 movq %rdx,%r15
8009 mulxq %r10,%r13,%r14
8010 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008011 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008012 addq %rax,%r14
8013 adcq %rdx,%r15
8014 vpaddd %ymm4,%ymm0,%ymm0
8015 vpxor %ymm3,%ymm15,%ymm15
8016 vpxor %ymm2,%ymm14,%ymm14
8017 vpxor %ymm1,%ymm13,%ymm13
8018 vpxor %ymm0,%ymm12,%ymm12
8019 vpshufb %ymm8,%ymm15,%ymm15
8020 vpshufb %ymm8,%ymm14,%ymm14
8021 vpshufb %ymm8,%ymm13,%ymm13
8022 vpshufb %ymm8,%ymm12,%ymm12
8023 vmovdqa 128(%rbp),%ymm8
8024 vpaddd %ymm15,%ymm11,%ymm11
8025 vpaddd %ymm14,%ymm10,%ymm10
8026 vpaddd %ymm13,%ymm9,%ymm9
8027 vpaddd %ymm12,%ymm8,%ymm8
8028 vpxor %ymm11,%ymm7,%ymm7
8029 vpxor %ymm10,%ymm6,%ymm6
8030 vpxor %ymm9,%ymm5,%ymm5
8031 vpxor %ymm8,%ymm4,%ymm4
8032 vmovdqa %ymm8,128(%rbp)
8033 vpsrld $25,%ymm7,%ymm8
/* hash: partial products with the word at 8(%rbp) */
8034 movq 8+0(%rbp),%rdx
8035 mulxq %r10,%r10,%rax
8036 addq %r10,%r14
8037 mulxq %r11,%r11,%r9
8038 adcq %r11,%r15
8039 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08008040 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05008041 vpslld $32-25,%ymm7,%ymm7
8042 vpxor %ymm8,%ymm7,%ymm7
8043 vpsrld $25,%ymm6,%ymm8
8044 vpslld $32-25,%ymm6,%ymm6
8045 vpxor %ymm8,%ymm6,%ymm6
8046 vpsrld $25,%ymm5,%ymm8
8047 vpslld $32-25,%ymm5,%ymm5
8048 vpxor %ymm8,%ymm5,%ymm5
8049 vpsrld $25,%ymm4,%ymm8
8050 vpslld $32-25,%ymm4,%ymm4
8051 vpxor %ymm8,%ymm4,%ymm4
8052 vmovdqa 128(%rbp),%ymm8
/* Rotate rows b/c/d of every state within each 128-bit lane ($4/$8/$12)
   ahead of the second half of the double round. */
8053 vpalignr $4,%ymm7,%ymm7,%ymm7
8054 vpalignr $8,%ymm11,%ymm11,%ymm11
8055 vpalignr $12,%ymm15,%ymm15,%ymm15
8056 vpalignr $4,%ymm6,%ymm6,%ymm6
8057 vpalignr $8,%ymm10,%ymm10,%ymm10
8058 vpalignr $12,%ymm14,%ymm14,%ymm14
8059 vpalignr $4,%ymm5,%ymm5,%ymm5
8060 vpalignr $8,%ymm9,%ymm9,%ymm9
8061 addq %rax,%r15
8062 adcq %rdx,%r9
8063 vpalignr $12,%ymm13,%ymm13,%ymm13
8064 vpalignr $4,%ymm4,%ymm4,%ymm4
8065 vpalignr $8,%ymm8,%ymm8,%ymm8
8066 vpalignr $12,%ymm12,%ymm12,%ymm12
/* Second half of the double round begins (same step order as the first). */
8067 vmovdqa %ymm8,128(%rbp)
8068 vmovdqa .rol16(%rip),%ymm8
8069 vpaddd %ymm7,%ymm3,%ymm3
8070 vpaddd %ymm6,%ymm2,%ymm2
8071 vpaddd %ymm5,%ymm1,%ymm1
8072 vpaddd %ymm4,%ymm0,%ymm0
8073 vpxor %ymm3,%ymm15,%ymm15
8074 vpxor %ymm2,%ymm14,%ymm14
8075 vpxor %ymm1,%ymm13,%ymm13
8076 vpxor %ymm0,%ymm12,%ymm12
8077 vpshufb %ymm8,%ymm15,%ymm15
8078 vpshufb %ymm8,%ymm14,%ymm14
8079 vpshufb %ymm8,%ymm13,%ymm13
8080 vpshufb %ymm8,%ymm12,%ymm12
8081 vmovdqa 128(%rbp),%ymm8
8082 vpaddd %ymm15,%ymm11,%ymm11
/* hash: reduce the product back into %r12:%r11:%r10 (low 130 bits + 5*q) */
8083 movq %r13,%r10
8084 movq %r14,%r11
8085 movq %r15,%r12
8086 andq $3,%r12
8087 movq %r15,%r13
8088 andq $-4,%r13
8089 movq %r9,%r14
8090 shrdq $2,%r9,%r15
8091 shrq $2,%r9
8092 addq %r13,%r10
8093 adcq %r14,%r11
8094 adcq $0,%r12
8095 addq %r15,%r10
8096 adcq %r9,%r11
8097 adcq $0,%r12
8098 vpaddd %ymm14,%ymm10,%ymm10
8099 vpaddd %ymm13,%ymm9,%ymm9
8100 vpaddd %ymm12,%ymm8,%ymm8
8101 vpxor %ymm11,%ymm7,%ymm7
8102 vpxor %ymm10,%ymm6,%ymm6
8103 vpxor %ymm9,%ymm5,%ymm5
8104 vpxor %ymm8,%ymm4,%ymm4
8105 vmovdqa %ymm8,128(%rbp)
8106 vpsrld $20,%ymm7,%ymm8
8107 vpslld $32-20,%ymm7,%ymm7
8108 vpxor %ymm8,%ymm7,%ymm7
8109 vpsrld $20,%ymm6,%ymm8
8110 vpslld $32-20,%ymm6,%ymm6
8111 vpxor %ymm8,%ymm6,%ymm6
8112 vpsrld $20,%ymm5,%ymm8
8113 vpslld $32-20,%ymm5,%ymm5
8114 vpxor %ymm8,%ymm5,%ymm5
8115 vpsrld $20,%ymm4,%ymm8
8116 vpslld $32-20,%ymm4,%ymm4
8117 vpxor %ymm8,%ymm4,%ymm4
/* hash: absorb the 16 bytes at 16(%rdi) plus the 2^128 bit */
8118 addq 16(%rdi),%r10
8119 adcq 8+16(%rdi),%r11
8120 adcq $1,%r12
8121 vmovdqa .rol8(%rip),%ymm8
8122 vpaddd %ymm7,%ymm3,%ymm3
8123 vpaddd %ymm6,%ymm2,%ymm2
8124 vpaddd %ymm5,%ymm1,%ymm1
8125 vpaddd %ymm4,%ymm0,%ymm0
8126 vpxor %ymm3,%ymm15,%ymm15
8127 vpxor %ymm2,%ymm14,%ymm14
8128 vpxor %ymm1,%ymm13,%ymm13
8129 vpxor %ymm0,%ymm12,%ymm12
8130 vpshufb %ymm8,%ymm15,%ymm15
8131 vpshufb %ymm8,%ymm14,%ymm14
8132 vpshufb %ymm8,%ymm13,%ymm13
8133 vpshufb %ymm8,%ymm12,%ymm12
8134 vmovdqa 128(%rbp),%ymm8
8135 vpaddd %ymm15,%ymm11,%ymm11
8136 vpaddd %ymm14,%ymm10,%ymm10
8137 vpaddd %ymm13,%ymm9,%ymm9
8138 vpaddd %ymm12,%ymm8,%ymm8
8139 vpxor %ymm11,%ymm7,%ymm7
8140 vpxor %ymm10,%ymm6,%ymm6
/* hash: partial products with the word at 0(%rbp) */
8141 movq 0+0(%rbp),%rdx
8142 movq %rdx,%r15
8143 mulxq %r10,%r13,%r14
8144 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008145 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008146 addq %rax,%r14
8147 adcq %rdx,%r15
8148 vpxor %ymm9,%ymm5,%ymm5
8149 vpxor %ymm8,%ymm4,%ymm4
8150 vmovdqa %ymm8,128(%rbp)
8151 vpsrld $25,%ymm7,%ymm8
8152 vpslld $32-25,%ymm7,%ymm7
8153 vpxor %ymm8,%ymm7,%ymm7
8154 vpsrld $25,%ymm6,%ymm8
8155 vpslld $32-25,%ymm6,%ymm6
8156 vpxor %ymm8,%ymm6,%ymm6
8157 vpsrld $25,%ymm5,%ymm8
8158 vpslld $32-25,%ymm5,%ymm5
8159 vpxor %ymm8,%ymm5,%ymm5
8160 vpsrld $25,%ymm4,%ymm8
8161 vpslld $32-25,%ymm4,%ymm4
8162 vpxor %ymm8,%ymm4,%ymm4
8163 vmovdqa 128(%rbp),%ymm8
/* Undo the earlier row rotation ($12/$8/$4 on rows b/c/d). */
8164 vpalignr $12,%ymm7,%ymm7,%ymm7
8165 vpalignr $8,%ymm11,%ymm11,%ymm11
8166 vpalignr $4,%ymm15,%ymm15,%ymm15
8167 vpalignr $12,%ymm6,%ymm6,%ymm6
/* hash: partial products with the word at 8(%rbp) */
8168 movq 8+0(%rbp),%rdx
8169 mulxq %r10,%r10,%rax
8170 addq %r10,%r14
8171 mulxq %r11,%r11,%r9
8172 adcq %r11,%r15
8173 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08008174 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05008175 vpalignr $8,%ymm10,%ymm10,%ymm10
8176 vpalignr $4,%ymm14,%ymm14,%ymm14
8177 vpalignr $12,%ymm5,%ymm5,%ymm5
8178 vpalignr $8,%ymm9,%ymm9,%ymm9
8179 vpalignr $4,%ymm13,%ymm13,%ymm13
8180 vpalignr $12,%ymm4,%ymm4,%ymm4
8181 vpalignr $8,%ymm8,%ymm8,%ymm8
8182 vpalignr $4,%ymm12,%ymm12,%ymm12
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195 addq %rax,%r15
8196 adcq %rdx,%r9
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
/* hash: reduce the product back into %r12:%r11:%r10 (low 130 bits + 5*q) */
8217 movq %r13,%r10
8218 movq %r14,%r11
8219 movq %r15,%r12
8220 andq $3,%r12
8221 movq %r15,%r13
8222 andq $-4,%r13
8223 movq %r9,%r14
8224 shrdq $2,%r9,%r15
8225 shrq $2,%r9
8226 addq %r13,%r10
8227 adcq %r14,%r11
8228 adcq $0,%r12
8229 addq %r15,%r10
8230 adcq %r9,%r11
8231 adcq $0,%r12
8232
/* 32 bytes were hashed inside the double round. Loop control: repeat from
   entry 1 while the decremented %rcx is still positive, then repeat from
   entry 2 while the decremented %r8 is still non-negative. */
8233 leaq 32(%rdi),%rdi
8234 decq %rcx
8235 jg 1b
8236 decq %r8
8237 jge 2b
/* Feed-forward: add each state's starting rows back in (constants,
   64(%rbp), 96(%rbp) and the counter row saved for that state). */
8238 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
8239 vpaddd 64(%rbp),%ymm7,%ymm7
8240 vpaddd 96(%rbp),%ymm11,%ymm11
8241 vpaddd 256(%rbp),%ymm15,%ymm15
8242 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
8243 vpaddd 64(%rbp),%ymm6,%ymm6
8244 vpaddd 96(%rbp),%ymm10,%ymm10
8245 vpaddd 224(%rbp),%ymm14,%ymm14
8246 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
8247 vpaddd 64(%rbp),%ymm5,%ymm5
8248 vpaddd 96(%rbp),%ymm9,%ymm9
8249 vpaddd 192(%rbp),%ymm13,%ymm13
8250 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
8251 vpaddd 64(%rbp),%ymm4,%ymm4
8252 vpaddd 96(%rbp),%ymm8,%ymm8
8253 vpaddd 160(%rbp),%ymm12,%ymm12
8254
/* State 3 to output order (selector 0x02 pairs the low 128-bit lanes of the
   two sources, 0x13 pairs their high lanes), XOR with bytes 0..127 at
   (%rsi), store to (%rdi). ymm0 is spilled to 128(%rbp) so it can serve as
   the temporary here. */
8255 vmovdqa %ymm0,128(%rbp)
8256 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
8257 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
8258 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
8259 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
8260 vpxor 0+0(%rsi),%ymm0,%ymm0
8261 vpxor 32+0(%rsi),%ymm3,%ymm3
8262 vpxor 64+0(%rsi),%ymm7,%ymm7
8263 vpxor 96+0(%rsi),%ymm11,%ymm11
8264 vmovdqu %ymm0,0+0(%rdi)
8265 vmovdqu %ymm3,32+0(%rdi)
8266 vmovdqu %ymm7,64+0(%rdi)
8267 vmovdqu %ymm11,96+0(%rdi)
8268
/* Restore ymm0, then state 2 for bytes 128..255 (ymm3 is free as a
   temporary from here on). */
8269 vmovdqa 128(%rbp),%ymm0
8270 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
8271 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
8272 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
8273 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
8274 vpxor 0+128(%rsi),%ymm3,%ymm3
8275 vpxor 32+128(%rsi),%ymm2,%ymm2
8276 vpxor 64+128(%rsi),%ymm6,%ymm6
8277 vpxor 96+128(%rsi),%ymm10,%ymm10
8278 vmovdqu %ymm3,0+128(%rdi)
8279 vmovdqu %ymm2,32+128(%rdi)
8280 vmovdqu %ymm6,64+128(%rdi)
8281 vmovdqu %ymm10,96+128(%rdi)
/* State 1 for bytes 256..383. */
8282 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
8283 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
8284 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
8285 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
8286 vpxor 0+256(%rsi),%ymm3,%ymm3
8287 vpxor 32+256(%rsi),%ymm1,%ymm1
8288 vpxor 64+256(%rsi),%ymm5,%ymm5
8289 vpxor 96+256(%rsi),%ymm9,%ymm9
8290 vmovdqu %ymm3,0+256(%rdi)
8291 vmovdqu %ymm1,32+256(%rdi)
8292 vmovdqu %ymm5,64+256(%rdi)
8293 vmovdqu %ymm9,96+256(%rdi)
/* State 0 keystream is only rearranged, not consumed: afterwards it sits in
   ymm0, ymm4, ymm8, ymm12 (ymm3 is a temporary for the value moved to ymm8). */
8294 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
8295 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
8296 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
8297 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
8298 vmovdqa %ymm3,%ymm8
8299
/* Account for the 384 bytes just produced and continue at seal_avx2_hash.
   NOTE(review): %rcx = 384 is presumably the byte count that routine still
   has to hash - confirm there. */
8300 movq $384,%rcx
8301 leaq 384(%rsi),%rsi
8302 subq $384,%rbx
8303 jmp seal_avx2_hash
8304
/* seal_avx2_320
   Short-input path: generates three ChaCha20 states' worth of keystream with
   no hashing interleaved. On entry ymm0/ymm4/ymm8/ymm12 hold the initial
   state rows (prepared by code outside this view). After ten double rounds
   and the feed-forward, the first 32 keystream bytes are ANDed with .clamp
   and stored at 0(%rbp); the remaining 320 bytes are left, in output order,
   in ymm0, ymm4, ymm8, ymm12, ymm1, ymm5, ymm9, ymm13, ymm2, ymm6 and
   control goes to seal_avx2_short.
   NOTE(review): the 32 bytes at 0(%rbp) look like the one-time Poly1305 key
   (.clamp masks only the first 16 bytes) - confirm against the consumer. */
8305seal_avx2_320:
8306 vmovdqa %ymm0,%ymm1
8307 vmovdqa %ymm0,%ymm2
8308 vmovdqa %ymm4,%ymm5
8309 vmovdqa %ymm4,%ymm6
8310 vmovdqa %ymm8,%ymm9
8311 vmovdqa %ymm8,%ymm10
/* Counter rows for states 1 and 2: each one .avx2_inc step past the previous. */
8312 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
8313 vpaddd .avx2_inc(%rip),%ymm13,%ymm14
/* Keep the starting rows for the feed-forward: rows 1 and 2 in ymm7/ymm11
   (unused by the three-state rounds), the counter rows on the stack frame. */
8314 vmovdqa %ymm4,%ymm7
8315 vmovdqa %ymm8,%ymm11
8316 vmovdqa %ymm12,160(%rbp)
8317 vmovdqa %ymm13,192(%rbp)
8318 vmovdqa %ymm14,224(%rbp)
/* %r10 counts the ten double rounds. */
8319 movq $10,%r10
/* One double round per iteration.
   First half, state 0: a+=b, d^=a, d<<<16 (.rol16 shuffle), c+=d, b^=c,
   b<<<12, a+=b, d^=a, d<<<8 (.rol8 shuffle), c+=d, b^=c, b<<<7; ymm3 is the
   shift scratch register. The vpalignr trio then rotates rows d/c/b within
   each 128-bit lane for the second half. */
83201:
8321 vpaddd %ymm4,%ymm0,%ymm0
8322 vpxor %ymm0,%ymm12,%ymm12
8323 vpshufb .rol16(%rip),%ymm12,%ymm12
8324 vpaddd %ymm12,%ymm8,%ymm8
8325 vpxor %ymm8,%ymm4,%ymm4
8326 vpsrld $20,%ymm4,%ymm3
8327 vpslld $12,%ymm4,%ymm4
8328 vpxor %ymm3,%ymm4,%ymm4
8329 vpaddd %ymm4,%ymm0,%ymm0
8330 vpxor %ymm0,%ymm12,%ymm12
8331 vpshufb .rol8(%rip),%ymm12,%ymm12
8332 vpaddd %ymm12,%ymm8,%ymm8
8333 vpxor %ymm8,%ymm4,%ymm4
8334 vpslld $7,%ymm4,%ymm3
8335 vpsrld $25,%ymm4,%ymm4
8336 vpxor %ymm3,%ymm4,%ymm4
8337 vpalignr $12,%ymm12,%ymm12,%ymm12
8338 vpalignr $8,%ymm8,%ymm8,%ymm8
8339 vpalignr $4,%ymm4,%ymm4,%ymm4
/* First half, state 1 (ymm1/5/9/13). */
8340 vpaddd %ymm5,%ymm1,%ymm1
8341 vpxor %ymm1,%ymm13,%ymm13
8342 vpshufb .rol16(%rip),%ymm13,%ymm13
8343 vpaddd %ymm13,%ymm9,%ymm9
8344 vpxor %ymm9,%ymm5,%ymm5
8345 vpsrld $20,%ymm5,%ymm3
8346 vpslld $12,%ymm5,%ymm5
8347 vpxor %ymm3,%ymm5,%ymm5
8348 vpaddd %ymm5,%ymm1,%ymm1
8349 vpxor %ymm1,%ymm13,%ymm13
8350 vpshufb .rol8(%rip),%ymm13,%ymm13
8351 vpaddd %ymm13,%ymm9,%ymm9
8352 vpxor %ymm9,%ymm5,%ymm5
8353 vpslld $7,%ymm5,%ymm3
8354 vpsrld $25,%ymm5,%ymm5
8355 vpxor %ymm3,%ymm5,%ymm5
8356 vpalignr $12,%ymm13,%ymm13,%ymm13
8357 vpalignr $8,%ymm9,%ymm9,%ymm9
8358 vpalignr $4,%ymm5,%ymm5,%ymm5
/* First half, state 2 (ymm2/6/10/14). */
8359 vpaddd %ymm6,%ymm2,%ymm2
8360 vpxor %ymm2,%ymm14,%ymm14
8361 vpshufb .rol16(%rip),%ymm14,%ymm14
8362 vpaddd %ymm14,%ymm10,%ymm10
8363 vpxor %ymm10,%ymm6,%ymm6
8364 vpsrld $20,%ymm6,%ymm3
8365 vpslld $12,%ymm6,%ymm6
8366 vpxor %ymm3,%ymm6,%ymm6
8367 vpaddd %ymm6,%ymm2,%ymm2
8368 vpxor %ymm2,%ymm14,%ymm14
8369 vpshufb .rol8(%rip),%ymm14,%ymm14
8370 vpaddd %ymm14,%ymm10,%ymm10
8371 vpxor %ymm10,%ymm6,%ymm6
8372 vpslld $7,%ymm6,%ymm3
8373 vpsrld $25,%ymm6,%ymm6
8374 vpxor %ymm3,%ymm6,%ymm6
8375 vpalignr $12,%ymm14,%ymm14,%ymm14
8376 vpalignr $8,%ymm10,%ymm10,%ymm10
8377 vpalignr $4,%ymm6,%ymm6,%ymm6
/* Second half, state 0: same quarter-round arithmetic on the rotated rows;
   the closing vpalignr trio ($4/$8/$12) undoes the earlier row rotation. */
8378 vpaddd %ymm4,%ymm0,%ymm0
8379 vpxor %ymm0,%ymm12,%ymm12
8380 vpshufb .rol16(%rip),%ymm12,%ymm12
8381 vpaddd %ymm12,%ymm8,%ymm8
8382 vpxor %ymm8,%ymm4,%ymm4
8383 vpsrld $20,%ymm4,%ymm3
8384 vpslld $12,%ymm4,%ymm4
8385 vpxor %ymm3,%ymm4,%ymm4
8386 vpaddd %ymm4,%ymm0,%ymm0
8387 vpxor %ymm0,%ymm12,%ymm12
8388 vpshufb .rol8(%rip),%ymm12,%ymm12
8389 vpaddd %ymm12,%ymm8,%ymm8
8390 vpxor %ymm8,%ymm4,%ymm4
8391 vpslld $7,%ymm4,%ymm3
8392 vpsrld $25,%ymm4,%ymm4
8393 vpxor %ymm3,%ymm4,%ymm4
8394 vpalignr $4,%ymm12,%ymm12,%ymm12
8395 vpalignr $8,%ymm8,%ymm8,%ymm8
8396 vpalignr $12,%ymm4,%ymm4,%ymm4
/* Second half, state 1. */
8397 vpaddd %ymm5,%ymm1,%ymm1
8398 vpxor %ymm1,%ymm13,%ymm13
8399 vpshufb .rol16(%rip),%ymm13,%ymm13
8400 vpaddd %ymm13,%ymm9,%ymm9
8401 vpxor %ymm9,%ymm5,%ymm5
8402 vpsrld $20,%ymm5,%ymm3
8403 vpslld $12,%ymm5,%ymm5
8404 vpxor %ymm3,%ymm5,%ymm5
8405 vpaddd %ymm5,%ymm1,%ymm1
8406 vpxor %ymm1,%ymm13,%ymm13
8407 vpshufb .rol8(%rip),%ymm13,%ymm13
8408 vpaddd %ymm13,%ymm9,%ymm9
8409 vpxor %ymm9,%ymm5,%ymm5
8410 vpslld $7,%ymm5,%ymm3
8411 vpsrld $25,%ymm5,%ymm5
8412 vpxor %ymm3,%ymm5,%ymm5
8413 vpalignr $4,%ymm13,%ymm13,%ymm13
8414 vpalignr $8,%ymm9,%ymm9,%ymm9
8415 vpalignr $12,%ymm5,%ymm5,%ymm5
/* Second half, state 2. */
8416 vpaddd %ymm6,%ymm2,%ymm2
8417 vpxor %ymm2,%ymm14,%ymm14
8418 vpshufb .rol16(%rip),%ymm14,%ymm14
8419 vpaddd %ymm14,%ymm10,%ymm10
8420 vpxor %ymm10,%ymm6,%ymm6
8421 vpsrld $20,%ymm6,%ymm3
8422 vpslld $12,%ymm6,%ymm6
8423 vpxor %ymm3,%ymm6,%ymm6
8424 vpaddd %ymm6,%ymm2,%ymm2
8425 vpxor %ymm2,%ymm14,%ymm14
8426 vpshufb .rol8(%rip),%ymm14,%ymm14
8427 vpaddd %ymm14,%ymm10,%ymm10
8428 vpxor %ymm10,%ymm6,%ymm6
8429 vpslld $7,%ymm6,%ymm3
8430 vpsrld $25,%ymm6,%ymm6
8431 vpxor %ymm3,%ymm6,%ymm6
8432 vpalignr $4,%ymm14,%ymm14,%ymm14
8433 vpalignr $8,%ymm10,%ymm10,%ymm10
8434 vpalignr $12,%ymm6,%ymm6,%ymm6
8435
8436 decq %r10
8437 jne 1b
/* Feed-forward: add the starting rows back in (constants from memory, rows
   1 and 2 from ymm7/ymm11, counter rows from 160/192/224(%rbp)). */
8438 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
8439 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
8440 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
8441 vpaddd %ymm7,%ymm4,%ymm4
8442 vpaddd %ymm7,%ymm5,%ymm5
8443 vpaddd %ymm7,%ymm6,%ymm6
8444 vpaddd %ymm11,%ymm8,%ymm8
8445 vpaddd %ymm11,%ymm9,%ymm9
8446 vpaddd %ymm11,%ymm10,%ymm10
8447 vpaddd 160(%rbp),%ymm12,%ymm12
8448 vpaddd 192(%rbp),%ymm13,%ymm13
8449 vpaddd 224(%rbp),%ymm14,%ymm14
/* Low 128-bit lanes of rows 0 and 1 of state 0 = first 32 keystream bytes;
   mask them with .clamp and store at 0(%rbp). */
8450 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
8451
8452 vpand .clamp(%rip),%ymm3,%ymm3
8453 vmovdqa %ymm3,0(%rbp)
8454
/* Rearrange the rest into 32-byte output-order chunks (selector 0x02 pairs
   the low lanes of the two sources, 0x13 the high lanes). The low lanes of
   state 0 rows 2 and 3 are not used. */
8455 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
8456 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
8457 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
8458 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
8459 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
8460 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
8461 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
8462 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
8463 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
8464 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
8465 jmp seal_avx2_short
8466
/*
 * seal_avx2_192: run two 4-register state sets, (ymm0,ymm4,ymm8,ymm12) and
 * (ymm1,ymm5,ymm9,ymm13), through 10 iterations of the loop at 1: below, then
 * add the saved starting values back in and regroup the 128-bit lanes.
 * On entry ymm0/ymm4/ymm8/ymm12 hold the starting state; ymm3 is scratch.
 * Execution falls through into the label that follows this section.
 */
8467seal_avx2_192:
/* Duplicate the first three rows into the second set (ymm1/5/9) and keep
   untouched copies in ymm2/ymm6/ymm10 for the final additions. */
8468 vmovdqa %ymm0,%ymm1
8469 vmovdqa %ymm0,%ymm2
8470 vmovdqa %ymm4,%ymm5
8471 vmovdqa %ymm4,%ymm6
8472 vmovdqa %ymm8,%ymm9
8473 vmovdqa %ymm8,%ymm10
/* ymm13 = ymm12 with .avx2_inc added (+2 on the first dword of each 128-bit lane). */
8474 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
/* Save both fourth-row starting values (ymm11 for set 0, ymm15 for set 1). */
8475 vmovdqa %ymm12,%ymm11
8476 vmovdqa %ymm13,%ymm15
/* r10 = loop counter: 10 passes through the loop body. */
8477 movq $10,%r10
84781:
/* Set 0, first half: add/xor/rotate steps with per-dword left rotations of
   16 (.rol16 shuffle), 12 (shift pair), 8 (.rol8 shuffle) and 7 (shift pair),
   i.e. the ChaCha quarter-round pattern. */
8479 vpaddd %ymm4,%ymm0,%ymm0
8480 vpxor %ymm0,%ymm12,%ymm12
8481 vpshufb .rol16(%rip),%ymm12,%ymm12
8482 vpaddd %ymm12,%ymm8,%ymm8
8483 vpxor %ymm8,%ymm4,%ymm4
8484 vpsrld $20,%ymm4,%ymm3
8485 vpslld $12,%ymm4,%ymm4
8486 vpxor %ymm3,%ymm4,%ymm4
8487 vpaddd %ymm4,%ymm0,%ymm0
8488 vpxor %ymm0,%ymm12,%ymm12
8489 vpshufb .rol8(%rip),%ymm12,%ymm12
8490 vpaddd %ymm12,%ymm8,%ymm8
8491 vpxor %ymm8,%ymm4,%ymm4
8492 vpslld $7,%ymm4,%ymm3
8493 vpsrld $25,%ymm4,%ymm4
8494 vpxor %ymm3,%ymm4,%ymm4
/* Byte-rotate each 128-bit lane of ymm12/ymm8/ymm4 by 12/8/4 bytes. */
8495 vpalignr $12,%ymm12,%ymm12,%ymm12
8496 vpalignr $8,%ymm8,%ymm8,%ymm8
8497 vpalignr $4,%ymm4,%ymm4,%ymm4
/* Set 1, first half: same sequence on ymm1/ymm5/ymm9/ymm13. */
8498 vpaddd %ymm5,%ymm1,%ymm1
8499 vpxor %ymm1,%ymm13,%ymm13
8500 vpshufb .rol16(%rip),%ymm13,%ymm13
8501 vpaddd %ymm13,%ymm9,%ymm9
8502 vpxor %ymm9,%ymm5,%ymm5
8503 vpsrld $20,%ymm5,%ymm3
8504 vpslld $12,%ymm5,%ymm5
8505 vpxor %ymm3,%ymm5,%ymm5
8506 vpaddd %ymm5,%ymm1,%ymm1
8507 vpxor %ymm1,%ymm13,%ymm13
8508 vpshufb .rol8(%rip),%ymm13,%ymm13
8509 vpaddd %ymm13,%ymm9,%ymm9
8510 vpxor %ymm9,%ymm5,%ymm5
8511 vpslld $7,%ymm5,%ymm3
8512 vpsrld $25,%ymm5,%ymm5
8513 vpxor %ymm3,%ymm5,%ymm5
8514 vpalignr $12,%ymm13,%ymm13,%ymm13
8515 vpalignr $8,%ymm9,%ymm9,%ymm9
8516 vpalignr $4,%ymm5,%ymm5,%ymm5
/* Set 0, second half: same add/xor/rotate sequence on the lane-rotated rows. */
8517 vpaddd %ymm4,%ymm0,%ymm0
8518 vpxor %ymm0,%ymm12,%ymm12
8519 vpshufb .rol16(%rip),%ymm12,%ymm12
8520 vpaddd %ymm12,%ymm8,%ymm8
8521 vpxor %ymm8,%ymm4,%ymm4
8522 vpsrld $20,%ymm4,%ymm3
8523 vpslld $12,%ymm4,%ymm4
8524 vpxor %ymm3,%ymm4,%ymm4
8525 vpaddd %ymm4,%ymm0,%ymm0
8526 vpxor %ymm0,%ymm12,%ymm12
8527 vpshufb .rol8(%rip),%ymm12,%ymm12
8528 vpaddd %ymm12,%ymm8,%ymm8
8529 vpxor %ymm8,%ymm4,%ymm4
8530 vpslld $7,%ymm4,%ymm3
8531 vpsrld $25,%ymm4,%ymm4
8532 vpxor %ymm3,%ymm4,%ymm4
/* Undo the earlier lane rotation: 4/8/12 bytes for ymm12/ymm8/ymm4. */
8533 vpalignr $4,%ymm12,%ymm12,%ymm12
8534 vpalignr $8,%ymm8,%ymm8,%ymm8
8535 vpalignr $12,%ymm4,%ymm4,%ymm4
/* Set 1, second half, followed by the matching inverse lane rotation. */
8536 vpaddd %ymm5,%ymm1,%ymm1
8537 vpxor %ymm1,%ymm13,%ymm13
8538 vpshufb .rol16(%rip),%ymm13,%ymm13
8539 vpaddd %ymm13,%ymm9,%ymm9
8540 vpxor %ymm9,%ymm5,%ymm5
8541 vpsrld $20,%ymm5,%ymm3
8542 vpslld $12,%ymm5,%ymm5
8543 vpxor %ymm3,%ymm5,%ymm5
8544 vpaddd %ymm5,%ymm1,%ymm1
8545 vpxor %ymm1,%ymm13,%ymm13
8546 vpshufb .rol8(%rip),%ymm13,%ymm13
8547 vpaddd %ymm13,%ymm9,%ymm9
8548 vpxor %ymm9,%ymm5,%ymm5
8549 vpslld $7,%ymm5,%ymm3
8550 vpsrld $25,%ymm5,%ymm5
8551 vpxor %ymm3,%ymm5,%ymm5
8552 vpalignr $4,%ymm13,%ymm13,%ymm13
8553 vpalignr $8,%ymm9,%ymm9,%ymm9
8554 vpalignr $12,%ymm5,%ymm5,%ymm5
8555
/* Loop until r10 reaches zero. */
8556 decq %r10
8557 jne 1b
/* Add the saved starting rows back into both sets (ymm2/6/10 are shared,
   ymm11 and ymm15 are the per-set fourth rows). */
8558 vpaddd %ymm2,%ymm0,%ymm0
8559 vpaddd %ymm2,%ymm1,%ymm1
8560 vpaddd %ymm6,%ymm4,%ymm4
8561 vpaddd %ymm6,%ymm5,%ymm5
8562 vpaddd %ymm10,%ymm8,%ymm8
8563 vpaddd %ymm10,%ymm9,%ymm9
8564 vpaddd %ymm11,%ymm12,%ymm12
8565 vpaddd %ymm15,%ymm13,%ymm13
/* ymm3 = { low lane of ymm0, low lane of ymm4 } (low half, high half). */
8566 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
8567
/* Mask ymm3 with .clamp and store it (aligned store) at 0(%rbp).
   NOTE(review): presumably this is the clamped Poly1305 key; the hashing code
   later in this file reads 0(%rbp) and 8(%rbp) as its multiplier. */
8568 vpand .clamp(%rip),%ymm3,%ymm3
8569 vmovdqa %ymm3,0(%rbp)
8570
/* Regroup the remaining lanes: selector 0x13 pairs the two high lanes,
   selector 0x02 pairs the two low lanes, of the two source registers.
   Results land in ymm0, ymm4, ymm8, ymm12, ymm1, ymm5 in that order. */
8571 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
8572 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
8573 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
8574 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
8575 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
8576 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
/*
 * seal_avx2_short: calls poly_hash_ad_internal, clears rcx, then falls
 * through into the next label.
 * NOTE(review): poly_hash_ad_internal is defined elsewhere; its inputs and
 * clobbers are not visible in this section - confirm against its definition.
 */
8577seal_avx2_short:
/* Moves r8 onto itself: no register or flag changes. */
8578 movq %r8,%r8
8579 call poly_hash_ad_internal
/* rcx = 0 */
8580 xorq %rcx,%rcx
/*
 * seal_avx2_hash: while rcx >= 16, absorb the 16 bytes at (rdi) into the
 * three-limb accumulator r10 (low), r11, r12 (high), multiply by the 128-bit
 * value stored at 0(%rbp)/8(%rbp), fold the high part back in, then advance
 * rdi by 16 and decrease rcx by 16. Leaves for seal_avx2_short_loop once
 * rcx < 16. Scratch: rax, rdx, r9, r13, r14, r15.
 */
8581seal_avx2_hash:
8582 cmpq $16,%rcx
8583 jb seal_avx2_short_loop
/* Accumulator += 16 bytes at (rdi), with 1 added into the top limb (r12). */
8584 addq 0(%rdi),%r10
8585 adcq 8+0(%rdi),%r11
8586 adcq $1,%r12
/* Multiply by the low multiplier word at 0(%rbp): r13 = low 64 bits of
   word*r10; r14 and r15 collect the next two limbs (including word*r12). */
8587 movq 0+0(%rbp),%rax
8588 movq %rax,%r15
8589 mulq %r10
8590 movq %rax,%r13
8591 movq %rdx,%r14
8592 movq 0+0(%rbp),%rax
8593 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08008594 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008595 addq %rax,%r14
8596 adcq %rdx,%r15
/* Multiply by the high multiplier word at 8(%rbp) and add the partial
   products one limb higher; the full product ends up in r13:r14:r15:r9. */
8597 movq 8+0(%rbp),%rax
8598 movq %rax,%r9
8599 mulq %r10
8600 addq %rax,%r14
8601 adcq $0,%rdx
8602 movq %rdx,%r10
8603 movq 8+0(%rbp),%rax
8604 mulq %r11
8605 addq %rax,%r15
8606 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008607 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008608 addq %r10,%r15
8609 adcq %rdx,%r9
/* Reduction: keep the low 130 bits in r10:r11:(r12 = r15 & 3). With
   hi = (r9:r15) >> 2, add 4*hi (r13 = r15 & -4, r14 = r9) and then hi
   (r15:r9 after the shifts), i.e. 5*hi in total, carrying into r12. */
8610 movq %r13,%r10
8611 movq %r14,%r11
8612 movq %r15,%r12
8613 andq $3,%r12
8614 movq %r15,%r13
8615 andq $-4,%r13
8616 movq %r9,%r14
8617 shrdq $2,%r9,%r15
8618 shrq $2,%r9
8619 addq %r13,%r10
8620 adcq %r14,%r11
8621 adcq $0,%r12
8622 addq %r15,%r10
8623 adcq %r9,%r11
8624 adcq $0,%r12
8625
/* Consume the block and re-test the loop condition. */
8626 subq $16,%rcx
8627 addq $16,%rdi
8628 jmp seal_avx2_hash
/*
 * seal_avx2_short_loop: while rbx >= 32, XOR the 32 bytes at (rsi) with ymm0,
 * store the result at (rdi), run the two 16-byte halves just written through
 * the accumulator update (r10:r11:r12, multiplier at 0(%rbp)/8(%rbp)), advance
 * rsi and rdi by 32, and shift the queued ymm registers down so the next
 * 32 bytes of stream are in ymm0. Leaves for seal_avx2_short_tail once rbx < 32.
 * Scratch: rax, rdx, r9, r13, r14, r15.
 */
8629seal_avx2_short_loop:
8630 cmpq $32,%rbx
8631 jb seal_avx2_short_tail
8632 subq $32,%rbx
8633
/* Output = input XOR ymm0; unaligned store to (rdi). */
8634 vpxor (%rsi),%ymm0,%ymm0
8635 vmovdqu %ymm0,(%rdi)
8636 leaq 32(%rsi),%rsi
8637
/* First 16 bytes just stored: accumulator += block, 1 added into r12. */
8638 addq 0(%rdi),%r10
8639 adcq 8+0(%rdi),%r11
8640 adcq $1,%r12
/* Multiply by the 128-bit value at 0(%rbp)/8(%rbp); product in r13:r14:r15:r9. */
8641 movq 0+0(%rbp),%rax
8642 movq %rax,%r15
8643 mulq %r10
8644 movq %rax,%r13
8645 movq %rdx,%r14
8646 movq 0+0(%rbp),%rax
8647 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08008648 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008649 addq %rax,%r14
8650 adcq %rdx,%r15
8651 movq 8+0(%rbp),%rax
8652 movq %rax,%r9
8653 mulq %r10
8654 addq %rax,%r14
8655 adcq $0,%rdx
8656 movq %rdx,%r10
8657 movq 8+0(%rbp),%rax
8658 mulq %r11
8659 addq %rax,%r15
8660 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008661 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008662 addq %r10,%r15
8663 adcq %rdx,%r9
/* Reduction: low 130 bits stay in r10:r11:r12; the bits above are added back
   as 4*hi (r13:r14) plus hi (r15:r9 after the shifts). */
8664 movq %r13,%r10
8665 movq %r14,%r11
8666 movq %r15,%r12
8667 andq $3,%r12
8668 movq %r15,%r13
8669 andq $-4,%r13
8670 movq %r9,%r14
8671 shrdq $2,%r9,%r15
8672 shrq $2,%r9
8673 addq %r13,%r10
8674 adcq %r14,%r11
8675 adcq $0,%r12
8676 addq %r15,%r10
8677 adcq %r9,%r11
8678 adcq $0,%r12
/* Second 16 bytes (offset 16 from rdi): same absorb, multiply and reduce. */
8679 addq 16(%rdi),%r10
8680 adcq 8+16(%rdi),%r11
8681 adcq $1,%r12
8682 movq 0+0(%rbp),%rax
8683 movq %rax,%r15
8684 mulq %r10
8685 movq %rax,%r13
8686 movq %rdx,%r14
8687 movq 0+0(%rbp),%rax
8688 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08008689 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008690 addq %rax,%r14
8691 adcq %rdx,%r15
8692 movq 8+0(%rbp),%rax
8693 movq %rax,%r9
8694 mulq %r10
8695 addq %rax,%r14
8696 adcq $0,%rdx
8697 movq %rdx,%r10
8698 movq 8+0(%rbp),%rax
8699 mulq %r11
8700 addq %rax,%r15
8701 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008702 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008703 addq %r10,%r15
8704 adcq %rdx,%r9
8705 movq %r13,%r10
8706 movq %r14,%r11
8707 movq %r15,%r12
8708 andq $3,%r12
8709 movq %r15,%r13
8710 andq $-4,%r13
8711 movq %r9,%r14
8712 shrdq $2,%r9,%r15
8713 shrq $2,%r9
8714 addq %r13,%r10
8715 adcq %r14,%r11
8716 adcq $0,%r12
8717 addq %r15,%r10
8718 adcq %r9,%r11
8719 adcq $0,%r12
8720
8721 leaq 32(%rdi),%rdi
8722
/* Shift the register queue by one slot:
   ymm0 <- ymm4 <- ymm8 <- ymm12 <- ymm1 <- ymm5 <- ymm9 <- ymm13 <- ymm2 <- ymm6. */
8723 vmovdqa %ymm4,%ymm0
8724 vmovdqa %ymm8,%ymm4
8725 vmovdqa %ymm12,%ymm8
8726 vmovdqa %ymm1,%ymm12
8727 vmovdqa %ymm5,%ymm1
8728 vmovdqa %ymm9,%ymm5
8729 vmovdqa %ymm13,%ymm9
8730 vmovdqa %ymm2,%ymm13
8731 vmovdqa %ymm6,%ymm2
8732 jmp seal_avx2_short_loop
/*
 * seal_avx2_short_tail: if rbx >= 16, process one more 16-byte block using
 * the low 128 bits of ymm0 (XOR, store, accumulator update), then move the
 * upper 128 bits of ymm0 down into xmm0. In either case execute vzeroupper
 * and jump to seal_sse_tail_16 (defined elsewhere in this file).
 * Scratch: rax, rdx, r9, r13, r14, r15, xmm3.
 */
8733seal_avx2_short_tail:
8734 cmpq $16,%rbx
8735 jb 1f
8736 subq $16,%rbx
/* Result goes to xmm3 so ymm0 is left intact for the vextracti128 below. */
8737 vpxor (%rsi),%xmm0,%xmm3
8738 vmovdqu %xmm3,(%rdi)
8739 leaq 16(%rsi),%rsi
/* Absorb the 16 bytes just stored: accumulator (r10:r11:r12) += block,
   with 1 added into r12. */
8740 addq 0(%rdi),%r10
8741 adcq 8+0(%rdi),%r11
8742 adcq $1,%r12
/* Multiply by the 128-bit value at 0(%rbp)/8(%rbp); product in r13:r14:r15:r9. */
8743 movq 0+0(%rbp),%rax
8744 movq %rax,%r15
8745 mulq %r10
8746 movq %rax,%r13
8747 movq %rdx,%r14
8748 movq 0+0(%rbp),%rax
8749 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08008750 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008751 addq %rax,%r14
8752 adcq %rdx,%r15
8753 movq 8+0(%rbp),%rax
8754 movq %rax,%r9
8755 mulq %r10
8756 addq %rax,%r14
8757 adcq $0,%rdx
8758 movq %rdx,%r10
8759 movq 8+0(%rbp),%rax
8760 mulq %r11
8761 addq %rax,%r15
8762 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008763 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008764 addq %r10,%r15
8765 adcq %rdx,%r9
/* Reduction: low 130 bits stay in r10:r11:r12; the bits above are added back
   as 4*hi (r13:r14) plus hi (r15:r9 after the shifts). */
8766 movq %r13,%r10
8767 movq %r14,%r11
8768 movq %r15,%r12
8769 andq $3,%r12
8770 movq %r15,%r13
8771 andq $-4,%r13
8772 movq %r9,%r14
8773 shrdq $2,%r9,%r15
8774 shrq $2,%r9
8775 addq %r13,%r10
8776 adcq %r14,%r11
8777 adcq $0,%r12
8778 addq %r15,%r10
8779 adcq %r9,%r11
8780 adcq $0,%r12
8781
8782 leaq 16(%rdi),%rdi
/* xmm0 = upper 128 bits of ymm0. */
8783 vextracti128 $1,%ymm0,%xmm0
87841:
/* Clear the upper halves of the ymm registers before continuing in the
   code at seal_sse_tail_16. */
8785 vzeroupper
8786 jmp seal_sse_tail_16
8787.cfi_endproc
8788#endif