blob: 9db2a586088dc62a13c89f3e6f6a2fb54d13e62b [file] [log] [blame]
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

/*
 * Constant tables shared by the ChaCha20-Poly1305 routines in this file.
 * Syntax: GAS / AT&T.  All tables are read-only and are addressed
 * RIP-relative by the code that follows.  The byte layout below is
 * significant (tables are laid out back to back), so do not reorder.
 */
chacha20_poly1305_constants:

.p2align 6
/*
 * ChaCha20 constant row "expand 32-byte k", stored twice (32 bytes).
 * The SSE code loads the first 16 bytes; the second copy is presumably
 * for 32-byte (AVX2) loads -- that code is outside this view.
 */
.chacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
/*
 * pshufb control mask: rotates every 32-bit lane left by 8 bits
 * (destination byte 0 takes source byte 3, and so on).  Stored twice.
 */
.rol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
/*
 * pshufb control mask: rotates every 32-bit lane left by 16 bits.
 * Stored twice.
 */
.rol16:
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
/*
 * Four zero dwords.  .sse_inc follows immediately, so a 32-byte read
 * starting here yields {0,0,0,0, 1,0,0,0}; presumably that is how the
 * AVX2 path seeds its two block counters -- confirm against that code.
 */
.avx2_init:
.long 0,0,0,0
/* Added with paddd to the counter row: bumps the first dword by 1. */
.sse_inc:
.long 1,0,0,0
/*
 * Same idea with a step of 2 in each 128-bit half; used by code outside
 * this view (presumably the AVX2 path).
 */
.avx2_inc:
.long 2,0,0,0,2,0,0,0
/*
 * First 16 bytes: AND mask applied to the first keystream row before it
 * is stored as the Poly1305 multiplier (the Poly1305 "clamp" of r).
 * Second 16 bytes: all ones, i.e. an AND with them changes nothing.
 */
.clamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
.p2align 4
/*
 * Fifteen 16-byte rows; row k (k = 0..14) keeps the low k+1 bytes and
 * clears the rest.  Not referenced in the visible code; presumably used
 * to mask a partial final block -- confirm against the users.
 */
.and_masks:
.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
43
44
/*
 * poly_hash_ad_internal
 *
 * Absorbs a byte string into a fresh Poly1305 accumulator.  Internal
 * helper with a private register convention (not a C-callable ABI).
 *
 * In:    %rcx = pointer to the data to hash
 *        %r8  = length of the data in bytes
 *        %rbp = state base; 0(%rbp) and 8(%rbp) hold the low and high
 *               64 bits of the Poly1305 multiplier r
 * Out:   %r12:%r11:%r10 = accumulator h (%r10 least significant,
 *               %r12 holds the bits above 2^128)
 * Clobb: %rax, %rdx, %r9, %r13, %r14, %r15, flags; on the general path
 *        also %rcx (advanced past the data) and %r8 (counted down).
 *
 * Full 16-byte blocks are added with an extra 2^128 bit; a trailing
 * partial block is zero-extended to 16 bytes and also gets the 2^128
 * bit.  A length of exactly 13 takes a straight-line path
 * (poly_fast_tls_ad; presumably the TLS record header size).
 *
 * Every block is followed by the same step: h = (h * r) mod (2^130 - 5),
 * partially reduced so that h again fits in %r12:%r11:%r10.
 */
.p2align 6
poly_hash_ad_internal:

    xorq %r10,%r10              /* h = 0 */
    xorq %r11,%r11
    xorq %r12,%r12
    cmpq $13,%r8
    jne hash_ad_loop
poly_fast_tls_ad:

    /* Exactly 13 bytes: h = bytes 0..12 (zero-extended) + 2^128. */
    movq (%rcx),%r10            /* bytes 0..7 */
    movq 5(%rcx),%r11           /* bytes 5..12; load stays inside the 13 bytes */
    shrq $24,%r11               /* drop bytes 5..7, leaving bytes 8..12 */
    movq $1,%r12                /* the 2^128 bit */

    /* 3x2 limb product t3:t2:t1:t0 = h * r  (t0=%r13 t1=%r14 t2=%r15 t3=%r9). */
    movq 0+0(%rbp),%rax
    movq %rax,%r15
    mulq %r10                   /* r0 * h0 */
    movq %rax,%r13
    movq %rdx,%r14
    movq 0+0(%rbp),%rax
    mulq %r11                   /* r0 * h1 */
    imulq %r12,%r15             /* r0 * h2 (h2 is only a few bits wide) */
    addq %rax,%r14
    adcq %rdx,%r15
    movq 8+0(%rbp),%rax
    movq %rax,%r9
    mulq %r10                   /* r1 * h0 */
    addq %rax,%r14
    adcq $0,%rdx
    movq %rdx,%r10              /* keep the high half for the next column */
    movq 8+0(%rbp),%rax
    mulq %r11                   /* r1 * h1 */
    addq %rax,%r15
    adcq $0,%rdx
    imulq %r12,%r9              /* r1 * h2 */
    addq %r10,%r15
    adcq %rdx,%r9

    /*
     * Reduce modulo 2^130 - 5.  With c = (t3:t2) >> 2, the bits at and
     * above 2^130 fold back in as 5*c = 4*c + c:
     *   h = (t2 & 3):t1:t0 + (t3:(t2 & ~3)) + c
     */
    movq %r13,%r10
    movq %r14,%r11
    movq %r15,%r12
    andq $3,%r12                /* h2 = low two bits of t2 */
    movq %r15,%r13
    andq $-4,%r13               /* low limb of 4*c */
    movq %r9,%r14               /* high limb of 4*c */
    shrdq $2,%r9,%r15           /* low limb of c */
    shrq $2,%r9                 /* high limb of c */
    addq %r13,%r10
    adcq %r14,%r11
    adcq $0,%r12
    addq %r15,%r10
    adcq %r9,%r11
    adcq $0,%r12

    .byte 0xf3,0xc3             /* rep ret */
hash_ad_loop:

    /* General path: consume full 16-byte blocks while at least 16 remain. */
    cmpq $16,%r8
    jb hash_ad_tail
    addq 0(%rcx),%r10           /* h += block + 2^128 */
    adcq 8+0(%rcx),%r11
    adcq $1,%r12

    /* h = (h * r) mod (2^130 - 5): same multiply/reduce step as above. */
    movq 0+0(%rbp),%rax
    movq %rax,%r15
    mulq %r10
    movq %rax,%r13
    movq %rdx,%r14
    movq 0+0(%rbp),%rax
    mulq %r11
    imulq %r12,%r15
    addq %rax,%r14
    adcq %rdx,%r15
    movq 8+0(%rbp),%rax
    movq %rax,%r9
    mulq %r10
    addq %rax,%r14
    adcq $0,%rdx
    movq %rdx,%r10
    movq 8+0(%rbp),%rax
    mulq %r11
    addq %rax,%r15
    adcq $0,%rdx
    imulq %r12,%r9
    addq %r10,%r15
    adcq %rdx,%r9
    movq %r13,%r10
    movq %r14,%r11
    movq %r15,%r12
    andq $3,%r12
    movq %r15,%r13
    andq $-4,%r13
    movq %r9,%r14
    shrdq $2,%r9,%r15
    shrq $2,%r9
    addq %r13,%r10
    adcq %r14,%r11
    adcq $0,%r12
    addq %r15,%r10
    adcq %r9,%r11
    adcq $0,%r12

    leaq 16(%rcx),%rcx          /* next block */
    subq $16,%r8
    jmp hash_ad_loop
hash_ad_tail:
    /* 0..15 bytes remain; nothing more to do when the count is zero. */
    testq %r8,%r8
    je 1f

    /*
     * Gather the remaining bytes into %r14:%r13, zero-extended to 16
     * bytes.  Bytes are read from the last one backwards and shifted in
     * from the bottom, so the first remaining byte ends up least
     * significant.
     */
    xorq %r13,%r13
    xorq %r14,%r14
    addq %r8,%rcx               /* point one past the last byte */
hash_ad_tail_loop:
    shldq $8,%r13,%r14          /* (%r14:%r13) <<= 8 */
    shlq $8,%r13
    movzbq -1(%rcx),%r15
    xorq %r15,%r13              /* low byte is zero after the shift */
    decq %rcx
    decq %r8
    jne hash_ad_tail_loop

    addq %r13,%r10              /* h += padded block + 2^128 */
    adcq %r14,%r11
    adcq $1,%r12

    /* h = (h * r) mod (2^130 - 5): same multiply/reduce step as above. */
    movq 0+0(%rbp),%rax
    movq %rax,%r15
    mulq %r10
    movq %rax,%r13
    movq %rdx,%r14
    movq 0+0(%rbp),%rax
    mulq %r11
    imulq %r12,%r15
    addq %rax,%r14
    adcq %rdx,%r15
    movq 8+0(%rbp),%rax
    movq %rax,%r9
    mulq %r10
    addq %rax,%r14
    adcq $0,%rdx
    movq %rdx,%r10
    movq 8+0(%rbp),%rax
    mulq %r11
    addq %rax,%r15
    adcq $0,%rdx
    imulq %r12,%r9
    addq %r10,%r15
    adcq %rdx,%r9
    movq %r13,%r10
    movq %r14,%r11
    movq %r15,%r12
    andq $3,%r12
    movq %r15,%r13
    andq $-4,%r13
    movq %r9,%r14
    shrdq $2,%r9,%r15
    shrq $2,%r9
    addq %r13,%r10
    adcq %r14,%r11
    adcq $0,%r12
    addq %r15,%r10
    adcq %r9,%r11
    adcq $0,%r12


1:
    .byte 0xf3,0xc3             /* rep ret */
Robert Sloana94fe052017-02-21 08:49:28 -0800210
David Benjaminf31229b2017-01-25 14:08:15 -0500211
212
213.globl _chacha20_poly1305_open
214.private_extern _chacha20_poly1305_open
215
216.p2align 6
217_chacha20_poly1305_open:
Robert Sloana94fe052017-02-21 08:49:28 -0800218
David Benjaminf31229b2017-01-25 14:08:15 -0500219 pushq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800220
David Benjaminf31229b2017-01-25 14:08:15 -0500221 pushq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800222
David Benjaminf31229b2017-01-25 14:08:15 -0500223 pushq %r12
Robert Sloana94fe052017-02-21 08:49:28 -0800224
David Benjaminf31229b2017-01-25 14:08:15 -0500225 pushq %r13
Robert Sloana94fe052017-02-21 08:49:28 -0800226
David Benjaminf31229b2017-01-25 14:08:15 -0500227 pushq %r14
Robert Sloana94fe052017-02-21 08:49:28 -0800228
David Benjaminf31229b2017-01-25 14:08:15 -0500229 pushq %r15
Robert Sloana94fe052017-02-21 08:49:28 -0800230
David Benjaminf31229b2017-01-25 14:08:15 -0500231
232
233 pushq %r9
Robert Sloana94fe052017-02-21 08:49:28 -0800234
David Benjaminf31229b2017-01-25 14:08:15 -0500235 subq $288 + 32,%rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800236
237
238
239
240
241
242
David Benjaminf31229b2017-01-25 14:08:15 -0500243 leaq 32(%rsp),%rbp
244 andq $-32,%rbp
245 movq %rdx,8+32(%rbp)
246 movq %r8,0+32(%rbp)
247 movq %rdx,%rbx
248
249 movl _OPENSSL_ia32cap_P+8(%rip),%eax
250 andl $288,%eax
251 xorl $288,%eax
252 jz chacha20_poly1305_open_avx2
253
2541:
255 cmpq $128,%rbx
256 jbe open_sse_128
257
258 movdqa .chacha20_consts(%rip),%xmm0
259 movdqu 0(%r9),%xmm4
260 movdqu 16(%r9),%xmm8
261 movdqu 32(%r9),%xmm12
262 movdqa %xmm12,%xmm7
263
264 movdqa %xmm4,48(%rbp)
265 movdqa %xmm8,64(%rbp)
266 movdqa %xmm12,96(%rbp)
267 movq $10,%r10
2681:
269 paddd %xmm4,%xmm0
270 pxor %xmm0,%xmm12
271 pshufb .rol16(%rip),%xmm12
272 paddd %xmm12,%xmm8
273 pxor %xmm8,%xmm4
274 movdqa %xmm4,%xmm3
275 pslld $12,%xmm3
276 psrld $20,%xmm4
277 pxor %xmm3,%xmm4
278 paddd %xmm4,%xmm0
279 pxor %xmm0,%xmm12
280 pshufb .rol8(%rip),%xmm12
281 paddd %xmm12,%xmm8
282 pxor %xmm8,%xmm4
283 movdqa %xmm4,%xmm3
284 pslld $7,%xmm3
285 psrld $25,%xmm4
286 pxor %xmm3,%xmm4
287.byte 102,15,58,15,228,4
288.byte 102,69,15,58,15,192,8
289.byte 102,69,15,58,15,228,12
290 paddd %xmm4,%xmm0
291 pxor %xmm0,%xmm12
292 pshufb .rol16(%rip),%xmm12
293 paddd %xmm12,%xmm8
294 pxor %xmm8,%xmm4
295 movdqa %xmm4,%xmm3
296 pslld $12,%xmm3
297 psrld $20,%xmm4
298 pxor %xmm3,%xmm4
299 paddd %xmm4,%xmm0
300 pxor %xmm0,%xmm12
301 pshufb .rol8(%rip),%xmm12
302 paddd %xmm12,%xmm8
303 pxor %xmm8,%xmm4
304 movdqa %xmm4,%xmm3
305 pslld $7,%xmm3
306 psrld $25,%xmm4
307 pxor %xmm3,%xmm4
308.byte 102,15,58,15,228,12
309.byte 102,69,15,58,15,192,8
310.byte 102,69,15,58,15,228,4
311
312 decq %r10
313 jne 1b
314
315 paddd .chacha20_consts(%rip),%xmm0
316 paddd 48(%rbp),%xmm4
317
318 pand .clamp(%rip),%xmm0
319 movdqa %xmm0,0(%rbp)
320 movdqa %xmm4,16(%rbp)
321
322 movq %r8,%r8
323 call poly_hash_ad_internal
324open_sse_main_loop:
325 cmpq $256,%rbx
326 jb 2f
327
328 movdqa .chacha20_consts(%rip),%xmm0
329 movdqa 48(%rbp),%xmm4
330 movdqa 64(%rbp),%xmm8
331 movdqa %xmm0,%xmm1
332 movdqa %xmm4,%xmm5
333 movdqa %xmm8,%xmm9
334 movdqa %xmm0,%xmm2
335 movdqa %xmm4,%xmm6
336 movdqa %xmm8,%xmm10
337 movdqa %xmm0,%xmm3
338 movdqa %xmm4,%xmm7
339 movdqa %xmm8,%xmm11
340 movdqa 96(%rbp),%xmm15
341 paddd .sse_inc(%rip),%xmm15
342 movdqa %xmm15,%xmm14
343 paddd .sse_inc(%rip),%xmm14
344 movdqa %xmm14,%xmm13
345 paddd .sse_inc(%rip),%xmm13
346 movdqa %xmm13,%xmm12
347 paddd .sse_inc(%rip),%xmm12
348 movdqa %xmm12,96(%rbp)
349 movdqa %xmm13,112(%rbp)
350 movdqa %xmm14,128(%rbp)
351 movdqa %xmm15,144(%rbp)
352
353
354
355 movq $4,%rcx
356 movq %rsi,%r8
3571:
358 movdqa %xmm8,80(%rbp)
359 movdqa .rol16(%rip),%xmm8
360 paddd %xmm7,%xmm3
361 paddd %xmm6,%xmm2
362 paddd %xmm5,%xmm1
363 paddd %xmm4,%xmm0
364 pxor %xmm3,%xmm15
365 pxor %xmm2,%xmm14
366 pxor %xmm1,%xmm13
367 pxor %xmm0,%xmm12
368.byte 102,69,15,56,0,248
369.byte 102,69,15,56,0,240
370.byte 102,69,15,56,0,232
371.byte 102,69,15,56,0,224
372 movdqa 80(%rbp),%xmm8
373 paddd %xmm15,%xmm11
374 paddd %xmm14,%xmm10
375 paddd %xmm13,%xmm9
376 paddd %xmm12,%xmm8
377 pxor %xmm11,%xmm7
378 addq 0(%r8),%r10
379 adcq 8+0(%r8),%r11
380 adcq $1,%r12
381
382 leaq 16(%r8),%r8
383 pxor %xmm10,%xmm6
384 pxor %xmm9,%xmm5
385 pxor %xmm8,%xmm4
386 movdqa %xmm8,80(%rbp)
387 movdqa %xmm7,%xmm8
388 psrld $20,%xmm8
389 pslld $32-20,%xmm7
390 pxor %xmm8,%xmm7
391 movdqa %xmm6,%xmm8
392 psrld $20,%xmm8
393 pslld $32-20,%xmm6
394 pxor %xmm8,%xmm6
395 movdqa %xmm5,%xmm8
396 psrld $20,%xmm8
397 pslld $32-20,%xmm5
398 pxor %xmm8,%xmm5
399 movdqa %xmm4,%xmm8
400 psrld $20,%xmm8
401 pslld $32-20,%xmm4
402 pxor %xmm8,%xmm4
403 movq 0+0(%rbp),%rax
404 movq %rax,%r15
405 mulq %r10
406 movq %rax,%r13
407 movq %rdx,%r14
408 movq 0+0(%rbp),%rax
409 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800410 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500411 addq %rax,%r14
412 adcq %rdx,%r15
413 movdqa .rol8(%rip),%xmm8
414 paddd %xmm7,%xmm3
415 paddd %xmm6,%xmm2
416 paddd %xmm5,%xmm1
417 paddd %xmm4,%xmm0
418 pxor %xmm3,%xmm15
419 pxor %xmm2,%xmm14
420 pxor %xmm1,%xmm13
421 pxor %xmm0,%xmm12
422.byte 102,69,15,56,0,248
423.byte 102,69,15,56,0,240
424.byte 102,69,15,56,0,232
425.byte 102,69,15,56,0,224
426 movdqa 80(%rbp),%xmm8
427 paddd %xmm15,%xmm11
428 paddd %xmm14,%xmm10
429 paddd %xmm13,%xmm9
430 paddd %xmm12,%xmm8
431 pxor %xmm11,%xmm7
432 pxor %xmm10,%xmm6
433 movq 8+0(%rbp),%rax
434 movq %rax,%r9
435 mulq %r10
436 addq %rax,%r14
437 adcq $0,%rdx
438 movq %rdx,%r10
439 movq 8+0(%rbp),%rax
440 mulq %r11
441 addq %rax,%r15
442 adcq $0,%rdx
443 pxor %xmm9,%xmm5
444 pxor %xmm8,%xmm4
445 movdqa %xmm8,80(%rbp)
446 movdqa %xmm7,%xmm8
447 psrld $25,%xmm8
448 pslld $32-25,%xmm7
449 pxor %xmm8,%xmm7
450 movdqa %xmm6,%xmm8
451 psrld $25,%xmm8
452 pslld $32-25,%xmm6
453 pxor %xmm8,%xmm6
454 movdqa %xmm5,%xmm8
455 psrld $25,%xmm8
456 pslld $32-25,%xmm5
457 pxor %xmm8,%xmm5
458 movdqa %xmm4,%xmm8
459 psrld $25,%xmm8
460 pslld $32-25,%xmm4
461 pxor %xmm8,%xmm4
462 movdqa 80(%rbp),%xmm8
Robert Sloan4d1ac502017-02-06 08:36:14 -0800463 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500464 addq %r10,%r15
465 adcq %rdx,%r9
466.byte 102,15,58,15,255,4
467.byte 102,69,15,58,15,219,8
468.byte 102,69,15,58,15,255,12
469.byte 102,15,58,15,246,4
470.byte 102,69,15,58,15,210,8
471.byte 102,69,15,58,15,246,12
472.byte 102,15,58,15,237,4
473.byte 102,69,15,58,15,201,8
474.byte 102,69,15,58,15,237,12
475.byte 102,15,58,15,228,4
476.byte 102,69,15,58,15,192,8
477.byte 102,69,15,58,15,228,12
478 movdqa %xmm8,80(%rbp)
479 movdqa .rol16(%rip),%xmm8
480 paddd %xmm7,%xmm3
481 paddd %xmm6,%xmm2
482 paddd %xmm5,%xmm1
483 paddd %xmm4,%xmm0
484 pxor %xmm3,%xmm15
485 pxor %xmm2,%xmm14
486 movq %r13,%r10
487 movq %r14,%r11
488 movq %r15,%r12
489 andq $3,%r12
490 movq %r15,%r13
491 andq $-4,%r13
492 movq %r9,%r14
493 shrdq $2,%r9,%r15
494 shrq $2,%r9
495 addq %r13,%r10
496 adcq %r14,%r11
497 adcq $0,%r12
498 addq %r15,%r10
499 adcq %r9,%r11
500 adcq $0,%r12
501 pxor %xmm1,%xmm13
502 pxor %xmm0,%xmm12
503.byte 102,69,15,56,0,248
504.byte 102,69,15,56,0,240
505.byte 102,69,15,56,0,232
506.byte 102,69,15,56,0,224
507 movdqa 80(%rbp),%xmm8
508 paddd %xmm15,%xmm11
509 paddd %xmm14,%xmm10
510 paddd %xmm13,%xmm9
511 paddd %xmm12,%xmm8
512 pxor %xmm11,%xmm7
513 pxor %xmm10,%xmm6
514 pxor %xmm9,%xmm5
515 pxor %xmm8,%xmm4
516 movdqa %xmm8,80(%rbp)
517 movdqa %xmm7,%xmm8
518 psrld $20,%xmm8
519 pslld $32-20,%xmm7
520 pxor %xmm8,%xmm7
521 movdqa %xmm6,%xmm8
522 psrld $20,%xmm8
523 pslld $32-20,%xmm6
524 pxor %xmm8,%xmm6
525 movdqa %xmm5,%xmm8
526 psrld $20,%xmm8
527 pslld $32-20,%xmm5
528 pxor %xmm8,%xmm5
529 movdqa %xmm4,%xmm8
530 psrld $20,%xmm8
531 pslld $32-20,%xmm4
532 pxor %xmm8,%xmm4
533 movdqa .rol8(%rip),%xmm8
534 paddd %xmm7,%xmm3
535 paddd %xmm6,%xmm2
536 paddd %xmm5,%xmm1
537 paddd %xmm4,%xmm0
538 pxor %xmm3,%xmm15
539 pxor %xmm2,%xmm14
540 pxor %xmm1,%xmm13
541 pxor %xmm0,%xmm12
542.byte 102,69,15,56,0,248
543.byte 102,69,15,56,0,240
544.byte 102,69,15,56,0,232
545.byte 102,69,15,56,0,224
546 movdqa 80(%rbp),%xmm8
547 paddd %xmm15,%xmm11
548 paddd %xmm14,%xmm10
549 paddd %xmm13,%xmm9
550 paddd %xmm12,%xmm8
551 pxor %xmm11,%xmm7
552 pxor %xmm10,%xmm6
553 pxor %xmm9,%xmm5
554 pxor %xmm8,%xmm4
555 movdqa %xmm8,80(%rbp)
556 movdqa %xmm7,%xmm8
557 psrld $25,%xmm8
558 pslld $32-25,%xmm7
559 pxor %xmm8,%xmm7
560 movdqa %xmm6,%xmm8
561 psrld $25,%xmm8
562 pslld $32-25,%xmm6
563 pxor %xmm8,%xmm6
564 movdqa %xmm5,%xmm8
565 psrld $25,%xmm8
566 pslld $32-25,%xmm5
567 pxor %xmm8,%xmm5
568 movdqa %xmm4,%xmm8
569 psrld $25,%xmm8
570 pslld $32-25,%xmm4
571 pxor %xmm8,%xmm4
572 movdqa 80(%rbp),%xmm8
573.byte 102,15,58,15,255,12
574.byte 102,69,15,58,15,219,8
575.byte 102,69,15,58,15,255,4
576.byte 102,15,58,15,246,12
577.byte 102,69,15,58,15,210,8
578.byte 102,69,15,58,15,246,4
579.byte 102,15,58,15,237,12
580.byte 102,69,15,58,15,201,8
581.byte 102,69,15,58,15,237,4
582.byte 102,15,58,15,228,12
583.byte 102,69,15,58,15,192,8
584.byte 102,69,15,58,15,228,4
585
586 decq %rcx
587 jge 1b
588 addq 0(%r8),%r10
589 adcq 8+0(%r8),%r11
590 adcq $1,%r12
591 movq 0+0(%rbp),%rax
592 movq %rax,%r15
593 mulq %r10
594 movq %rax,%r13
595 movq %rdx,%r14
596 movq 0+0(%rbp),%rax
597 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800598 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500599 addq %rax,%r14
600 adcq %rdx,%r15
601 movq 8+0(%rbp),%rax
602 movq %rax,%r9
603 mulq %r10
604 addq %rax,%r14
605 adcq $0,%rdx
606 movq %rdx,%r10
607 movq 8+0(%rbp),%rax
608 mulq %r11
609 addq %rax,%r15
610 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800611 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500612 addq %r10,%r15
613 adcq %rdx,%r9
614 movq %r13,%r10
615 movq %r14,%r11
616 movq %r15,%r12
617 andq $3,%r12
618 movq %r15,%r13
619 andq $-4,%r13
620 movq %r9,%r14
621 shrdq $2,%r9,%r15
622 shrq $2,%r9
623 addq %r13,%r10
624 adcq %r14,%r11
625 adcq $0,%r12
626 addq %r15,%r10
627 adcq %r9,%r11
628 adcq $0,%r12
629
630 leaq 16(%r8),%r8
631 cmpq $-6,%rcx
632 jg 1b
633 paddd .chacha20_consts(%rip),%xmm3
634 paddd 48(%rbp),%xmm7
635 paddd 64(%rbp),%xmm11
636 paddd 144(%rbp),%xmm15
637 paddd .chacha20_consts(%rip),%xmm2
638 paddd 48(%rbp),%xmm6
639 paddd 64(%rbp),%xmm10
640 paddd 128(%rbp),%xmm14
641 paddd .chacha20_consts(%rip),%xmm1
642 paddd 48(%rbp),%xmm5
643 paddd 64(%rbp),%xmm9
644 paddd 112(%rbp),%xmm13
645 paddd .chacha20_consts(%rip),%xmm0
646 paddd 48(%rbp),%xmm4
647 paddd 64(%rbp),%xmm8
648 paddd 96(%rbp),%xmm12
649 movdqa %xmm12,80(%rbp)
650 movdqu 0 + 0(%rsi),%xmm12
651 pxor %xmm3,%xmm12
652 movdqu %xmm12,0 + 0(%rdi)
653 movdqu 16 + 0(%rsi),%xmm12
654 pxor %xmm7,%xmm12
655 movdqu %xmm12,16 + 0(%rdi)
656 movdqu 32 + 0(%rsi),%xmm12
657 pxor %xmm11,%xmm12
658 movdqu %xmm12,32 + 0(%rdi)
659 movdqu 48 + 0(%rsi),%xmm12
660 pxor %xmm15,%xmm12
661 movdqu %xmm12,48 + 0(%rdi)
662 movdqu 0 + 64(%rsi),%xmm3
663 movdqu 16 + 64(%rsi),%xmm7
664 movdqu 32 + 64(%rsi),%xmm11
665 movdqu 48 + 64(%rsi),%xmm15
666 pxor %xmm3,%xmm2
667 pxor %xmm7,%xmm6
668 pxor %xmm11,%xmm10
669 pxor %xmm14,%xmm15
670 movdqu %xmm2,0 + 64(%rdi)
671 movdqu %xmm6,16 + 64(%rdi)
672 movdqu %xmm10,32 + 64(%rdi)
673 movdqu %xmm15,48 + 64(%rdi)
674 movdqu 0 + 128(%rsi),%xmm3
675 movdqu 16 + 128(%rsi),%xmm7
676 movdqu 32 + 128(%rsi),%xmm11
677 movdqu 48 + 128(%rsi),%xmm15
678 pxor %xmm3,%xmm1
679 pxor %xmm7,%xmm5
680 pxor %xmm11,%xmm9
681 pxor %xmm13,%xmm15
682 movdqu %xmm1,0 + 128(%rdi)
683 movdqu %xmm5,16 + 128(%rdi)
684 movdqu %xmm9,32 + 128(%rdi)
685 movdqu %xmm15,48 + 128(%rdi)
686 movdqu 0 + 192(%rsi),%xmm3
687 movdqu 16 + 192(%rsi),%xmm7
688 movdqu 32 + 192(%rsi),%xmm11
689 movdqu 48 + 192(%rsi),%xmm15
690 pxor %xmm3,%xmm0
691 pxor %xmm7,%xmm4
692 pxor %xmm11,%xmm8
693 pxor 80(%rbp),%xmm15
694 movdqu %xmm0,0 + 192(%rdi)
695 movdqu %xmm4,16 + 192(%rdi)
696 movdqu %xmm8,32 + 192(%rdi)
697 movdqu %xmm15,48 + 192(%rdi)
698
699 leaq 256(%rsi),%rsi
700 leaq 256(%rdi),%rdi
701 subq $256,%rbx
702 jmp open_sse_main_loop
7032:
704
705 testq %rbx,%rbx
706 jz open_sse_finalize
707 cmpq $64,%rbx
708 ja 3f
709 movdqa .chacha20_consts(%rip),%xmm0
710 movdqa 48(%rbp),%xmm4
711 movdqa 64(%rbp),%xmm8
712 movdqa 96(%rbp),%xmm12
713 paddd .sse_inc(%rip),%xmm12
714 movdqa %xmm12,96(%rbp)
715
716 xorq %r8,%r8
717 movq %rbx,%rcx
718 cmpq $16,%rcx
719 jb 2f
7201:
721 addq 0(%rsi,%r8), %r10
722 adcq 8+0(%rsi,%r8), %r11
723 adcq $1,%r12
724 movq 0+0(%rbp),%rax
725 movq %rax,%r15
726 mulq %r10
727 movq %rax,%r13
728 movq %rdx,%r14
729 movq 0+0(%rbp),%rax
730 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800731 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500732 addq %rax,%r14
733 adcq %rdx,%r15
734 movq 8+0(%rbp),%rax
735 movq %rax,%r9
736 mulq %r10
737 addq %rax,%r14
738 adcq $0,%rdx
739 movq %rdx,%r10
740 movq 8+0(%rbp),%rax
741 mulq %r11
742 addq %rax,%r15
743 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800744 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500745 addq %r10,%r15
746 adcq %rdx,%r9
747 movq %r13,%r10
748 movq %r14,%r11
749 movq %r15,%r12
750 andq $3,%r12
751 movq %r15,%r13
752 andq $-4,%r13
753 movq %r9,%r14
754 shrdq $2,%r9,%r15
755 shrq $2,%r9
756 addq %r13,%r10
757 adcq %r14,%r11
758 adcq $0,%r12
759 addq %r15,%r10
760 adcq %r9,%r11
761 adcq $0,%r12
762
763 subq $16,%rcx
7642:
765 addq $16,%r8
766 paddd %xmm4,%xmm0
767 pxor %xmm0,%xmm12
768 pshufb .rol16(%rip),%xmm12
769 paddd %xmm12,%xmm8
770 pxor %xmm8,%xmm4
771 movdqa %xmm4,%xmm3
772 pslld $12,%xmm3
773 psrld $20,%xmm4
774 pxor %xmm3,%xmm4
775 paddd %xmm4,%xmm0
776 pxor %xmm0,%xmm12
777 pshufb .rol8(%rip),%xmm12
778 paddd %xmm12,%xmm8
779 pxor %xmm8,%xmm4
780 movdqa %xmm4,%xmm3
781 pslld $7,%xmm3
782 psrld $25,%xmm4
783 pxor %xmm3,%xmm4
784.byte 102,15,58,15,228,4
785.byte 102,69,15,58,15,192,8
786.byte 102,69,15,58,15,228,12
787 paddd %xmm4,%xmm0
788 pxor %xmm0,%xmm12
789 pshufb .rol16(%rip),%xmm12
790 paddd %xmm12,%xmm8
791 pxor %xmm8,%xmm4
792 movdqa %xmm4,%xmm3
793 pslld $12,%xmm3
794 psrld $20,%xmm4
795 pxor %xmm3,%xmm4
796 paddd %xmm4,%xmm0
797 pxor %xmm0,%xmm12
798 pshufb .rol8(%rip),%xmm12
799 paddd %xmm12,%xmm8
800 pxor %xmm8,%xmm4
801 movdqa %xmm4,%xmm3
802 pslld $7,%xmm3
803 psrld $25,%xmm4
804 pxor %xmm3,%xmm4
805.byte 102,15,58,15,228,12
806.byte 102,69,15,58,15,192,8
807.byte 102,69,15,58,15,228,4
808
809 cmpq $16,%rcx
810 jae 1b
811 cmpq $160,%r8
812 jne 2b
813 paddd .chacha20_consts(%rip),%xmm0
814 paddd 48(%rbp),%xmm4
815 paddd 64(%rbp),%xmm8
816 paddd 96(%rbp),%xmm12
817
818 jmp open_sse_tail_64_dec_loop
8193:
820 cmpq $128,%rbx
821 ja 3f
822 movdqa .chacha20_consts(%rip),%xmm0
823 movdqa 48(%rbp),%xmm4
824 movdqa 64(%rbp),%xmm8
825 movdqa %xmm0,%xmm1
826 movdqa %xmm4,%xmm5
827 movdqa %xmm8,%xmm9
828 movdqa 96(%rbp),%xmm13
829 paddd .sse_inc(%rip),%xmm13
830 movdqa %xmm13,%xmm12
831 paddd .sse_inc(%rip),%xmm12
832 movdqa %xmm12,96(%rbp)
833 movdqa %xmm13,112(%rbp)
834
835 movq %rbx,%rcx
836 andq $-16,%rcx
837 xorq %r8,%r8
8381:
839 addq 0(%rsi,%r8), %r10
840 adcq 8+0(%rsi,%r8), %r11
841 adcq $1,%r12
842 movq 0+0(%rbp),%rax
843 movq %rax,%r15
844 mulq %r10
845 movq %rax,%r13
846 movq %rdx,%r14
847 movq 0+0(%rbp),%rax
848 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -0800849 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -0500850 addq %rax,%r14
851 adcq %rdx,%r15
852 movq 8+0(%rbp),%rax
853 movq %rax,%r9
854 mulq %r10
855 addq %rax,%r14
856 adcq $0,%rdx
857 movq %rdx,%r10
858 movq 8+0(%rbp),%rax
859 mulq %r11
860 addq %rax,%r15
861 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -0800862 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -0500863 addq %r10,%r15
864 adcq %rdx,%r9
865 movq %r13,%r10
866 movq %r14,%r11
867 movq %r15,%r12
868 andq $3,%r12
869 movq %r15,%r13
870 andq $-4,%r13
871 movq %r9,%r14
872 shrdq $2,%r9,%r15
873 shrq $2,%r9
874 addq %r13,%r10
875 adcq %r14,%r11
876 adcq $0,%r12
877 addq %r15,%r10
878 adcq %r9,%r11
879 adcq $0,%r12
880
8812:
882 addq $16,%r8
883 paddd %xmm4,%xmm0
884 pxor %xmm0,%xmm12
885 pshufb .rol16(%rip),%xmm12
886 paddd %xmm12,%xmm8
887 pxor %xmm8,%xmm4
888 movdqa %xmm4,%xmm3
889 pslld $12,%xmm3
890 psrld $20,%xmm4
891 pxor %xmm3,%xmm4
892 paddd %xmm4,%xmm0
893 pxor %xmm0,%xmm12
894 pshufb .rol8(%rip),%xmm12
895 paddd %xmm12,%xmm8
896 pxor %xmm8,%xmm4
897 movdqa %xmm4,%xmm3
898 pslld $7,%xmm3
899 psrld $25,%xmm4
900 pxor %xmm3,%xmm4
901.byte 102,15,58,15,228,4
902.byte 102,69,15,58,15,192,8
903.byte 102,69,15,58,15,228,12
904 paddd %xmm5,%xmm1
905 pxor %xmm1,%xmm13
906 pshufb .rol16(%rip),%xmm13
907 paddd %xmm13,%xmm9
908 pxor %xmm9,%xmm5
909 movdqa %xmm5,%xmm3
910 pslld $12,%xmm3
911 psrld $20,%xmm5
912 pxor %xmm3,%xmm5
913 paddd %xmm5,%xmm1
914 pxor %xmm1,%xmm13
915 pshufb .rol8(%rip),%xmm13
916 paddd %xmm13,%xmm9
917 pxor %xmm9,%xmm5
918 movdqa %xmm5,%xmm3
919 pslld $7,%xmm3
920 psrld $25,%xmm5
921 pxor %xmm3,%xmm5
922.byte 102,15,58,15,237,4
923.byte 102,69,15,58,15,201,8
924.byte 102,69,15,58,15,237,12
925 paddd %xmm4,%xmm0
926 pxor %xmm0,%xmm12
927 pshufb .rol16(%rip),%xmm12
928 paddd %xmm12,%xmm8
929 pxor %xmm8,%xmm4
930 movdqa %xmm4,%xmm3
931 pslld $12,%xmm3
932 psrld $20,%xmm4
933 pxor %xmm3,%xmm4
934 paddd %xmm4,%xmm0
935 pxor %xmm0,%xmm12
936 pshufb .rol8(%rip),%xmm12
937 paddd %xmm12,%xmm8
938 pxor %xmm8,%xmm4
939 movdqa %xmm4,%xmm3
940 pslld $7,%xmm3
941 psrld $25,%xmm4
942 pxor %xmm3,%xmm4
943.byte 102,15,58,15,228,12
944.byte 102,69,15,58,15,192,8
945.byte 102,69,15,58,15,228,4
946 paddd %xmm5,%xmm1
947 pxor %xmm1,%xmm13
948 pshufb .rol16(%rip),%xmm13
949 paddd %xmm13,%xmm9
950 pxor %xmm9,%xmm5
951 movdqa %xmm5,%xmm3
952 pslld $12,%xmm3
953 psrld $20,%xmm5
954 pxor %xmm3,%xmm5
955 paddd %xmm5,%xmm1
956 pxor %xmm1,%xmm13
957 pshufb .rol8(%rip),%xmm13
958 paddd %xmm13,%xmm9
959 pxor %xmm9,%xmm5
960 movdqa %xmm5,%xmm3
961 pslld $7,%xmm3
962 psrld $25,%xmm5
963 pxor %xmm3,%xmm5
964.byte 102,15,58,15,237,12
965.byte 102,69,15,58,15,201,8
966.byte 102,69,15,58,15,237,4
967
968 cmpq %rcx,%r8
969 jb 1b
970 cmpq $160,%r8
971 jne 2b
972 paddd .chacha20_consts(%rip),%xmm1
973 paddd 48(%rbp),%xmm5
974 paddd 64(%rbp),%xmm9
975 paddd 112(%rbp),%xmm13
976 paddd .chacha20_consts(%rip),%xmm0
977 paddd 48(%rbp),%xmm4
978 paddd 64(%rbp),%xmm8
979 paddd 96(%rbp),%xmm12
980 movdqu 0 + 0(%rsi),%xmm3
981 movdqu 16 + 0(%rsi),%xmm7
982 movdqu 32 + 0(%rsi),%xmm11
983 movdqu 48 + 0(%rsi),%xmm15
984 pxor %xmm3,%xmm1
985 pxor %xmm7,%xmm5
986 pxor %xmm11,%xmm9
987 pxor %xmm13,%xmm15
988 movdqu %xmm1,0 + 0(%rdi)
989 movdqu %xmm5,16 + 0(%rdi)
990 movdqu %xmm9,32 + 0(%rdi)
991 movdqu %xmm15,48 + 0(%rdi)
992
993 subq $64,%rbx
994 leaq 64(%rsi),%rsi
995 leaq 64(%rdi),%rdi
996 jmp open_sse_tail_64_dec_loop
9973:
998 cmpq $192,%rbx
999 ja 3f
1000 movdqa .chacha20_consts(%rip),%xmm0
1001 movdqa 48(%rbp),%xmm4
1002 movdqa 64(%rbp),%xmm8
1003 movdqa %xmm0,%xmm1
1004 movdqa %xmm4,%xmm5
1005 movdqa %xmm8,%xmm9
1006 movdqa %xmm0,%xmm2
1007 movdqa %xmm4,%xmm6
1008 movdqa %xmm8,%xmm10
1009 movdqa 96(%rbp),%xmm14
1010 paddd .sse_inc(%rip),%xmm14
1011 movdqa %xmm14,%xmm13
1012 paddd .sse_inc(%rip),%xmm13
1013 movdqa %xmm13,%xmm12
1014 paddd .sse_inc(%rip),%xmm12
1015 movdqa %xmm12,96(%rbp)
1016 movdqa %xmm13,112(%rbp)
1017 movdqa %xmm14,128(%rbp)
1018
1019 movq %rbx,%rcx
1020 movq $160,%r8
1021 cmpq $160,%rcx
1022 cmovgq %r8,%rcx
1023 andq $-16,%rcx
1024 xorq %r8,%r8
10251:
1026 addq 0(%rsi,%r8), %r10
1027 adcq 8+0(%rsi,%r8), %r11
1028 adcq $1,%r12
1029 movq 0+0(%rbp),%rax
1030 movq %rax,%r15
1031 mulq %r10
1032 movq %rax,%r13
1033 movq %rdx,%r14
1034 movq 0+0(%rbp),%rax
1035 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001036 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001037 addq %rax,%r14
1038 adcq %rdx,%r15
1039 movq 8+0(%rbp),%rax
1040 movq %rax,%r9
1041 mulq %r10
1042 addq %rax,%r14
1043 adcq $0,%rdx
1044 movq %rdx,%r10
1045 movq 8+0(%rbp),%rax
1046 mulq %r11
1047 addq %rax,%r15
1048 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001049 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001050 addq %r10,%r15
1051 adcq %rdx,%r9
1052 movq %r13,%r10
1053 movq %r14,%r11
1054 movq %r15,%r12
1055 andq $3,%r12
1056 movq %r15,%r13
1057 andq $-4,%r13
1058 movq %r9,%r14
1059 shrdq $2,%r9,%r15
1060 shrq $2,%r9
1061 addq %r13,%r10
1062 adcq %r14,%r11
1063 adcq $0,%r12
1064 addq %r15,%r10
1065 adcq %r9,%r11
1066 adcq $0,%r12
1067
10682:
1069 addq $16,%r8
1070 paddd %xmm4,%xmm0
1071 pxor %xmm0,%xmm12
1072 pshufb .rol16(%rip),%xmm12
1073 paddd %xmm12,%xmm8
1074 pxor %xmm8,%xmm4
1075 movdqa %xmm4,%xmm3
1076 pslld $12,%xmm3
1077 psrld $20,%xmm4
1078 pxor %xmm3,%xmm4
1079 paddd %xmm4,%xmm0
1080 pxor %xmm0,%xmm12
1081 pshufb .rol8(%rip),%xmm12
1082 paddd %xmm12,%xmm8
1083 pxor %xmm8,%xmm4
1084 movdqa %xmm4,%xmm3
1085 pslld $7,%xmm3
1086 psrld $25,%xmm4
1087 pxor %xmm3,%xmm4
1088.byte 102,15,58,15,228,4
1089.byte 102,69,15,58,15,192,8
1090.byte 102,69,15,58,15,228,12
1091 paddd %xmm5,%xmm1
1092 pxor %xmm1,%xmm13
1093 pshufb .rol16(%rip),%xmm13
1094 paddd %xmm13,%xmm9
1095 pxor %xmm9,%xmm5
1096 movdqa %xmm5,%xmm3
1097 pslld $12,%xmm3
1098 psrld $20,%xmm5
1099 pxor %xmm3,%xmm5
1100 paddd %xmm5,%xmm1
1101 pxor %xmm1,%xmm13
1102 pshufb .rol8(%rip),%xmm13
1103 paddd %xmm13,%xmm9
1104 pxor %xmm9,%xmm5
1105 movdqa %xmm5,%xmm3
1106 pslld $7,%xmm3
1107 psrld $25,%xmm5
1108 pxor %xmm3,%xmm5
1109.byte 102,15,58,15,237,4
1110.byte 102,69,15,58,15,201,8
1111.byte 102,69,15,58,15,237,12
1112 paddd %xmm6,%xmm2
1113 pxor %xmm2,%xmm14
1114 pshufb .rol16(%rip),%xmm14
1115 paddd %xmm14,%xmm10
1116 pxor %xmm10,%xmm6
1117 movdqa %xmm6,%xmm3
1118 pslld $12,%xmm3
1119 psrld $20,%xmm6
1120 pxor %xmm3,%xmm6
1121 paddd %xmm6,%xmm2
1122 pxor %xmm2,%xmm14
1123 pshufb .rol8(%rip),%xmm14
1124 paddd %xmm14,%xmm10
1125 pxor %xmm10,%xmm6
1126 movdqa %xmm6,%xmm3
1127 pslld $7,%xmm3
1128 psrld $25,%xmm6
1129 pxor %xmm3,%xmm6
1130.byte 102,15,58,15,246,4
1131.byte 102,69,15,58,15,210,8
1132.byte 102,69,15,58,15,246,12
1133 paddd %xmm4,%xmm0
1134 pxor %xmm0,%xmm12
1135 pshufb .rol16(%rip),%xmm12
1136 paddd %xmm12,%xmm8
1137 pxor %xmm8,%xmm4
1138 movdqa %xmm4,%xmm3
1139 pslld $12,%xmm3
1140 psrld $20,%xmm4
1141 pxor %xmm3,%xmm4
1142 paddd %xmm4,%xmm0
1143 pxor %xmm0,%xmm12
1144 pshufb .rol8(%rip),%xmm12
1145 paddd %xmm12,%xmm8
1146 pxor %xmm8,%xmm4
1147 movdqa %xmm4,%xmm3
1148 pslld $7,%xmm3
1149 psrld $25,%xmm4
1150 pxor %xmm3,%xmm4
1151.byte 102,15,58,15,228,12
1152.byte 102,69,15,58,15,192,8
1153.byte 102,69,15,58,15,228,4
1154 paddd %xmm5,%xmm1
1155 pxor %xmm1,%xmm13
1156 pshufb .rol16(%rip),%xmm13
1157 paddd %xmm13,%xmm9
1158 pxor %xmm9,%xmm5
1159 movdqa %xmm5,%xmm3
1160 pslld $12,%xmm3
1161 psrld $20,%xmm5
1162 pxor %xmm3,%xmm5
1163 paddd %xmm5,%xmm1
1164 pxor %xmm1,%xmm13
1165 pshufb .rol8(%rip),%xmm13
1166 paddd %xmm13,%xmm9
1167 pxor %xmm9,%xmm5
1168 movdqa %xmm5,%xmm3
1169 pslld $7,%xmm3
1170 psrld $25,%xmm5
1171 pxor %xmm3,%xmm5
1172.byte 102,15,58,15,237,12
1173.byte 102,69,15,58,15,201,8
1174.byte 102,69,15,58,15,237,4
1175 paddd %xmm6,%xmm2
1176 pxor %xmm2,%xmm14
1177 pshufb .rol16(%rip),%xmm14
1178 paddd %xmm14,%xmm10
1179 pxor %xmm10,%xmm6
1180 movdqa %xmm6,%xmm3
1181 pslld $12,%xmm3
1182 psrld $20,%xmm6
1183 pxor %xmm3,%xmm6
1184 paddd %xmm6,%xmm2
1185 pxor %xmm2,%xmm14
1186 pshufb .rol8(%rip),%xmm14
1187 paddd %xmm14,%xmm10
1188 pxor %xmm10,%xmm6
1189 movdqa %xmm6,%xmm3
1190 pslld $7,%xmm3
1191 psrld $25,%xmm6
1192 pxor %xmm3,%xmm6
1193.byte 102,15,58,15,246,12
1194.byte 102,69,15,58,15,210,8
1195.byte 102,69,15,58,15,246,4
1196
1197 cmpq %rcx,%r8
1198 jb 1b
1199 cmpq $160,%r8
1200 jne 2b
1201 cmpq $176,%rbx
1202 jb 1f
1203 addq 160(%rsi),%r10
1204 adcq 8+160(%rsi),%r11
1205 adcq $1,%r12
1206 movq 0+0(%rbp),%rax
1207 movq %rax,%r15
1208 mulq %r10
1209 movq %rax,%r13
1210 movq %rdx,%r14
1211 movq 0+0(%rbp),%rax
1212 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001213 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001214 addq %rax,%r14
1215 adcq %rdx,%r15
1216 movq 8+0(%rbp),%rax
1217 movq %rax,%r9
1218 mulq %r10
1219 addq %rax,%r14
1220 adcq $0,%rdx
1221 movq %rdx,%r10
1222 movq 8+0(%rbp),%rax
1223 mulq %r11
1224 addq %rax,%r15
1225 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001226 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001227 addq %r10,%r15
1228 adcq %rdx,%r9
1229 movq %r13,%r10
1230 movq %r14,%r11
1231 movq %r15,%r12
1232 andq $3,%r12
1233 movq %r15,%r13
1234 andq $-4,%r13
1235 movq %r9,%r14
1236 shrdq $2,%r9,%r15
1237 shrq $2,%r9
1238 addq %r13,%r10
1239 adcq %r14,%r11
1240 adcq $0,%r12
1241 addq %r15,%r10
1242 adcq %r9,%r11
1243 adcq $0,%r12
1244
1245 cmpq $192,%rbx
1246 jb 1f
1247 addq 176(%rsi),%r10
1248 adcq 8+176(%rsi),%r11
1249 adcq $1,%r12
1250 movq 0+0(%rbp),%rax
1251 movq %rax,%r15
1252 mulq %r10
1253 movq %rax,%r13
1254 movq %rdx,%r14
1255 movq 0+0(%rbp),%rax
1256 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001257 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001258 addq %rax,%r14
1259 adcq %rdx,%r15
1260 movq 8+0(%rbp),%rax
1261 movq %rax,%r9
1262 mulq %r10
1263 addq %rax,%r14
1264 adcq $0,%rdx
1265 movq %rdx,%r10
1266 movq 8+0(%rbp),%rax
1267 mulq %r11
1268 addq %rax,%r15
1269 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001270 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001271 addq %r10,%r15
1272 adcq %rdx,%r9
1273 movq %r13,%r10
1274 movq %r14,%r11
1275 movq %r15,%r12
1276 andq $3,%r12
1277 movq %r15,%r13
1278 andq $-4,%r13
1279 movq %r9,%r14
1280 shrdq $2,%r9,%r15
1281 shrq $2,%r9
1282 addq %r13,%r10
1283 adcq %r14,%r11
1284 adcq $0,%r12
1285 addq %r15,%r10
1286 adcq %r9,%r11
1287 adcq $0,%r12
1288
12891:
1290 paddd .chacha20_consts(%rip),%xmm2
1291 paddd 48(%rbp),%xmm6
1292 paddd 64(%rbp),%xmm10
1293 paddd 128(%rbp),%xmm14
1294 paddd .chacha20_consts(%rip),%xmm1
1295 paddd 48(%rbp),%xmm5
1296 paddd 64(%rbp),%xmm9
1297 paddd 112(%rbp),%xmm13
1298 paddd .chacha20_consts(%rip),%xmm0
1299 paddd 48(%rbp),%xmm4
1300 paddd 64(%rbp),%xmm8
1301 paddd 96(%rbp),%xmm12
1302 movdqu 0 + 0(%rsi),%xmm3
1303 movdqu 16 + 0(%rsi),%xmm7
1304 movdqu 32 + 0(%rsi),%xmm11
1305 movdqu 48 + 0(%rsi),%xmm15
1306 pxor %xmm3,%xmm2
1307 pxor %xmm7,%xmm6
1308 pxor %xmm11,%xmm10
1309 pxor %xmm14,%xmm15
1310 movdqu %xmm2,0 + 0(%rdi)
1311 movdqu %xmm6,16 + 0(%rdi)
1312 movdqu %xmm10,32 + 0(%rdi)
1313 movdqu %xmm15,48 + 0(%rdi)
1314 movdqu 0 + 64(%rsi),%xmm3
1315 movdqu 16 + 64(%rsi),%xmm7
1316 movdqu 32 + 64(%rsi),%xmm11
1317 movdqu 48 + 64(%rsi),%xmm15
1318 pxor %xmm3,%xmm1
1319 pxor %xmm7,%xmm5
1320 pxor %xmm11,%xmm9
1321 pxor %xmm13,%xmm15
1322 movdqu %xmm1,0 + 64(%rdi)
1323 movdqu %xmm5,16 + 64(%rdi)
1324 movdqu %xmm9,32 + 64(%rdi)
1325 movdqu %xmm15,48 + 64(%rdi)
1326
1327 subq $128,%rbx
1328 leaq 128(%rsi),%rsi
1329 leaq 128(%rdi),%rdi
1330 jmp open_sse_tail_64_dec_loop
13313:
1332
1333 movdqa .chacha20_consts(%rip),%xmm0
1334 movdqa 48(%rbp),%xmm4
1335 movdqa 64(%rbp),%xmm8
1336 movdqa %xmm0,%xmm1
1337 movdqa %xmm4,%xmm5
1338 movdqa %xmm8,%xmm9
1339 movdqa %xmm0,%xmm2
1340 movdqa %xmm4,%xmm6
1341 movdqa %xmm8,%xmm10
1342 movdqa %xmm0,%xmm3
1343 movdqa %xmm4,%xmm7
1344 movdqa %xmm8,%xmm11
1345 movdqa 96(%rbp),%xmm15
1346 paddd .sse_inc(%rip),%xmm15
1347 movdqa %xmm15,%xmm14
1348 paddd .sse_inc(%rip),%xmm14
1349 movdqa %xmm14,%xmm13
1350 paddd .sse_inc(%rip),%xmm13
1351 movdqa %xmm13,%xmm12
1352 paddd .sse_inc(%rip),%xmm12
1353 movdqa %xmm12,96(%rbp)
1354 movdqa %xmm13,112(%rbp)
1355 movdqa %xmm14,128(%rbp)
1356 movdqa %xmm15,144(%rbp)
1357
1358 xorq %r8,%r8
13591:
1360 addq 0(%rsi,%r8), %r10
1361 adcq 8+0(%rsi,%r8), %r11
1362 adcq $1,%r12
1363 movdqa %xmm11,80(%rbp)
1364 paddd %xmm4,%xmm0
1365 pxor %xmm0,%xmm12
1366 pshufb .rol16(%rip),%xmm12
1367 paddd %xmm12,%xmm8
1368 pxor %xmm8,%xmm4
1369 movdqa %xmm4,%xmm11
1370 pslld $12,%xmm11
1371 psrld $20,%xmm4
1372 pxor %xmm11,%xmm4
1373 paddd %xmm4,%xmm0
1374 pxor %xmm0,%xmm12
1375 pshufb .rol8(%rip),%xmm12
1376 paddd %xmm12,%xmm8
1377 pxor %xmm8,%xmm4
1378 movdqa %xmm4,%xmm11
1379 pslld $7,%xmm11
1380 psrld $25,%xmm4
1381 pxor %xmm11,%xmm4
1382.byte 102,15,58,15,228,4
1383.byte 102,69,15,58,15,192,8
1384.byte 102,69,15,58,15,228,12
1385 paddd %xmm5,%xmm1
1386 pxor %xmm1,%xmm13
1387 pshufb .rol16(%rip),%xmm13
1388 paddd %xmm13,%xmm9
1389 pxor %xmm9,%xmm5
1390 movdqa %xmm5,%xmm11
1391 pslld $12,%xmm11
1392 psrld $20,%xmm5
1393 pxor %xmm11,%xmm5
1394 paddd %xmm5,%xmm1
1395 pxor %xmm1,%xmm13
1396 pshufb .rol8(%rip),%xmm13
1397 paddd %xmm13,%xmm9
1398 pxor %xmm9,%xmm5
1399 movdqa %xmm5,%xmm11
1400 pslld $7,%xmm11
1401 psrld $25,%xmm5
1402 pxor %xmm11,%xmm5
1403.byte 102,15,58,15,237,4
1404.byte 102,69,15,58,15,201,8
1405.byte 102,69,15,58,15,237,12
1406 paddd %xmm6,%xmm2
1407 pxor %xmm2,%xmm14
1408 pshufb .rol16(%rip),%xmm14
1409 paddd %xmm14,%xmm10
1410 pxor %xmm10,%xmm6
1411 movdqa %xmm6,%xmm11
1412 pslld $12,%xmm11
1413 psrld $20,%xmm6
1414 pxor %xmm11,%xmm6
1415 paddd %xmm6,%xmm2
1416 pxor %xmm2,%xmm14
1417 pshufb .rol8(%rip),%xmm14
1418 paddd %xmm14,%xmm10
1419 pxor %xmm10,%xmm6
1420 movdqa %xmm6,%xmm11
1421 pslld $7,%xmm11
1422 psrld $25,%xmm6
1423 pxor %xmm11,%xmm6
1424.byte 102,15,58,15,246,4
1425.byte 102,69,15,58,15,210,8
1426.byte 102,69,15,58,15,246,12
1427 movdqa 80(%rbp),%xmm11
1428 movq 0+0(%rbp),%rax
1429 movq %rax,%r15
1430 mulq %r10
1431 movq %rax,%r13
1432 movq %rdx,%r14
1433 movq 0+0(%rbp),%rax
1434 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001435 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001436 addq %rax,%r14
1437 adcq %rdx,%r15
1438 movdqa %xmm9,80(%rbp)
1439 paddd %xmm7,%xmm3
1440 pxor %xmm3,%xmm15
1441 pshufb .rol16(%rip),%xmm15
1442 paddd %xmm15,%xmm11
1443 pxor %xmm11,%xmm7
1444 movdqa %xmm7,%xmm9
1445 pslld $12,%xmm9
1446 psrld $20,%xmm7
1447 pxor %xmm9,%xmm7
1448 paddd %xmm7,%xmm3
1449 pxor %xmm3,%xmm15
1450 pshufb .rol8(%rip),%xmm15
1451 paddd %xmm15,%xmm11
1452 pxor %xmm11,%xmm7
1453 movdqa %xmm7,%xmm9
1454 pslld $7,%xmm9
1455 psrld $25,%xmm7
1456 pxor %xmm9,%xmm7
1457.byte 102,15,58,15,255,4
1458.byte 102,69,15,58,15,219,8
1459.byte 102,69,15,58,15,255,12
1460 movdqa 80(%rbp),%xmm9
1461 movq 8+0(%rbp),%rax
1462 movq %rax,%r9
1463 mulq %r10
1464 addq %rax,%r14
1465 adcq $0,%rdx
1466 movq %rdx,%r10
1467 movq 8+0(%rbp),%rax
1468 mulq %r11
1469 addq %rax,%r15
1470 adcq $0,%rdx
1471 movdqa %xmm11,80(%rbp)
1472 paddd %xmm4,%xmm0
1473 pxor %xmm0,%xmm12
1474 pshufb .rol16(%rip),%xmm12
1475 paddd %xmm12,%xmm8
1476 pxor %xmm8,%xmm4
1477 movdqa %xmm4,%xmm11
1478 pslld $12,%xmm11
1479 psrld $20,%xmm4
1480 pxor %xmm11,%xmm4
1481 paddd %xmm4,%xmm0
1482 pxor %xmm0,%xmm12
1483 pshufb .rol8(%rip),%xmm12
1484 paddd %xmm12,%xmm8
1485 pxor %xmm8,%xmm4
1486 movdqa %xmm4,%xmm11
1487 pslld $7,%xmm11
1488 psrld $25,%xmm4
1489 pxor %xmm11,%xmm4
1490.byte 102,15,58,15,228,12
1491.byte 102,69,15,58,15,192,8
1492.byte 102,69,15,58,15,228,4
1493 paddd %xmm5,%xmm1
1494 pxor %xmm1,%xmm13
1495 pshufb .rol16(%rip),%xmm13
1496 paddd %xmm13,%xmm9
1497 pxor %xmm9,%xmm5
1498 movdqa %xmm5,%xmm11
1499 pslld $12,%xmm11
1500 psrld $20,%xmm5
1501 pxor %xmm11,%xmm5
1502 paddd %xmm5,%xmm1
1503 pxor %xmm1,%xmm13
1504 pshufb .rol8(%rip),%xmm13
1505 paddd %xmm13,%xmm9
1506 pxor %xmm9,%xmm5
1507 movdqa %xmm5,%xmm11
1508 pslld $7,%xmm11
1509 psrld $25,%xmm5
1510 pxor %xmm11,%xmm5
1511.byte 102,15,58,15,237,12
1512.byte 102,69,15,58,15,201,8
1513.byte 102,69,15,58,15,237,4
Robert Sloan4d1ac502017-02-06 08:36:14 -08001514 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001515 addq %r10,%r15
1516 adcq %rdx,%r9
1517 paddd %xmm6,%xmm2
1518 pxor %xmm2,%xmm14
1519 pshufb .rol16(%rip),%xmm14
1520 paddd %xmm14,%xmm10
1521 pxor %xmm10,%xmm6
1522 movdqa %xmm6,%xmm11
1523 pslld $12,%xmm11
1524 psrld $20,%xmm6
1525 pxor %xmm11,%xmm6
1526 paddd %xmm6,%xmm2
1527 pxor %xmm2,%xmm14
1528 pshufb .rol8(%rip),%xmm14
1529 paddd %xmm14,%xmm10
1530 pxor %xmm10,%xmm6
1531 movdqa %xmm6,%xmm11
1532 pslld $7,%xmm11
1533 psrld $25,%xmm6
1534 pxor %xmm11,%xmm6
1535.byte 102,15,58,15,246,12
1536.byte 102,69,15,58,15,210,8
1537.byte 102,69,15,58,15,246,4
1538 movdqa 80(%rbp),%xmm11
1539 movq %r13,%r10
1540 movq %r14,%r11
1541 movq %r15,%r12
1542 andq $3,%r12
1543 movq %r15,%r13
1544 andq $-4,%r13
1545 movq %r9,%r14
1546 shrdq $2,%r9,%r15
1547 shrq $2,%r9
1548 addq %r13,%r10
1549 adcq %r14,%r11
1550 adcq $0,%r12
1551 addq %r15,%r10
1552 adcq %r9,%r11
1553 adcq $0,%r12
1554 movdqa %xmm9,80(%rbp)
1555 paddd %xmm7,%xmm3
1556 pxor %xmm3,%xmm15
1557 pshufb .rol16(%rip),%xmm15
1558 paddd %xmm15,%xmm11
1559 pxor %xmm11,%xmm7
1560 movdqa %xmm7,%xmm9
1561 pslld $12,%xmm9
1562 psrld $20,%xmm7
1563 pxor %xmm9,%xmm7
1564 paddd %xmm7,%xmm3
1565 pxor %xmm3,%xmm15
1566 pshufb .rol8(%rip),%xmm15
1567 paddd %xmm15,%xmm11
1568 pxor %xmm11,%xmm7
1569 movdqa %xmm7,%xmm9
1570 pslld $7,%xmm9
1571 psrld $25,%xmm7
1572 pxor %xmm9,%xmm7
1573.byte 102,15,58,15,255,12
1574.byte 102,69,15,58,15,219,8
1575.byte 102,69,15,58,15,255,4
1576 movdqa 80(%rbp),%xmm9
1577
1578 addq $16,%r8
1579 cmpq $160,%r8
1580 jb 1b
1581 movq %rbx,%rcx
1582 andq $-16,%rcx
15831:
1584 addq 0(%rsi,%r8), %r10
1585 adcq 8+0(%rsi,%r8), %r11
1586 adcq $1,%r12
1587 movq 0+0(%rbp),%rax
1588 movq %rax,%r15
1589 mulq %r10
1590 movq %rax,%r13
1591 movq %rdx,%r14
1592 movq 0+0(%rbp),%rax
1593 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001594 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001595 addq %rax,%r14
1596 adcq %rdx,%r15
1597 movq 8+0(%rbp),%rax
1598 movq %rax,%r9
1599 mulq %r10
1600 addq %rax,%r14
1601 adcq $0,%rdx
1602 movq %rdx,%r10
1603 movq 8+0(%rbp),%rax
1604 mulq %r11
1605 addq %rax,%r15
1606 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001607 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001608 addq %r10,%r15
1609 adcq %rdx,%r9
1610 movq %r13,%r10
1611 movq %r14,%r11
1612 movq %r15,%r12
1613 andq $3,%r12
1614 movq %r15,%r13
1615 andq $-4,%r13
1616 movq %r9,%r14
1617 shrdq $2,%r9,%r15
1618 shrq $2,%r9
1619 addq %r13,%r10
1620 adcq %r14,%r11
1621 adcq $0,%r12
1622 addq %r15,%r10
1623 adcq %r9,%r11
1624 adcq $0,%r12
1625
1626 addq $16,%r8
1627 cmpq %rcx,%r8
1628 jb 1b
1629 paddd .chacha20_consts(%rip),%xmm3
1630 paddd 48(%rbp),%xmm7
1631 paddd 64(%rbp),%xmm11
1632 paddd 144(%rbp),%xmm15
1633 paddd .chacha20_consts(%rip),%xmm2
1634 paddd 48(%rbp),%xmm6
1635 paddd 64(%rbp),%xmm10
1636 paddd 128(%rbp),%xmm14
1637 paddd .chacha20_consts(%rip),%xmm1
1638 paddd 48(%rbp),%xmm5
1639 paddd 64(%rbp),%xmm9
1640 paddd 112(%rbp),%xmm13
1641 paddd .chacha20_consts(%rip),%xmm0
1642 paddd 48(%rbp),%xmm4
1643 paddd 64(%rbp),%xmm8
1644 paddd 96(%rbp),%xmm12
1645 movdqa %xmm12,80(%rbp)
1646 movdqu 0 + 0(%rsi),%xmm12
1647 pxor %xmm3,%xmm12
1648 movdqu %xmm12,0 + 0(%rdi)
1649 movdqu 16 + 0(%rsi),%xmm12
1650 pxor %xmm7,%xmm12
1651 movdqu %xmm12,16 + 0(%rdi)
1652 movdqu 32 + 0(%rsi),%xmm12
1653 pxor %xmm11,%xmm12
1654 movdqu %xmm12,32 + 0(%rdi)
1655 movdqu 48 + 0(%rsi),%xmm12
1656 pxor %xmm15,%xmm12
1657 movdqu %xmm12,48 + 0(%rdi)
1658 movdqu 0 + 64(%rsi),%xmm3
1659 movdqu 16 + 64(%rsi),%xmm7
1660 movdqu 32 + 64(%rsi),%xmm11
1661 movdqu 48 + 64(%rsi),%xmm15
1662 pxor %xmm3,%xmm2
1663 pxor %xmm7,%xmm6
1664 pxor %xmm11,%xmm10
1665 pxor %xmm14,%xmm15
1666 movdqu %xmm2,0 + 64(%rdi)
1667 movdqu %xmm6,16 + 64(%rdi)
1668 movdqu %xmm10,32 + 64(%rdi)
1669 movdqu %xmm15,48 + 64(%rdi)
1670 movdqu 0 + 128(%rsi),%xmm3
1671 movdqu 16 + 128(%rsi),%xmm7
1672 movdqu 32 + 128(%rsi),%xmm11
1673 movdqu 48 + 128(%rsi),%xmm15
1674 pxor %xmm3,%xmm1
1675 pxor %xmm7,%xmm5
1676 pxor %xmm11,%xmm9
1677 pxor %xmm13,%xmm15
1678 movdqu %xmm1,0 + 128(%rdi)
1679 movdqu %xmm5,16 + 128(%rdi)
1680 movdqu %xmm9,32 + 128(%rdi)
1681 movdqu %xmm15,48 + 128(%rdi)
1682
1683 movdqa 80(%rbp),%xmm12
1684 subq $192,%rbx
1685 leaq 192(%rsi),%rsi
1686 leaq 192(%rdi),%rdi
1687
1688
# open_sse_tail_64_dec_loop: XOR the remaining input 16 bytes at a time.
#   rbx = bytes still to process, rsi = input pointer, rdi = output pointer.
#   xmm0 is XORed into the next 16-byte chunk; xmm4, xmm8 and xmm12 are queued
#   behind it and move down one slot after every chunk (presumably the four
#   rows of one keystream block - the code that fills them is outside this
#   section).
# No Poly1305 work is done in this loop.
1689open_sse_tail_64_dec_loop:
1690 cmpq $16,%rbx
# Fewer than 16 bytes left: leave the loop through local label 1 below.
1691 jb 1f
1692 subq $16,%rbx
1693 movdqu (%rsi),%xmm3
1694 pxor %xmm3,%xmm0
1695 movdqu %xmm0,(%rdi)
1696 leaq 16(%rsi),%rsi
1697 leaq 16(%rdi),%rdi
# Shift the queue: xmm4 -> xmm0, xmm8 -> xmm4, xmm12 -> xmm8.
1698 movdqa %xmm4,%xmm0
1699 movdqa %xmm8,%xmm4
1700 movdqa %xmm12,%xmm8
1701 jmp open_sse_tail_64_dec_loop
17021:
# Execution falls through into open_sse_tail_16, which XORs with xmm1, so the
# next unused 16-byte value is copied there.
1703 movdqa %xmm0,%xmm1
1704
1705
# open_sse_tail_16: handle a final partial chunk of rbx bytes (this path is
# entered with rbx below 16 from the loops that jump or fall through here).
#   rsi = input pointer, rdi = output pointer, xmm1 = 16-byte value to XOR with,
#   r10:r11:r12 = Poly1305 accumulator, 0(%rbp) and 8(%rbp) = the two 64-bit
#   halves of the multiplier used by the hash step.
# The input bytes are gathered one at a time so that nothing beyond the last
# input byte is read; the gathered block is zero-padded to 16 bytes.
# Clobbers rax, rdx, r8, r9, r13, r14, r15, xmm3 and flags.
1706open_sse_tail_16:
1707 testq %rbx,%rbx
# Nothing left: go straight to the final hash block and tag output.
1708 jz open_sse_finalize
1709
1710
1711
# Build the padded block in xmm3, starting from the LAST input byte and
# shifting left by one byte per step, so the first input byte ends up in lane 0.
1712 pxor %xmm3,%xmm3
1713 leaq -1(%rsi,%rbx), %rsi
1714 movq %rbx,%r8
17152:
1716 pslldq $1,%xmm3
1717 pinsrb $0,(%rsi),%xmm3
1718 subq $1,%rsi
1719 subq $1,%r8
1720 jnz 2b
1721
# Local label 3 has no reference inside this section.
17223:
# The .byte sequence encodes: movq %xmm3,%r13 (low 64 bits of the block).
# Together with the pextrq below, r13:r14 = the zero-padded INPUT block, taken
# before the XOR so that the hash covers the input bytes, not the output.
1723.byte 102,73,15,126,221
1724 pextrq $1,%xmm3,%r14
1725
1726 pxor %xmm1,%xmm3
1727
1728
# Store exactly rbx output bytes, one per iteration, lowest lane first.
17292:
1730 pextrb $0,%xmm3,(%rdi)
1731 psrldq $1,%xmm3
1732 addq $1,%rdi
1733 subq $1,%rbx
1734 jne 2b
1735
# Poly1305 step: accumulator += block + 2^128 (the adcq $1 supplies bit 128).
1736 addq %r13,%r10
1737 adcq %r14,%r11
1738 adcq $1,%r12
# Multiply the three-limb accumulator by the two-limb value at 0(%rbp)/8(%rbp).
# Partial products are collected in r13 (limb 0), r14 (limb 1), r15 (limb 2)
# and r9 (limb 3); the small top limb r12 is multiplied with imulq.
1739 movq 0+0(%rbp),%rax
1740 movq %rax,%r15
1741 mulq %r10
1742 movq %rax,%r13
1743 movq %rdx,%r14
1744 movq 0+0(%rbp),%rax
1745 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001746 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001747 addq %rax,%r14
1748 adcq %rdx,%r15
1749 movq 8+0(%rbp),%rax
1750 movq %rax,%r9
1751 mulq %r10
1752 addq %rax,%r14
1753 adcq $0,%rdx
1754 movq %rdx,%r10
1755 movq 8+0(%rbp),%rax
1756 mulq %r11
1757 addq %rax,%r15
1758 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001759 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001760 addq %r10,%r15
1761 adcq %rdx,%r9
# Partial reduction modulo 2^130 - 5: keep bits 0..129 in r10:r11:r12 and add
# back 5*c, where c is everything from bit 130 upward. 5*c is formed as
# 4*c (r15 with its low two bits cleared, paired with r9) plus c (r9:r15
# shifted right by two).
1762 movq %r13,%r10
1763 movq %r14,%r11
1764 movq %r15,%r12
1765 andq $3,%r12
1766 movq %r15,%r13
1767 andq $-4,%r13
1768 movq %r9,%r14
1769 shrdq $2,%r9,%r15
1770 shrq $2,%r9
1771 addq %r13,%r10
1772 adcq %r14,%r11
1773 adcq $0,%r12
1774 addq %r15,%r10
1775 adcq %r9,%r11
1776 adcq $0,%r12
1777
1778
# open_sse_finalize: absorb the last Poly1305 block, finish the reduction,
# write the 16-byte tag and return to the caller.
#   r10:r11:r12 = Poly1305 accumulator on entry.
#   0(%rbp)/8(%rbp)   = multiplier halves used by the hash step.
#   16(%rbp)/24(%rbp) = 128-bit value added to the reduced accumulator.
#   32(%rbp)/40(%rbp) = the final 16-byte block. The seal prologue in this file
#                       stores r8 and rdx there; the open prologue is not
#                       visible here and is assumed to do the same - TODO confirm.
# (Execution reaches this label by fall-through from open_sse_tail_16 or by its
# jz when no bytes remain.)
1779open_sse_finalize:
# accumulator += final block + 2^128.
1780 addq 32(%rbp),%r10
1781 adcq 8+32(%rbp),%r11
1782 adcq $1,%r12
# Multiply by the value at 0(%rbp)/8(%rbp); products land in r13, r14, r15, r9.
1783 movq 0+0(%rbp),%rax
1784 movq %rax,%r15
1785 mulq %r10
1786 movq %rax,%r13
1787 movq %rdx,%r14
1788 movq 0+0(%rbp),%rax
1789 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08001790 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05001791 addq %rax,%r14
1792 adcq %rdx,%r15
1793 movq 8+0(%rbp),%rax
1794 movq %rax,%r9
1795 mulq %r10
1796 addq %rax,%r14
1797 adcq $0,%rdx
1798 movq %rdx,%r10
1799 movq 8+0(%rbp),%rax
1800 mulq %r11
1801 addq %rax,%r15
1802 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08001803 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05001804 addq %r10,%r15
1805 adcq %rdx,%r9
# Partial reduction modulo 2^130 - 5 (low 130 bits plus 5 times the rest).
1806 movq %r13,%r10
1807 movq %r14,%r11
1808 movq %r15,%r12
1809 andq $3,%r12
1810 movq %r15,%r13
1811 andq $-4,%r13
1812 movq %r9,%r14
1813 shrdq $2,%r9,%r15
1814 shrq $2,%r9
1815 addq %r13,%r10
1816 adcq %r14,%r11
1817 adcq $0,%r12
1818 addq %r15,%r10
1819 adcq %r9,%r11
1820 adcq $0,%r12
1821
1822
# Full reduction without a branch: subtract 2^130 - 5 (limbs -5, -1, 3 as
# unsigned 64-bit values). A final borrow means the accumulator was already
# below the modulus, so the saved copy in r13:r14:r15 is restored by cmovc.
1823 movq %r10,%r13
1824 movq %r11,%r14
1825 movq %r12,%r15
1826 subq $-5,%r10
1827 sbbq $-1,%r11
1828 sbbq $3,%r12
1829 cmovcq %r13,%r10
1830 cmovcq %r14,%r11
1831 cmovcq %r15,%r12
1832
# Tag = (accumulator + value at 16(%rbp)) mod 2^128; the carry out of r11 is
# deliberately dropped.
1833 addq 0+16(%rbp),%r10
1834 adcq 8+16(%rbp),%r11
1835
# Release the stack frame (same 288 + 32 bytes the seal prologue allocates).
1836 addq $288 + 32,%rsp
Robert Sloana94fe052017-02-21 08:49:28 -08001837
# Recover the pointer that was pushed just before the frame was allocated
# (presumably the r9 argument, mirroring the seal prologue) and write the
# 16-byte tag through it.
David Benjaminf31229b2017-01-25 14:08:15 -05001838 popq %r9
Robert Sloana94fe052017-02-21 08:49:28 -08001839
David Benjaminf31229b2017-01-25 14:08:15 -05001840 movq %r10,(%r9)
1841 movq %r11,8(%r9)
1842
# Restore the saved registers in the reverse of the push order.
1843 popq %r15
Robert Sloana94fe052017-02-21 08:49:28 -08001844
David Benjaminf31229b2017-01-25 14:08:15 -05001845 popq %r14
Robert Sloana94fe052017-02-21 08:49:28 -08001846
David Benjaminf31229b2017-01-25 14:08:15 -05001847 popq %r13
Robert Sloana94fe052017-02-21 08:49:28 -08001848
David Benjaminf31229b2017-01-25 14:08:15 -05001849 popq %r12
Robert Sloana94fe052017-02-21 08:49:28 -08001850
David Benjaminf31229b2017-01-25 14:08:15 -05001851 popq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08001852
David Benjaminf31229b2017-01-25 14:08:15 -05001853 popq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08001854
# The two bytes encode "rep ret" (a return with a rep prefix).
David Benjaminf31229b2017-01-25 14:08:15 -05001855 .byte 0xf3,0xc3
Robert Sloana94fe052017-02-21 08:49:28 -08001856
David Benjaminf31229b2017-01-25 14:08:15 -05001857
# open_sse_128: short-input path (the queue built here holds 8 x 16 = 128 bytes
# of XOR material, matching the label name).
#   r9  = pointer to 48 bytes of state: rows at 0(%r9), 16(%r9) and 32(%r9).
#   rbx = bytes to process, rsi = input pointer, rdi = output pointer.
#   rbp = base of the scratch area; 0(%rbp) and 16(%rbp) are written here.
# Three ChaCha20 blocks are computed side by side:
#   block 0 = xmm0/xmm4/xmm8/xmm12   (last row as loaded)
#   block 1 = xmm1/xmm5/xmm9/xmm13   (last row + .sse_inc)
#   block 2 = xmm2/xmm6/xmm10/xmm14  (last row + 2 * .sse_inc)
# xmm7, xmm11 and xmm15 keep copies of the input rows for the final additions.
# xmm3 is the scratch register for the rotates.
1858open_sse_128:
1859 movdqu .chacha20_consts(%rip),%xmm0
1860 movdqa %xmm0,%xmm1
1861 movdqa %xmm0,%xmm2
1862 movdqu 0(%r9),%xmm4
1863 movdqa %xmm4,%xmm5
1864 movdqa %xmm4,%xmm6
1865 movdqu 16(%r9),%xmm8
1866 movdqa %xmm8,%xmm9
1867 movdqa %xmm8,%xmm10
1868 movdqu 32(%r9),%xmm12
1869 movdqa %xmm12,%xmm13
1870 paddd .sse_inc(%rip),%xmm13
1871 movdqa %xmm13,%xmm14
1872 paddd .sse_inc(%rip),%xmm14
1873 movdqa %xmm4,%xmm7
1874 movdqa %xmm8,%xmm11
# xmm15 saves the last row of block 1 (xmm13), not of block 0.
1875 movdqa %xmm13,%xmm15
# Ten iterations; each iteration applies two rounds to all three blocks.
1876 movq $10,%r10
18771:
# First round of the pair, block 0. Pattern used for every block below:
# add, xor, rotate left 16 (pshufb .rol16); add, xor, rotate left 12
# (pslld 12 combined with psrld 20); add, xor, rotate left 8 (pshufb .rol8);
# add, xor, rotate left 7 (pslld 7 combined with psrld 25).
1878 paddd %xmm4,%xmm0
1879 pxor %xmm0,%xmm12
1880 pshufb .rol16(%rip),%xmm12
1881 paddd %xmm12,%xmm8
1882 pxor %xmm8,%xmm4
1883 movdqa %xmm4,%xmm3
1884 pslld $12,%xmm3
1885 psrld $20,%xmm4
1886 pxor %xmm3,%xmm4
1887 paddd %xmm4,%xmm0
1888 pxor %xmm0,%xmm12
1889 pshufb .rol8(%rip),%xmm12
1890 paddd %xmm12,%xmm8
1891 pxor %xmm8,%xmm4
1892 movdqa %xmm4,%xmm3
1893 pslld $7,%xmm3
1894 psrld $25,%xmm4
1895 pxor %xmm3,%xmm4
# The three .byte lines encode palignr $4,%xmm4,%xmm4 / palignr $8,%xmm8,%xmm8 /
# palignr $12,%xmm12,%xmm12: rotate the rows so the next round mixes diagonals.
1896.byte 102,15,58,15,228,4
1897.byte 102,69,15,58,15,192,8
1898.byte 102,69,15,58,15,228,12
# First round of the pair, block 1.
1899 paddd %xmm5,%xmm1
1900 pxor %xmm1,%xmm13
1901 pshufb .rol16(%rip),%xmm13
1902 paddd %xmm13,%xmm9
1903 pxor %xmm9,%xmm5
1904 movdqa %xmm5,%xmm3
1905 pslld $12,%xmm3
1906 psrld $20,%xmm5
1907 pxor %xmm3,%xmm5
1908 paddd %xmm5,%xmm1
1909 pxor %xmm1,%xmm13
1910 pshufb .rol8(%rip),%xmm13
1911 paddd %xmm13,%xmm9
1912 pxor %xmm9,%xmm5
1913 movdqa %xmm5,%xmm3
1914 pslld $7,%xmm3
1915 psrld $25,%xmm5
1916 pxor %xmm3,%xmm5
# palignr $4 on xmm5, $8 on xmm9, $12 on xmm13.
1917.byte 102,15,58,15,237,4
1918.byte 102,69,15,58,15,201,8
1919.byte 102,69,15,58,15,237,12
# First round of the pair, block 2.
1920 paddd %xmm6,%xmm2
1921 pxor %xmm2,%xmm14
1922 pshufb .rol16(%rip),%xmm14
1923 paddd %xmm14,%xmm10
1924 pxor %xmm10,%xmm6
1925 movdqa %xmm6,%xmm3
1926 pslld $12,%xmm3
1927 psrld $20,%xmm6
1928 pxor %xmm3,%xmm6
1929 paddd %xmm6,%xmm2
1930 pxor %xmm2,%xmm14
1931 pshufb .rol8(%rip),%xmm14
1932 paddd %xmm14,%xmm10
1933 pxor %xmm10,%xmm6
1934 movdqa %xmm6,%xmm3
1935 pslld $7,%xmm3
1936 psrld $25,%xmm6
1937 pxor %xmm3,%xmm6
# palignr $4 on xmm6, $8 on xmm10, $12 on xmm14.
1938.byte 102,15,58,15,246,4
1939.byte 102,69,15,58,15,210,8
1940.byte 102,69,15,58,15,246,12
# Second round of the pair, block 0 (rows are in their rotated positions).
1941 paddd %xmm4,%xmm0
1942 pxor %xmm0,%xmm12
1943 pshufb .rol16(%rip),%xmm12
1944 paddd %xmm12,%xmm8
1945 pxor %xmm8,%xmm4
1946 movdqa %xmm4,%xmm3
1947 pslld $12,%xmm3
1948 psrld $20,%xmm4
1949 pxor %xmm3,%xmm4
1950 paddd %xmm4,%xmm0
1951 pxor %xmm0,%xmm12
1952 pshufb .rol8(%rip),%xmm12
1953 paddd %xmm12,%xmm8
1954 pxor %xmm8,%xmm4
1955 movdqa %xmm4,%xmm3
1956 pslld $7,%xmm3
1957 psrld $25,%xmm4
1958 pxor %xmm3,%xmm4
# Undo the row rotation: palignr $12 on xmm4, $8 on xmm8, $4 on xmm12.
1959.byte 102,15,58,15,228,12
1960.byte 102,69,15,58,15,192,8
1961.byte 102,69,15,58,15,228,4
# Second round of the pair, block 1.
1962 paddd %xmm5,%xmm1
1963 pxor %xmm1,%xmm13
1964 pshufb .rol16(%rip),%xmm13
1965 paddd %xmm13,%xmm9
1966 pxor %xmm9,%xmm5
1967 movdqa %xmm5,%xmm3
1968 pslld $12,%xmm3
1969 psrld $20,%xmm5
1970 pxor %xmm3,%xmm5
1971 paddd %xmm5,%xmm1
1972 pxor %xmm1,%xmm13
1973 pshufb .rol8(%rip),%xmm13
1974 paddd %xmm13,%xmm9
1975 pxor %xmm9,%xmm5
1976 movdqa %xmm5,%xmm3
1977 pslld $7,%xmm3
1978 psrld $25,%xmm5
1979 pxor %xmm3,%xmm5
# palignr $12 on xmm5, $8 on xmm9, $4 on xmm13.
1980.byte 102,15,58,15,237,12
1981.byte 102,69,15,58,15,201,8
1982.byte 102,69,15,58,15,237,4
# Second round of the pair, block 2.
1983 paddd %xmm6,%xmm2
1984 pxor %xmm2,%xmm14
1985 pshufb .rol16(%rip),%xmm14
1986 paddd %xmm14,%xmm10
1987 pxor %xmm10,%xmm6
1988 movdqa %xmm6,%xmm3
1989 pslld $12,%xmm3
1990 psrld $20,%xmm6
1991 pxor %xmm3,%xmm6
1992 paddd %xmm6,%xmm2
1993 pxor %xmm2,%xmm14
1994 pshufb .rol8(%rip),%xmm14
1995 paddd %xmm14,%xmm10
1996 pxor %xmm10,%xmm6
1997 movdqa %xmm6,%xmm3
1998 pslld $7,%xmm3
1999 psrld $25,%xmm6
2000 pxor %xmm3,%xmm6
# palignr $12 on xmm6, $8 on xmm10, $4 on xmm14.
2001.byte 102,15,58,15,246,12
2002.byte 102,69,15,58,15,210,8
2003.byte 102,69,15,58,15,246,4
2004
2005 decq %r10
2006 jnz 1b
# Add the input state back in. Block 0 only needs its first two rows (xmm0 and
# xmm4), so xmm8 and xmm12 are left as they are. xmm15 enters holding the last
# row of block 1 and is bumped by .sse_inc before being added to block 2.
2007 paddd .chacha20_consts(%rip),%xmm0
2008 paddd .chacha20_consts(%rip),%xmm1
2009 paddd .chacha20_consts(%rip),%xmm2
2010 paddd %xmm7,%xmm4
2011 paddd %xmm7,%xmm5
2012 paddd %xmm7,%xmm6
2013 paddd %xmm11,%xmm9
2014 paddd %xmm11,%xmm10
2015 paddd %xmm15,%xmm13
2016 paddd .sse_inc(%rip),%xmm15
2017 paddd %xmm15,%xmm14
2018
# First 32 bytes of block 0 become the one-time hash key: the first 16 bytes,
# masked with .clamp, go to 0(%rbp) (the multiplier used by the hash steps);
# the next 16 bytes go to 16(%rbp) (added to the accumulator at finalize).
2019 pand .clamp(%rip),%xmm0
2020 movdqa %xmm0,0(%rbp)
2021 movdqa %xmm4,16(%rbp)
2022
# The move below copies r8 onto itself and has no effect as written.
2023 movq %r8,%r8
# poly_hash_ad_internal is defined outside this section; presumably it hashes
# the additional data and leaves the accumulator in r10:r11:r12 - TODO confirm.
2024 call poly_hash_ad_internal
# Main loop: while at least 16 bytes remain, hash the input chunk, XOR it with
# the head of the queue (xmm1), store it, then advance the queue.
20251:
2026 cmpq $16,%rbx
2027 jb open_sse_tail_16
2028 subq $16,%rbx
# accumulator += input chunk + 2^128 (input is hashed before it is XORed).
2029 addq 0(%rsi),%r10
2030 adcq 8+0(%rsi),%r11
2031 adcq $1,%r12
2032
2033
2034 movdqu 0(%rsi),%xmm3
2035 pxor %xmm3,%xmm1
2036 movdqu %xmm1,0(%rdi)
2037 leaq 16(%rsi),%rsi
2038 leaq 16(%rdi),%rdi
# Multiply the accumulator by the value at 0(%rbp)/8(%rbp); products are
# collected in r13, r14, r15 and r9.
2039 movq 0+0(%rbp),%rax
2040 movq %rax,%r15
2041 mulq %r10
2042 movq %rax,%r13
2043 movq %rdx,%r14
2044 movq 0+0(%rbp),%rax
2045 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002046 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002047 addq %rax,%r14
2048 adcq %rdx,%r15
2049 movq 8+0(%rbp),%rax
2050 movq %rax,%r9
2051 mulq %r10
2052 addq %rax,%r14
2053 adcq $0,%rdx
2054 movq %rdx,%r10
2055 movq 8+0(%rbp),%rax
2056 mulq %r11
2057 addq %rax,%r15
2058 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002059 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002060 addq %r10,%r15
2061 adcq %rdx,%r9
# Partial reduction modulo 2^130 - 5 (low 130 bits plus 5 times the rest).
2062 movq %r13,%r10
2063 movq %r14,%r11
2064 movq %r15,%r12
2065 andq $3,%r12
2066 movq %r15,%r13
2067 andq $-4,%r13
2068 movq %r9,%r14
2069 shrdq $2,%r9,%r15
2070 shrq $2,%r9
2071 addq %r13,%r10
2072 adcq %r14,%r11
2073 adcq $0,%r12
2074 addq %r15,%r10
2075 adcq %r9,%r11
2076 adcq $0,%r12
2077
2078
# Advance the queue by one slot:
# xmm1 <- xmm5 <- xmm9 <- xmm13 <- xmm2 <- xmm6 <- xmm10 <- xmm14.
2079 movdqa %xmm5,%xmm1
2080 movdqa %xmm9,%xmm5
2081 movdqa %xmm13,%xmm9
2082 movdqa %xmm2,%xmm13
2083 movdqa %xmm6,%xmm2
2084 movdqa %xmm10,%xmm6
2085 movdqa %xmm14,%xmm10
2086 jmp 1b
# Not reachable: the jump above is unconditional, and the loop leaves through
# the jb at its top.
2087 jmp open_sse_tail_16
2088
Robert Sloana94fe052017-02-21 08:49:28 -08002089
David Benjaminf31229b2017-01-25 14:08:15 -05002090
2091
2092
2093
2094.globl _chacha20_poly1305_seal
2095.private_extern _chacha20_poly1305_seal
2096
2097.p2align 6
2098_chacha20_poly1305_seal:
Robert Sloana94fe052017-02-21 08:49:28 -08002099
David Benjaminf31229b2017-01-25 14:08:15 -05002100 pushq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002101
David Benjaminf31229b2017-01-25 14:08:15 -05002102 pushq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002103
David Benjaminf31229b2017-01-25 14:08:15 -05002104 pushq %r12
Robert Sloana94fe052017-02-21 08:49:28 -08002105
David Benjaminf31229b2017-01-25 14:08:15 -05002106 pushq %r13
Robert Sloana94fe052017-02-21 08:49:28 -08002107
David Benjaminf31229b2017-01-25 14:08:15 -05002108 pushq %r14
Robert Sloana94fe052017-02-21 08:49:28 -08002109
David Benjaminf31229b2017-01-25 14:08:15 -05002110 pushq %r15
Robert Sloana94fe052017-02-21 08:49:28 -08002111
David Benjaminf31229b2017-01-25 14:08:15 -05002112
2113
2114 pushq %r9
Robert Sloana94fe052017-02-21 08:49:28 -08002115
David Benjaminf31229b2017-01-25 14:08:15 -05002116 subq $288 + 32,%rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002117
2118
2119
2120
2121
2122
2123
David Benjaminf31229b2017-01-25 14:08:15 -05002124 leaq 32(%rsp),%rbp
2125 andq $-32,%rbp
2126 movq %rdx,8+32(%rbp)
2127 movq %r8,0+32(%rbp)
2128 movq %rdx,%rbx
2129
2130 movl _OPENSSL_ia32cap_P+8(%rip),%eax
2131 andl $288,%eax
2132 xorl $288,%eax
2133 jz chacha20_poly1305_seal_avx2
2134
2135 cmpq $128,%rbx
2136 jbe seal_sse_128
2137
2138 movdqa .chacha20_consts(%rip),%xmm0
2139 movdqu 0(%r9),%xmm4
2140 movdqu 16(%r9),%xmm8
2141 movdqu 32(%r9),%xmm12
2142 movdqa %xmm0,%xmm1
2143 movdqa %xmm0,%xmm2
2144 movdqa %xmm0,%xmm3
2145 movdqa %xmm4,%xmm5
2146 movdqa %xmm4,%xmm6
2147 movdqa %xmm4,%xmm7
2148 movdqa %xmm8,%xmm9
2149 movdqa %xmm8,%xmm10
2150 movdqa %xmm8,%xmm11
2151 movdqa %xmm12,%xmm15
2152 paddd .sse_inc(%rip),%xmm12
2153 movdqa %xmm12,%xmm14
2154 paddd .sse_inc(%rip),%xmm12
2155 movdqa %xmm12,%xmm13
2156 paddd .sse_inc(%rip),%xmm12
2157
2158 movdqa %xmm4,48(%rbp)
2159 movdqa %xmm8,64(%rbp)
2160 movdqa %xmm12,96(%rbp)
2161 movdqa %xmm13,112(%rbp)
2162 movdqa %xmm14,128(%rbp)
2163 movdqa %xmm15,144(%rbp)
2164 movq $10,%r10
21651:
2166 movdqa %xmm8,80(%rbp)
2167 movdqa .rol16(%rip),%xmm8
2168 paddd %xmm7,%xmm3
2169 paddd %xmm6,%xmm2
2170 paddd %xmm5,%xmm1
2171 paddd %xmm4,%xmm0
2172 pxor %xmm3,%xmm15
2173 pxor %xmm2,%xmm14
2174 pxor %xmm1,%xmm13
2175 pxor %xmm0,%xmm12
2176.byte 102,69,15,56,0,248
2177.byte 102,69,15,56,0,240
2178.byte 102,69,15,56,0,232
2179.byte 102,69,15,56,0,224
2180 movdqa 80(%rbp),%xmm8
2181 paddd %xmm15,%xmm11
2182 paddd %xmm14,%xmm10
2183 paddd %xmm13,%xmm9
2184 paddd %xmm12,%xmm8
2185 pxor %xmm11,%xmm7
2186 pxor %xmm10,%xmm6
2187 pxor %xmm9,%xmm5
2188 pxor %xmm8,%xmm4
2189 movdqa %xmm8,80(%rbp)
2190 movdqa %xmm7,%xmm8
2191 psrld $20,%xmm8
2192 pslld $32-20,%xmm7
2193 pxor %xmm8,%xmm7
2194 movdqa %xmm6,%xmm8
2195 psrld $20,%xmm8
2196 pslld $32-20,%xmm6
2197 pxor %xmm8,%xmm6
2198 movdqa %xmm5,%xmm8
2199 psrld $20,%xmm8
2200 pslld $32-20,%xmm5
2201 pxor %xmm8,%xmm5
2202 movdqa %xmm4,%xmm8
2203 psrld $20,%xmm8
2204 pslld $32-20,%xmm4
2205 pxor %xmm8,%xmm4
2206 movdqa .rol8(%rip),%xmm8
2207 paddd %xmm7,%xmm3
2208 paddd %xmm6,%xmm2
2209 paddd %xmm5,%xmm1
2210 paddd %xmm4,%xmm0
2211 pxor %xmm3,%xmm15
2212 pxor %xmm2,%xmm14
2213 pxor %xmm1,%xmm13
2214 pxor %xmm0,%xmm12
2215.byte 102,69,15,56,0,248
2216.byte 102,69,15,56,0,240
2217.byte 102,69,15,56,0,232
2218.byte 102,69,15,56,0,224
2219 movdqa 80(%rbp),%xmm8
2220 paddd %xmm15,%xmm11
2221 paddd %xmm14,%xmm10
2222 paddd %xmm13,%xmm9
2223 paddd %xmm12,%xmm8
2224 pxor %xmm11,%xmm7
2225 pxor %xmm10,%xmm6
2226 pxor %xmm9,%xmm5
2227 pxor %xmm8,%xmm4
2228 movdqa %xmm8,80(%rbp)
2229 movdqa %xmm7,%xmm8
2230 psrld $25,%xmm8
2231 pslld $32-25,%xmm7
2232 pxor %xmm8,%xmm7
2233 movdqa %xmm6,%xmm8
2234 psrld $25,%xmm8
2235 pslld $32-25,%xmm6
2236 pxor %xmm8,%xmm6
2237 movdqa %xmm5,%xmm8
2238 psrld $25,%xmm8
2239 pslld $32-25,%xmm5
2240 pxor %xmm8,%xmm5
2241 movdqa %xmm4,%xmm8
2242 psrld $25,%xmm8
2243 pslld $32-25,%xmm4
2244 pxor %xmm8,%xmm4
2245 movdqa 80(%rbp),%xmm8
2246.byte 102,15,58,15,255,4
2247.byte 102,69,15,58,15,219,8
2248.byte 102,69,15,58,15,255,12
2249.byte 102,15,58,15,246,4
2250.byte 102,69,15,58,15,210,8
2251.byte 102,69,15,58,15,246,12
2252.byte 102,15,58,15,237,4
2253.byte 102,69,15,58,15,201,8
2254.byte 102,69,15,58,15,237,12
2255.byte 102,15,58,15,228,4
2256.byte 102,69,15,58,15,192,8
2257.byte 102,69,15,58,15,228,12
2258 movdqa %xmm8,80(%rbp)
2259 movdqa .rol16(%rip),%xmm8
2260 paddd %xmm7,%xmm3
2261 paddd %xmm6,%xmm2
2262 paddd %xmm5,%xmm1
2263 paddd %xmm4,%xmm0
2264 pxor %xmm3,%xmm15
2265 pxor %xmm2,%xmm14
2266 pxor %xmm1,%xmm13
2267 pxor %xmm0,%xmm12
2268.byte 102,69,15,56,0,248
2269.byte 102,69,15,56,0,240
2270.byte 102,69,15,56,0,232
2271.byte 102,69,15,56,0,224
2272 movdqa 80(%rbp),%xmm8
2273 paddd %xmm15,%xmm11
2274 paddd %xmm14,%xmm10
2275 paddd %xmm13,%xmm9
2276 paddd %xmm12,%xmm8
2277 pxor %xmm11,%xmm7
2278 pxor %xmm10,%xmm6
2279 pxor %xmm9,%xmm5
2280 pxor %xmm8,%xmm4
2281 movdqa %xmm8,80(%rbp)
2282 movdqa %xmm7,%xmm8
2283 psrld $20,%xmm8
2284 pslld $32-20,%xmm7
2285 pxor %xmm8,%xmm7
2286 movdqa %xmm6,%xmm8
2287 psrld $20,%xmm8
2288 pslld $32-20,%xmm6
2289 pxor %xmm8,%xmm6
2290 movdqa %xmm5,%xmm8
2291 psrld $20,%xmm8
2292 pslld $32-20,%xmm5
2293 pxor %xmm8,%xmm5
2294 movdqa %xmm4,%xmm8
2295 psrld $20,%xmm8
2296 pslld $32-20,%xmm4
2297 pxor %xmm8,%xmm4
2298 movdqa .rol8(%rip),%xmm8
2299 paddd %xmm7,%xmm3
2300 paddd %xmm6,%xmm2
2301 paddd %xmm5,%xmm1
2302 paddd %xmm4,%xmm0
2303 pxor %xmm3,%xmm15
2304 pxor %xmm2,%xmm14
2305 pxor %xmm1,%xmm13
2306 pxor %xmm0,%xmm12
2307.byte 102,69,15,56,0,248
2308.byte 102,69,15,56,0,240
2309.byte 102,69,15,56,0,232
2310.byte 102,69,15,56,0,224
2311 movdqa 80(%rbp),%xmm8
2312 paddd %xmm15,%xmm11
2313 paddd %xmm14,%xmm10
2314 paddd %xmm13,%xmm9
2315 paddd %xmm12,%xmm8
2316 pxor %xmm11,%xmm7
2317 pxor %xmm10,%xmm6
2318 pxor %xmm9,%xmm5
2319 pxor %xmm8,%xmm4
2320 movdqa %xmm8,80(%rbp)
2321 movdqa %xmm7,%xmm8
2322 psrld $25,%xmm8
2323 pslld $32-25,%xmm7
2324 pxor %xmm8,%xmm7
2325 movdqa %xmm6,%xmm8
2326 psrld $25,%xmm8
2327 pslld $32-25,%xmm6
2328 pxor %xmm8,%xmm6
2329 movdqa %xmm5,%xmm8
2330 psrld $25,%xmm8
2331 pslld $32-25,%xmm5
2332 pxor %xmm8,%xmm5
2333 movdqa %xmm4,%xmm8
2334 psrld $25,%xmm8
2335 pslld $32-25,%xmm4
2336 pxor %xmm8,%xmm4
2337 movdqa 80(%rbp),%xmm8
2338.byte 102,15,58,15,255,12
2339.byte 102,69,15,58,15,219,8
2340.byte 102,69,15,58,15,255,4
2341.byte 102,15,58,15,246,12
2342.byte 102,69,15,58,15,210,8
2343.byte 102,69,15,58,15,246,4
2344.byte 102,15,58,15,237,12
2345.byte 102,69,15,58,15,201,8
2346.byte 102,69,15,58,15,237,4
2347.byte 102,15,58,15,228,12
2348.byte 102,69,15,58,15,192,8
2349.byte 102,69,15,58,15,228,4
2350
2351 decq %r10
2352 jnz 1b
2353 paddd .chacha20_consts(%rip),%xmm3
2354 paddd 48(%rbp),%xmm7
2355 paddd 64(%rbp),%xmm11
2356 paddd 144(%rbp),%xmm15
2357 paddd .chacha20_consts(%rip),%xmm2
2358 paddd 48(%rbp),%xmm6
2359 paddd 64(%rbp),%xmm10
2360 paddd 128(%rbp),%xmm14
2361 paddd .chacha20_consts(%rip),%xmm1
2362 paddd 48(%rbp),%xmm5
2363 paddd 64(%rbp),%xmm9
2364 paddd 112(%rbp),%xmm13
2365 paddd .chacha20_consts(%rip),%xmm0
2366 paddd 48(%rbp),%xmm4
2367 paddd 64(%rbp),%xmm8
2368 paddd 96(%rbp),%xmm12
2369
2370
2371 pand .clamp(%rip),%xmm3
2372 movdqa %xmm3,0(%rbp)
2373 movdqa %xmm7,16(%rbp)
2374
2375 movq %r8,%r8
2376 call poly_hash_ad_internal
2377 movdqu 0 + 0(%rsi),%xmm3
2378 movdqu 16 + 0(%rsi),%xmm7
2379 movdqu 32 + 0(%rsi),%xmm11
2380 movdqu 48 + 0(%rsi),%xmm15
2381 pxor %xmm3,%xmm2
2382 pxor %xmm7,%xmm6
2383 pxor %xmm11,%xmm10
2384 pxor %xmm14,%xmm15
2385 movdqu %xmm2,0 + 0(%rdi)
2386 movdqu %xmm6,16 + 0(%rdi)
2387 movdqu %xmm10,32 + 0(%rdi)
2388 movdqu %xmm15,48 + 0(%rdi)
2389 movdqu 0 + 64(%rsi),%xmm3
2390 movdqu 16 + 64(%rsi),%xmm7
2391 movdqu 32 + 64(%rsi),%xmm11
2392 movdqu 48 + 64(%rsi),%xmm15
2393 pxor %xmm3,%xmm1
2394 pxor %xmm7,%xmm5
2395 pxor %xmm11,%xmm9
2396 pxor %xmm13,%xmm15
2397 movdqu %xmm1,0 + 64(%rdi)
2398 movdqu %xmm5,16 + 64(%rdi)
2399 movdqu %xmm9,32 + 64(%rdi)
2400 movdqu %xmm15,48 + 64(%rdi)
2401
2402 cmpq $192,%rbx
2403 ja 1f
2404 movq $128,%rcx
2405 subq $128,%rbx
2406 leaq 128(%rsi),%rsi
2407 jmp seal_sse_128_seal_hash
24081:
2409 movdqu 0 + 128(%rsi),%xmm3
2410 movdqu 16 + 128(%rsi),%xmm7
2411 movdqu 32 + 128(%rsi),%xmm11
2412 movdqu 48 + 128(%rsi),%xmm15
2413 pxor %xmm3,%xmm0
2414 pxor %xmm7,%xmm4
2415 pxor %xmm11,%xmm8
2416 pxor %xmm12,%xmm15
2417 movdqu %xmm0,0 + 128(%rdi)
2418 movdqu %xmm4,16 + 128(%rdi)
2419 movdqu %xmm8,32 + 128(%rdi)
2420 movdqu %xmm15,48 + 128(%rdi)
2421
2422 movq $192,%rcx
2423 subq $192,%rbx
2424 leaq 192(%rsi),%rsi
2425 movq $2,%rcx
2426 movq $8,%r8
2427 cmpq $64,%rbx
2428 jbe seal_sse_tail_64
2429 cmpq $128,%rbx
2430 jbe seal_sse_tail_128
2431 cmpq $192,%rbx
2432 jbe seal_sse_tail_192
2433
24341:
2435 movdqa .chacha20_consts(%rip),%xmm0
2436 movdqa 48(%rbp),%xmm4
2437 movdqa 64(%rbp),%xmm8
2438 movdqa %xmm0,%xmm1
2439 movdqa %xmm4,%xmm5
2440 movdqa %xmm8,%xmm9
2441 movdqa %xmm0,%xmm2
2442 movdqa %xmm4,%xmm6
2443 movdqa %xmm8,%xmm10
2444 movdqa %xmm0,%xmm3
2445 movdqa %xmm4,%xmm7
2446 movdqa %xmm8,%xmm11
2447 movdqa 96(%rbp),%xmm15
2448 paddd .sse_inc(%rip),%xmm15
2449 movdqa %xmm15,%xmm14
2450 paddd .sse_inc(%rip),%xmm14
2451 movdqa %xmm14,%xmm13
2452 paddd .sse_inc(%rip),%xmm13
2453 movdqa %xmm13,%xmm12
2454 paddd .sse_inc(%rip),%xmm12
2455 movdqa %xmm12,96(%rbp)
2456 movdqa %xmm13,112(%rbp)
2457 movdqa %xmm14,128(%rbp)
2458 movdqa %xmm15,144(%rbp)
2459
24602:
2461 movdqa %xmm8,80(%rbp)
2462 movdqa .rol16(%rip),%xmm8
2463 paddd %xmm7,%xmm3
2464 paddd %xmm6,%xmm2
2465 paddd %xmm5,%xmm1
2466 paddd %xmm4,%xmm0
2467 pxor %xmm3,%xmm15
2468 pxor %xmm2,%xmm14
2469 pxor %xmm1,%xmm13
2470 pxor %xmm0,%xmm12
2471.byte 102,69,15,56,0,248
2472.byte 102,69,15,56,0,240
2473.byte 102,69,15,56,0,232
2474.byte 102,69,15,56,0,224
2475 movdqa 80(%rbp),%xmm8
2476 paddd %xmm15,%xmm11
2477 paddd %xmm14,%xmm10
2478 paddd %xmm13,%xmm9
2479 paddd %xmm12,%xmm8
2480 pxor %xmm11,%xmm7
2481 addq 0(%rdi),%r10
2482 adcq 8+0(%rdi),%r11
2483 adcq $1,%r12
2484 pxor %xmm10,%xmm6
2485 pxor %xmm9,%xmm5
2486 pxor %xmm8,%xmm4
2487 movdqa %xmm8,80(%rbp)
2488 movdqa %xmm7,%xmm8
2489 psrld $20,%xmm8
2490 pslld $32-20,%xmm7
2491 pxor %xmm8,%xmm7
2492 movdqa %xmm6,%xmm8
2493 psrld $20,%xmm8
2494 pslld $32-20,%xmm6
2495 pxor %xmm8,%xmm6
2496 movdqa %xmm5,%xmm8
2497 psrld $20,%xmm8
2498 pslld $32-20,%xmm5
2499 pxor %xmm8,%xmm5
2500 movdqa %xmm4,%xmm8
2501 psrld $20,%xmm8
2502 pslld $32-20,%xmm4
2503 pxor %xmm8,%xmm4
2504 movq 0+0(%rbp),%rax
2505 movq %rax,%r15
2506 mulq %r10
2507 movq %rax,%r13
2508 movq %rdx,%r14
2509 movq 0+0(%rbp),%rax
2510 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002511 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002512 addq %rax,%r14
2513 adcq %rdx,%r15
2514 movdqa .rol8(%rip),%xmm8
2515 paddd %xmm7,%xmm3
2516 paddd %xmm6,%xmm2
2517 paddd %xmm5,%xmm1
2518 paddd %xmm4,%xmm0
2519 pxor %xmm3,%xmm15
2520 pxor %xmm2,%xmm14
2521 pxor %xmm1,%xmm13
2522 pxor %xmm0,%xmm12
2523.byte 102,69,15,56,0,248
2524.byte 102,69,15,56,0,240
2525.byte 102,69,15,56,0,232
2526.byte 102,69,15,56,0,224
2527 movdqa 80(%rbp),%xmm8
2528 paddd %xmm15,%xmm11
2529 paddd %xmm14,%xmm10
2530 paddd %xmm13,%xmm9
2531 paddd %xmm12,%xmm8
2532 pxor %xmm11,%xmm7
2533 pxor %xmm10,%xmm6
2534 movq 8+0(%rbp),%rax
2535 movq %rax,%r9
2536 mulq %r10
2537 addq %rax,%r14
2538 adcq $0,%rdx
2539 movq %rdx,%r10
2540 movq 8+0(%rbp),%rax
2541 mulq %r11
2542 addq %rax,%r15
2543 adcq $0,%rdx
2544 pxor %xmm9,%xmm5
2545 pxor %xmm8,%xmm4
2546 movdqa %xmm8,80(%rbp)
2547 movdqa %xmm7,%xmm8
2548 psrld $25,%xmm8
2549 pslld $32-25,%xmm7
2550 pxor %xmm8,%xmm7
2551 movdqa %xmm6,%xmm8
2552 psrld $25,%xmm8
2553 pslld $32-25,%xmm6
2554 pxor %xmm8,%xmm6
2555 movdqa %xmm5,%xmm8
2556 psrld $25,%xmm8
2557 pslld $32-25,%xmm5
2558 pxor %xmm8,%xmm5
2559 movdqa %xmm4,%xmm8
2560 psrld $25,%xmm8
2561 pslld $32-25,%xmm4
2562 pxor %xmm8,%xmm4
2563 movdqa 80(%rbp),%xmm8
Robert Sloan4d1ac502017-02-06 08:36:14 -08002564 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002565 addq %r10,%r15
2566 adcq %rdx,%r9
2567.byte 102,15,58,15,255,4
2568.byte 102,69,15,58,15,219,8
2569.byte 102,69,15,58,15,255,12
2570.byte 102,15,58,15,246,4
2571.byte 102,69,15,58,15,210,8
2572.byte 102,69,15,58,15,246,12
2573.byte 102,15,58,15,237,4
2574.byte 102,69,15,58,15,201,8
2575.byte 102,69,15,58,15,237,12
2576.byte 102,15,58,15,228,4
2577.byte 102,69,15,58,15,192,8
2578.byte 102,69,15,58,15,228,12
2579 movdqa %xmm8,80(%rbp)
2580 movdqa .rol16(%rip),%xmm8
2581 paddd %xmm7,%xmm3
2582 paddd %xmm6,%xmm2
2583 paddd %xmm5,%xmm1
2584 paddd %xmm4,%xmm0
2585 pxor %xmm3,%xmm15
2586 pxor %xmm2,%xmm14
2587 movq %r13,%r10
2588 movq %r14,%r11
2589 movq %r15,%r12
2590 andq $3,%r12
2591 movq %r15,%r13
2592 andq $-4,%r13
2593 movq %r9,%r14
2594 shrdq $2,%r9,%r15
2595 shrq $2,%r9
2596 addq %r13,%r10
2597 adcq %r14,%r11
2598 adcq $0,%r12
2599 addq %r15,%r10
2600 adcq %r9,%r11
2601 adcq $0,%r12
2602 pxor %xmm1,%xmm13
2603 pxor %xmm0,%xmm12
2604.byte 102,69,15,56,0,248
2605.byte 102,69,15,56,0,240
2606.byte 102,69,15,56,0,232
2607.byte 102,69,15,56,0,224
2608 movdqa 80(%rbp),%xmm8
2609 paddd %xmm15,%xmm11
2610 paddd %xmm14,%xmm10
2611 paddd %xmm13,%xmm9
2612 paddd %xmm12,%xmm8
2613 pxor %xmm11,%xmm7
2614 pxor %xmm10,%xmm6
2615 pxor %xmm9,%xmm5
2616 pxor %xmm8,%xmm4
2617 movdqa %xmm8,80(%rbp)
2618 movdqa %xmm7,%xmm8
2619 psrld $20,%xmm8
2620 pslld $32-20,%xmm7
2621 pxor %xmm8,%xmm7
2622 movdqa %xmm6,%xmm8
2623 psrld $20,%xmm8
2624 pslld $32-20,%xmm6
2625 pxor %xmm8,%xmm6
2626 movdqa %xmm5,%xmm8
2627 psrld $20,%xmm8
2628 pslld $32-20,%xmm5
2629 pxor %xmm8,%xmm5
2630 movdqa %xmm4,%xmm8
2631 psrld $20,%xmm8
2632 pslld $32-20,%xmm4
2633 pxor %xmm8,%xmm4
2634 movdqa .rol8(%rip),%xmm8
2635 paddd %xmm7,%xmm3
2636 paddd %xmm6,%xmm2
2637 paddd %xmm5,%xmm1
2638 paddd %xmm4,%xmm0
2639 pxor %xmm3,%xmm15
2640 pxor %xmm2,%xmm14
2641 pxor %xmm1,%xmm13
2642 pxor %xmm0,%xmm12
2643.byte 102,69,15,56,0,248
2644.byte 102,69,15,56,0,240
2645.byte 102,69,15,56,0,232
2646.byte 102,69,15,56,0,224
2647 movdqa 80(%rbp),%xmm8
2648 paddd %xmm15,%xmm11
2649 paddd %xmm14,%xmm10
2650 paddd %xmm13,%xmm9
2651 paddd %xmm12,%xmm8
2652 pxor %xmm11,%xmm7
2653 pxor %xmm10,%xmm6
2654 pxor %xmm9,%xmm5
2655 pxor %xmm8,%xmm4
2656 movdqa %xmm8,80(%rbp)
2657 movdqa %xmm7,%xmm8
2658 psrld $25,%xmm8
2659 pslld $32-25,%xmm7
2660 pxor %xmm8,%xmm7
2661 movdqa %xmm6,%xmm8
2662 psrld $25,%xmm8
2663 pslld $32-25,%xmm6
2664 pxor %xmm8,%xmm6
2665 movdqa %xmm5,%xmm8
2666 psrld $25,%xmm8
2667 pslld $32-25,%xmm5
2668 pxor %xmm8,%xmm5
2669 movdqa %xmm4,%xmm8
2670 psrld $25,%xmm8
2671 pslld $32-25,%xmm4
2672 pxor %xmm8,%xmm4
2673 movdqa 80(%rbp),%xmm8
2674.byte 102,15,58,15,255,12
2675.byte 102,69,15,58,15,219,8
2676.byte 102,69,15,58,15,255,4
2677.byte 102,15,58,15,246,12
2678.byte 102,69,15,58,15,210,8
2679.byte 102,69,15,58,15,246,4
2680.byte 102,15,58,15,237,12
2681.byte 102,69,15,58,15,201,8
2682.byte 102,69,15,58,15,237,4
2683.byte 102,15,58,15,228,12
2684.byte 102,69,15,58,15,192,8
2685.byte 102,69,15,58,15,228,4
2686
2687 leaq 16(%rdi),%rdi
2688 decq %r8
2689 jge 2b
2690 addq 0(%rdi),%r10
2691 adcq 8+0(%rdi),%r11
2692 adcq $1,%r12
2693 movq 0+0(%rbp),%rax
2694 movq %rax,%r15
2695 mulq %r10
2696 movq %rax,%r13
2697 movq %rdx,%r14
2698 movq 0+0(%rbp),%rax
2699 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002700 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002701 addq %rax,%r14
2702 adcq %rdx,%r15
2703 movq 8+0(%rbp),%rax
2704 movq %rax,%r9
2705 mulq %r10
2706 addq %rax,%r14
2707 adcq $0,%rdx
2708 movq %rdx,%r10
2709 movq 8+0(%rbp),%rax
2710 mulq %r11
2711 addq %rax,%r15
2712 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002713 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002714 addq %r10,%r15
2715 adcq %rdx,%r9
2716 movq %r13,%r10
2717 movq %r14,%r11
2718 movq %r15,%r12
2719 andq $3,%r12
2720 movq %r15,%r13
2721 andq $-4,%r13
2722 movq %r9,%r14
2723 shrdq $2,%r9,%r15
2724 shrq $2,%r9
2725 addq %r13,%r10
2726 adcq %r14,%r11
2727 adcq $0,%r12
2728 addq %r15,%r10
2729 adcq %r9,%r11
2730 adcq $0,%r12
2731
2732 leaq 16(%rdi),%rdi
2733 decq %rcx
2734 jg 2b
2735 paddd .chacha20_consts(%rip),%xmm3
2736 paddd 48(%rbp),%xmm7
2737 paddd 64(%rbp),%xmm11
2738 paddd 144(%rbp),%xmm15
2739 paddd .chacha20_consts(%rip),%xmm2
2740 paddd 48(%rbp),%xmm6
2741 paddd 64(%rbp),%xmm10
2742 paddd 128(%rbp),%xmm14
2743 paddd .chacha20_consts(%rip),%xmm1
2744 paddd 48(%rbp),%xmm5
2745 paddd 64(%rbp),%xmm9
2746 paddd 112(%rbp),%xmm13
2747 paddd .chacha20_consts(%rip),%xmm0
2748 paddd 48(%rbp),%xmm4
2749 paddd 64(%rbp),%xmm8
2750 paddd 96(%rbp),%xmm12
2751
2752 movdqa %xmm14,80(%rbp)
2753 movdqa %xmm14,80(%rbp)
2754 movdqu 0 + 0(%rsi),%xmm14
2755 pxor %xmm3,%xmm14
2756 movdqu %xmm14,0 + 0(%rdi)
2757 movdqu 16 + 0(%rsi),%xmm14
2758 pxor %xmm7,%xmm14
2759 movdqu %xmm14,16 + 0(%rdi)
2760 movdqu 32 + 0(%rsi),%xmm14
2761 pxor %xmm11,%xmm14
2762 movdqu %xmm14,32 + 0(%rdi)
2763 movdqu 48 + 0(%rsi),%xmm14
2764 pxor %xmm15,%xmm14
2765 movdqu %xmm14,48 + 0(%rdi)
2766
2767 movdqa 80(%rbp),%xmm14
2768 movdqu 0 + 64(%rsi),%xmm3
2769 movdqu 16 + 64(%rsi),%xmm7
2770 movdqu 32 + 64(%rsi),%xmm11
2771 movdqu 48 + 64(%rsi),%xmm15
2772 pxor %xmm3,%xmm2
2773 pxor %xmm7,%xmm6
2774 pxor %xmm11,%xmm10
2775 pxor %xmm14,%xmm15
2776 movdqu %xmm2,0 + 64(%rdi)
2777 movdqu %xmm6,16 + 64(%rdi)
2778 movdqu %xmm10,32 + 64(%rdi)
2779 movdqu %xmm15,48 + 64(%rdi)
2780 movdqu 0 + 128(%rsi),%xmm3
2781 movdqu 16 + 128(%rsi),%xmm7
2782 movdqu 32 + 128(%rsi),%xmm11
2783 movdqu 48 + 128(%rsi),%xmm15
2784 pxor %xmm3,%xmm1
2785 pxor %xmm7,%xmm5
2786 pxor %xmm11,%xmm9
2787 pxor %xmm13,%xmm15
2788 movdqu %xmm1,0 + 128(%rdi)
2789 movdqu %xmm5,16 + 128(%rdi)
2790 movdqu %xmm9,32 + 128(%rdi)
2791 movdqu %xmm15,48 + 128(%rdi)
2792
2793 cmpq $256,%rbx
2794 ja 3f
2795
2796 movq $192,%rcx
2797 subq $192,%rbx
2798 leaq 192(%rsi),%rsi
2799 jmp seal_sse_128_seal_hash
28003:
2801 movdqu 0 + 192(%rsi),%xmm3
2802 movdqu 16 + 192(%rsi),%xmm7
2803 movdqu 32 + 192(%rsi),%xmm11
2804 movdqu 48 + 192(%rsi),%xmm15
2805 pxor %xmm3,%xmm0
2806 pxor %xmm7,%xmm4
2807 pxor %xmm11,%xmm8
2808 pxor %xmm12,%xmm15
2809 movdqu %xmm0,0 + 192(%rdi)
2810 movdqu %xmm4,16 + 192(%rdi)
2811 movdqu %xmm8,32 + 192(%rdi)
2812 movdqu %xmm15,48 + 192(%rdi)
2813
2814 leaq 256(%rsi),%rsi
2815 subq $256,%rbx
2816 movq $6,%rcx
2817 movq $4,%r8
2818 cmpq $192,%rbx
2819 jg 1b
2820 movq %rbx,%rcx
2821 testq %rbx,%rbx
2822 je seal_sse_128_seal_hash
2823 movq $6,%rcx
2824 cmpq $64,%rbx
2825 jg 3f
2826
// seal_sse_tail_64: generate one more 64-byte ChaCha20 keystream block in
// xmm0/xmm4/xmm8/xmm12 while continuing to run Poly1305 over the 16-byte
// chunks that %rdi walks over, then hand off to seal_sse_128_seal.
// State rows are rebuilt from .chacha20_consts, 48(%rbp), 64(%rbp) and the
// counter row at 96(%rbp), which is bumped by .sse_inc and written back.
// %rcx / %r8 are loop counters set by the code that branches here (the
// dispatch just above loads 6 and 4) - NOTE(review): confirm for other callers.
2827seal_sse_tail_64:
2828 movdqa .chacha20_consts(%rip),%xmm0
2829 movdqa 48(%rbp),%xmm4
2830 movdqa 64(%rbp),%xmm8
2831 movdqa 96(%rbp),%xmm12
2832 paddd .sse_inc(%rip),%xmm12
2833 movdqa %xmm12,96(%rbp)
2834
// Loop entry 1: hash-only step. Add the 16 bytes at (%rdi) plus the 2^128
// pad bit into the accumulator %r10:%r11:%r12.
28351:
2836 addq 0(%rdi),%r10
2837 adcq 8+0(%rdi),%r11
2838 adcq $1,%r12
// Multiply the accumulator by the 128-bit value at 0(%rbp)/8(%rbp) (stored
// clamped by seal_sse_128); partial products land in %r13,%r14,%r15,%r9.
2839 movq 0+0(%rbp),%rax
2840 movq %rax,%r15
2841 mulq %r10
2842 movq %rax,%r13
2843 movq %rdx,%r14
2844 movq 0+0(%rbp),%rax
2845 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002846 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002847 addq %rax,%r14
2848 adcq %rdx,%r15
2849 movq 8+0(%rbp),%rax
2850 movq %rax,%r9
2851 mulq %r10
2852 addq %rax,%r14
2853 adcq $0,%rdx
2854 movq %rdx,%r10
2855 movq 8+0(%rbp),%rax
2856 mulq %r11
2857 addq %rax,%r15
2858 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002859 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002860 addq %r10,%r15
2861 adcq %rdx,%r9
// Reduce modulo 2^130 - 5: keep the low 130 bits, then add the bits above
// bit 130 once left in place (4*c) and once shifted right by two (c).
2862 movq %r13,%r10
2863 movq %r14,%r11
2864 movq %r15,%r12
2865 andq $3,%r12
2866 movq %r15,%r13
2867 andq $-4,%r13
2868 movq %r9,%r14
2869 shrdq $2,%r9,%r15
2870 shrq $2,%r9
2871 addq %r13,%r10
2872 adcq %r14,%r11
2873 adcq $0,%r12
2874 addq %r15,%r10
2875 adcq %r9,%r11
2876 adcq $0,%r12
2877
2878 leaq 16(%rdi),%rdi
// Loop entry 2: one ChaCha20 double round on the single state, followed by
// another 16-byte Poly1305 step. Rotations by 16 and 8 use pshufb with the
// .rol16/.rol8 tables; rotations by 12 and 7 use shift pairs via %xmm3.
28792:
2880 paddd %xmm4,%xmm0
2881 pxor %xmm0,%xmm12
2882 pshufb .rol16(%rip),%xmm12
2883 paddd %xmm12,%xmm8
2884 pxor %xmm8,%xmm4
2885 movdqa %xmm4,%xmm3
2886 pslld $12,%xmm3
2887 psrld $20,%xmm4
2888 pxor %xmm3,%xmm4
2889 paddd %xmm4,%xmm0
2890 pxor %xmm0,%xmm12
2891 pshufb .rol8(%rip),%xmm12
2892 paddd %xmm12,%xmm8
2893 pxor %xmm8,%xmm4
2894 movdqa %xmm4,%xmm3
2895 pslld $7,%xmm3
2896 psrld $25,%xmm4
2897 pxor %xmm3,%xmm4
// Hand-encoded palignr $4,%xmm4,%xmm4 / $8,%xmm8,%xmm8 / $12,%xmm12,%xmm12:
// rotate rows so the next quarter-round set works on the diagonals.
2898.byte 102,15,58,15,228,4
2899.byte 102,69,15,58,15,192,8
2900.byte 102,69,15,58,15,228,12
2901 paddd %xmm4,%xmm0
2902 pxor %xmm0,%xmm12
2903 pshufb .rol16(%rip),%xmm12
2904 paddd %xmm12,%xmm8
2905 pxor %xmm8,%xmm4
2906 movdqa %xmm4,%xmm3
2907 pslld $12,%xmm3
2908 psrld $20,%xmm4
2909 pxor %xmm3,%xmm4
2910 paddd %xmm4,%xmm0
2911 pxor %xmm0,%xmm12
2912 pshufb .rol8(%rip),%xmm12
2913 paddd %xmm12,%xmm8
2914 pxor %xmm8,%xmm4
2915 movdqa %xmm4,%xmm3
2916 pslld $7,%xmm3
2917 psrld $25,%xmm4
2918 pxor %xmm3,%xmm4
// palignr with the inverse amounts ($12/$8/$4): rotate the rows back.
2919.byte 102,15,58,15,228,12
2920.byte 102,69,15,58,15,192,8
2921.byte 102,69,15,58,15,228,4
// Second Poly1305 step of the iteration (same add / multiply / reduce
// sequence as at label 1).
2922 addq 0(%rdi),%r10
2923 adcq 8+0(%rdi),%r11
2924 adcq $1,%r12
2925 movq 0+0(%rbp),%rax
2926 movq %rax,%r15
2927 mulq %r10
2928 movq %rax,%r13
2929 movq %rdx,%r14
2930 movq 0+0(%rbp),%rax
2931 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08002932 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05002933 addq %rax,%r14
2934 adcq %rdx,%r15
2935 movq 8+0(%rbp),%rax
2936 movq %rax,%r9
2937 mulq %r10
2938 addq %rax,%r14
2939 adcq $0,%rdx
2940 movq %rdx,%r10
2941 movq 8+0(%rbp),%rax
2942 mulq %r11
2943 addq %rax,%r15
2944 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08002945 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05002946 addq %r10,%r15
2947 adcq %rdx,%r9
2948 movq %r13,%r10
2949 movq %r14,%r11
2950 movq %r15,%r12
2951 andq $3,%r12
2952 movq %r15,%r13
2953 andq $-4,%r13
2954 movq %r9,%r14
2955 shrdq $2,%r9,%r15
2956 shrq $2,%r9
2957 addq %r13,%r10
2958 adcq %r14,%r11
2959 adcq $0,%r12
2960 addq %r15,%r10
2961 adcq %r9,%r11
2962 adcq $0,%r12
2963
2964 leaq 16(%rdi),%rdi
// While %rcx stays positive, repeat from label 1 (two hash steps per round);
// afterwards %r8 counts the remaining passes that re-enter at label 2 (one
// hash step per round).
2965 decq %rcx
2966 jg 1b
2967 decq %r8
2968 jge 2b
// Feed-forward: add the initial state rows back to form the keystream block.
2969 paddd .chacha20_consts(%rip),%xmm0
2970 paddd 48(%rbp),%xmm4
2971 paddd 64(%rbp),%xmm8
2972 paddd 96(%rbp),%xmm12
2973
2974 jmp seal_sse_128_seal
// Dispatch target 3: if more than 128 bytes remain in %rbx, skip ahead to the
// next 3: label (seal_sse_tail_192); otherwise fall into seal_sse_tail_128.
// NOTE(review): jg is a signed comparison on what looks like a byte count.
29753:
2976 cmpq $128,%rbx
2977 jg 3f
2978
// seal_sse_tail_128: build two ChaCha20 states (xmm1/5/9/13 and
// xmm0/4/8/12) with consecutive counters derived from 96(%rbp), run the
// double rounds interleaved with Poly1305 steps over the data at %rdi, XOR
// the first block with 64 input bytes, and jump to seal_sse_128_seal_hash.
// The updated counters are saved to 112(%rbp) and 96(%rbp) for the
// feed-forward addition after the rounds.
2979seal_sse_tail_128:
2980 movdqa .chacha20_consts(%rip),%xmm0
2981 movdqa 48(%rbp),%xmm4
2982 movdqa 64(%rbp),%xmm8
2983 movdqa %xmm0,%xmm1
2984 movdqa %xmm4,%xmm5
2985 movdqa %xmm8,%xmm9
2986 movdqa 96(%rbp),%xmm13
2987 paddd .sse_inc(%rip),%xmm13
2988 movdqa %xmm13,%xmm12
2989 paddd .sse_inc(%rip),%xmm12
2990 movdqa %xmm12,96(%rbp)
2991 movdqa %xmm13,112(%rbp)
2992
// Loop entry 1: hash-only Poly1305 step. Add 16 bytes at (%rdi) plus the
// 2^128 pad bit to %r10:%r11:%r12, multiply by the value at 0(%rbp)/8(%rbp),
// and reduce modulo 2^130 - 5.
29931:
2994 addq 0(%rdi),%r10
2995 adcq 8+0(%rdi),%r11
2996 adcq $1,%r12
2997 movq 0+0(%rbp),%rax
2998 movq %rax,%r15
2999 mulq %r10
3000 movq %rax,%r13
3001 movq %rdx,%r14
3002 movq 0+0(%rbp),%rax
3003 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003004 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003005 addq %rax,%r14
3006 adcq %rdx,%r15
3007 movq 8+0(%rbp),%rax
3008 movq %rax,%r9
3009 mulq %r10
3010 addq %rax,%r14
3011 adcq $0,%rdx
3012 movq %rdx,%r10
3013 movq 8+0(%rbp),%rax
3014 mulq %r11
3015 addq %rax,%r15
3016 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003017 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003018 addq %r10,%r15
3019 adcq %rdx,%r9
3020 movq %r13,%r10
3021 movq %r14,%r11
3022 movq %r15,%r12
3023 andq $3,%r12
3024 movq %r15,%r13
3025 andq $-4,%r13
3026 movq %r9,%r14
3027 shrdq $2,%r9,%r15
3028 shrq $2,%r9
3029 addq %r13,%r10
3030 adcq %r14,%r11
3031 adcq $0,%r12
3032 addq %r15,%r10
3033 adcq %r9,%r11
3034 adcq $0,%r12
3035
3036 leaq 16(%rdi),%rdi
// Loop entry 2: first half of the double round (quarter-rounds on the rows
// as loaded) for state 0, then state 1. %xmm3 is the rotate scratch.
30372:
3038 paddd %xmm4,%xmm0
3039 pxor %xmm0,%xmm12
3040 pshufb .rol16(%rip),%xmm12
3041 paddd %xmm12,%xmm8
3042 pxor %xmm8,%xmm4
3043 movdqa %xmm4,%xmm3
3044 pslld $12,%xmm3
3045 psrld $20,%xmm4
3046 pxor %xmm3,%xmm4
3047 paddd %xmm4,%xmm0
3048 pxor %xmm0,%xmm12
3049 pshufb .rol8(%rip),%xmm12
3050 paddd %xmm12,%xmm8
3051 pxor %xmm8,%xmm4
3052 movdqa %xmm4,%xmm3
3053 pslld $7,%xmm3
3054 psrld $25,%xmm4
3055 pxor %xmm3,%xmm4
// Hand-encoded palignr $4/$8/$12 on xmm4/xmm8/xmm12 (diagonalize state 0).
3056.byte 102,15,58,15,228,4
3057.byte 102,69,15,58,15,192,8
3058.byte 102,69,15,58,15,228,12
3059 paddd %xmm5,%xmm1
3060 pxor %xmm1,%xmm13
3061 pshufb .rol16(%rip),%xmm13
3062 paddd %xmm13,%xmm9
3063 pxor %xmm9,%xmm5
3064 movdqa %xmm5,%xmm3
3065 pslld $12,%xmm3
3066 psrld $20,%xmm5
3067 pxor %xmm3,%xmm5
3068 paddd %xmm5,%xmm1
3069 pxor %xmm1,%xmm13
3070 pshufb .rol8(%rip),%xmm13
3071 paddd %xmm13,%xmm9
3072 pxor %xmm9,%xmm5
3073 movdqa %xmm5,%xmm3
3074 pslld $7,%xmm3
3075 psrld $25,%xmm5
3076 pxor %xmm3,%xmm5
// Hand-encoded palignr $4/$8/$12 on xmm5/xmm9/xmm13 (diagonalize state 1).
3077.byte 102,15,58,15,237,4
3078.byte 102,69,15,58,15,201,8
3079.byte 102,69,15,58,15,237,12
// Poly1305 step placed between the two halves of the double round. Note that
// %rdi is advanced only once per pass through label 2 (at the bottom).
3080 addq 0(%rdi),%r10
3081 adcq 8+0(%rdi),%r11
3082 adcq $1,%r12
3083 movq 0+0(%rbp),%rax
3084 movq %rax,%r15
3085 mulq %r10
3086 movq %rax,%r13
3087 movq %rdx,%r14
3088 movq 0+0(%rbp),%rax
3089 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003090 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003091 addq %rax,%r14
3092 adcq %rdx,%r15
3093 movq 8+0(%rbp),%rax
3094 movq %rax,%r9
3095 mulq %r10
3096 addq %rax,%r14
3097 adcq $0,%rdx
3098 movq %rdx,%r10
3099 movq 8+0(%rbp),%rax
3100 mulq %r11
3101 addq %rax,%r15
3102 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003103 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003104 addq %r10,%r15
3105 adcq %rdx,%r9
3106 movq %r13,%r10
3107 movq %r14,%r11
3108 movq %r15,%r12
3109 andq $3,%r12
3110 movq %r15,%r13
3111 andq $-4,%r13
3112 movq %r9,%r14
3113 shrdq $2,%r9,%r15
3114 shrq $2,%r9
3115 addq %r13,%r10
3116 adcq %r14,%r11
3117 adcq $0,%r12
3118 addq %r15,%r10
3119 adcq %r9,%r11
3120 adcq $0,%r12
// Second half of the double round (on the rotated rows), state 0 then 1.
3121 paddd %xmm4,%xmm0
3122 pxor %xmm0,%xmm12
3123 pshufb .rol16(%rip),%xmm12
3124 paddd %xmm12,%xmm8
3125 pxor %xmm8,%xmm4
3126 movdqa %xmm4,%xmm3
3127 pslld $12,%xmm3
3128 psrld $20,%xmm4
3129 pxor %xmm3,%xmm4
3130 paddd %xmm4,%xmm0
3131 pxor %xmm0,%xmm12
3132 pshufb .rol8(%rip),%xmm12
3133 paddd %xmm12,%xmm8
3134 pxor %xmm8,%xmm4
3135 movdqa %xmm4,%xmm3
3136 pslld $7,%xmm3
3137 psrld $25,%xmm4
3138 pxor %xmm3,%xmm4
// palignr $12/$8/$4: undo the row rotation for state 0.
3139.byte 102,15,58,15,228,12
3140.byte 102,69,15,58,15,192,8
3141.byte 102,69,15,58,15,228,4
3142 paddd %xmm5,%xmm1
3143 pxor %xmm1,%xmm13
3144 pshufb .rol16(%rip),%xmm13
3145 paddd %xmm13,%xmm9
3146 pxor %xmm9,%xmm5
3147 movdqa %xmm5,%xmm3
3148 pslld $12,%xmm3
3149 psrld $20,%xmm5
3150 pxor %xmm3,%xmm5
3151 paddd %xmm5,%xmm1
3152 pxor %xmm1,%xmm13
3153 pshufb .rol8(%rip),%xmm13
3154 paddd %xmm13,%xmm9
3155 pxor %xmm9,%xmm5
3156 movdqa %xmm5,%xmm3
3157 pslld $7,%xmm3
3158 psrld $25,%xmm5
3159 pxor %xmm3,%xmm5
// palignr $12/$8/$4: undo the row rotation for state 1.
3160.byte 102,15,58,15,237,12
3161.byte 102,69,15,58,15,201,8
3162.byte 102,69,15,58,15,237,4
3163
3164 leaq 16(%rdi),%rdi
// %rcx counts passes that restart at label 1, %r8 passes that restart at 2.
3165 decq %rcx
3166 jg 1b
3167 decq %r8
3168 jge 2b
// Feed-forward: add the initial rows (and saved per-block counters) back.
3169 paddd .chacha20_consts(%rip),%xmm1
3170 paddd 48(%rbp),%xmm5
3171 paddd 64(%rbp),%xmm9
3172 paddd 112(%rbp),%xmm13
3173 paddd .chacha20_consts(%rip),%xmm0
3174 paddd 48(%rbp),%xmm4
3175 paddd 64(%rbp),%xmm8
3176 paddd 96(%rbp),%xmm12
// XOR 64 input bytes at (%rsi) with keystream block 1 and store at (%rdi).
// The last row is combined into %xmm15 (the loaded input) instead of %xmm13.
3177 movdqu 0 + 0(%rsi),%xmm3
3178 movdqu 16 + 0(%rsi),%xmm7
3179 movdqu 32 + 0(%rsi),%xmm11
3180 movdqu 48 + 0(%rsi),%xmm15
3181 pxor %xmm3,%xmm1
3182 pxor %xmm7,%xmm5
3183 pxor %xmm11,%xmm9
3184 pxor %xmm13,%xmm15
3185 movdqu %xmm1,0 + 0(%rdi)
3186 movdqu %xmm5,16 + 0(%rdi)
3187 movdqu %xmm9,32 + 0(%rdi)
3188 movdqu %xmm15,48 + 0(%rdi)
3189
// 64 bytes were just written at (%rdi) and still need hashing: pass that
// count in %rcx, consume 64 bytes of input, and continue in the hash loop.
// Block 0 (xmm0/4/8/12) is left in registers for seal_sse_128_seal.
3190 movq $64,%rcx
3191 subq $64,%rbx
3192 leaq 64(%rsi),%rsi
3193 jmp seal_sse_128_seal_hash
// Dispatch target 3: reached when more than 128 bytes remain (see the
// cmpq $128 / jg 3f just before seal_sse_tail_128).
31943:
3195
// seal_sse_tail_192: build three ChaCha20 states (xmm2/6/10/14,
// xmm1/5/9/13, xmm0/4/8/12) with consecutive counters derived from
// 96(%rbp), run the double rounds interleaved with Poly1305 steps over the
// data at %rdi, XOR the first two blocks with 128 input bytes, and fall
// through into seal_sse_128_seal_hash. Counters are saved at 128(%rbp),
// 112(%rbp) and 96(%rbp) for the feed-forward addition.
3196seal_sse_tail_192:
3197 movdqa .chacha20_consts(%rip),%xmm0
3198 movdqa 48(%rbp),%xmm4
3199 movdqa 64(%rbp),%xmm8
3200 movdqa %xmm0,%xmm1
3201 movdqa %xmm4,%xmm5
3202 movdqa %xmm8,%xmm9
3203 movdqa %xmm0,%xmm2
3204 movdqa %xmm4,%xmm6
3205 movdqa %xmm8,%xmm10
3206 movdqa 96(%rbp),%xmm14
3207 paddd .sse_inc(%rip),%xmm14
3208 movdqa %xmm14,%xmm13
3209 paddd .sse_inc(%rip),%xmm13
3210 movdqa %xmm13,%xmm12
3211 paddd .sse_inc(%rip),%xmm12
3212 movdqa %xmm12,96(%rbp)
3213 movdqa %xmm13,112(%rbp)
3214 movdqa %xmm14,128(%rbp)
3215
// Loop entry 1: hash-only Poly1305 step. Add 16 bytes at (%rdi) plus the
// 2^128 pad bit to %r10:%r11:%r12, multiply by the value at 0(%rbp)/8(%rbp),
// and reduce modulo 2^130 - 5.
32161:
3217 addq 0(%rdi),%r10
3218 adcq 8+0(%rdi),%r11
3219 adcq $1,%r12
3220 movq 0+0(%rbp),%rax
3221 movq %rax,%r15
3222 mulq %r10
3223 movq %rax,%r13
3224 movq %rdx,%r14
3225 movq 0+0(%rbp),%rax
3226 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003227 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003228 addq %rax,%r14
3229 adcq %rdx,%r15
3230 movq 8+0(%rbp),%rax
3231 movq %rax,%r9
3232 mulq %r10
3233 addq %rax,%r14
3234 adcq $0,%rdx
3235 movq %rdx,%r10
3236 movq 8+0(%rbp),%rax
3237 mulq %r11
3238 addq %rax,%r15
3239 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003240 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003241 addq %r10,%r15
3242 adcq %rdx,%r9
3243 movq %r13,%r10
3244 movq %r14,%r11
3245 movq %r15,%r12
3246 andq $3,%r12
3247 movq %r15,%r13
3248 andq $-4,%r13
3249 movq %r9,%r14
3250 shrdq $2,%r9,%r15
3251 shrq $2,%r9
3252 addq %r13,%r10
3253 adcq %r14,%r11
3254 adcq $0,%r12
3255 addq %r15,%r10
3256 adcq %r9,%r11
3257 adcq $0,%r12
3258
3259 leaq 16(%rdi),%rdi
// Loop entry 2: first half of the double round for states 0, 1 and 2 in
// turn. %xmm3 is the scratch register for the 12- and 7-bit rotations.
32602:
3261 paddd %xmm4,%xmm0
3262 pxor %xmm0,%xmm12
3263 pshufb .rol16(%rip),%xmm12
3264 paddd %xmm12,%xmm8
3265 pxor %xmm8,%xmm4
3266 movdqa %xmm4,%xmm3
3267 pslld $12,%xmm3
3268 psrld $20,%xmm4
3269 pxor %xmm3,%xmm4
3270 paddd %xmm4,%xmm0
3271 pxor %xmm0,%xmm12
3272 pshufb .rol8(%rip),%xmm12
3273 paddd %xmm12,%xmm8
3274 pxor %xmm8,%xmm4
3275 movdqa %xmm4,%xmm3
3276 pslld $7,%xmm3
3277 psrld $25,%xmm4
3278 pxor %xmm3,%xmm4
// Hand-encoded palignr $4/$8/$12 on xmm4/xmm8/xmm12 (diagonalize state 0).
3279.byte 102,15,58,15,228,4
3280.byte 102,69,15,58,15,192,8
3281.byte 102,69,15,58,15,228,12
3282 paddd %xmm5,%xmm1
3283 pxor %xmm1,%xmm13
3284 pshufb .rol16(%rip),%xmm13
3285 paddd %xmm13,%xmm9
3286 pxor %xmm9,%xmm5
3287 movdqa %xmm5,%xmm3
3288 pslld $12,%xmm3
3289 psrld $20,%xmm5
3290 pxor %xmm3,%xmm5
3291 paddd %xmm5,%xmm1
3292 pxor %xmm1,%xmm13
3293 pshufb .rol8(%rip),%xmm13
3294 paddd %xmm13,%xmm9
3295 pxor %xmm9,%xmm5
3296 movdqa %xmm5,%xmm3
3297 pslld $7,%xmm3
3298 psrld $25,%xmm5
3299 pxor %xmm3,%xmm5
// Hand-encoded palignr $4/$8/$12 on xmm5/xmm9/xmm13 (diagonalize state 1).
3300.byte 102,15,58,15,237,4
3301.byte 102,69,15,58,15,201,8
3302.byte 102,69,15,58,15,237,12
3303 paddd %xmm6,%xmm2
3304 pxor %xmm2,%xmm14
3305 pshufb .rol16(%rip),%xmm14
3306 paddd %xmm14,%xmm10
3307 pxor %xmm10,%xmm6
3308 movdqa %xmm6,%xmm3
3309 pslld $12,%xmm3
3310 psrld $20,%xmm6
3311 pxor %xmm3,%xmm6
3312 paddd %xmm6,%xmm2
3313 pxor %xmm2,%xmm14
3314 pshufb .rol8(%rip),%xmm14
3315 paddd %xmm14,%xmm10
3316 pxor %xmm10,%xmm6
3317 movdqa %xmm6,%xmm3
3318 pslld $7,%xmm3
3319 psrld $25,%xmm6
3320 pxor %xmm3,%xmm6
// Hand-encoded palignr $4/$8/$12 on xmm6/xmm10/xmm14 (diagonalize state 2).
3321.byte 102,15,58,15,246,4
3322.byte 102,69,15,58,15,210,8
3323.byte 102,69,15,58,15,246,12
// Poly1305 step placed between the two halves of the double round. %rdi is
// advanced only once per pass through label 2 (at the bottom).
3324 addq 0(%rdi),%r10
3325 adcq 8+0(%rdi),%r11
3326 adcq $1,%r12
3327 movq 0+0(%rbp),%rax
3328 movq %rax,%r15
3329 mulq %r10
3330 movq %rax,%r13
3331 movq %rdx,%r14
3332 movq 0+0(%rbp),%rax
3333 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003334 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003335 addq %rax,%r14
3336 adcq %rdx,%r15
3337 movq 8+0(%rbp),%rax
3338 movq %rax,%r9
3339 mulq %r10
3340 addq %rax,%r14
3341 adcq $0,%rdx
3342 movq %rdx,%r10
3343 movq 8+0(%rbp),%rax
3344 mulq %r11
3345 addq %rax,%r15
3346 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003347 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003348 addq %r10,%r15
3349 adcq %rdx,%r9
3350 movq %r13,%r10
3351 movq %r14,%r11
3352 movq %r15,%r12
3353 andq $3,%r12
3354 movq %r15,%r13
3355 andq $-4,%r13
3356 movq %r9,%r14
3357 shrdq $2,%r9,%r15
3358 shrq $2,%r9
3359 addq %r13,%r10
3360 adcq %r14,%r11
3361 adcq $0,%r12
3362 addq %r15,%r10
3363 adcq %r9,%r11
3364 adcq $0,%r12
// Second half of the double round (on the rotated rows), states 0, 1, 2.
3365 paddd %xmm4,%xmm0
3366 pxor %xmm0,%xmm12
3367 pshufb .rol16(%rip),%xmm12
3368 paddd %xmm12,%xmm8
3369 pxor %xmm8,%xmm4
3370 movdqa %xmm4,%xmm3
3371 pslld $12,%xmm3
3372 psrld $20,%xmm4
3373 pxor %xmm3,%xmm4
3374 paddd %xmm4,%xmm0
3375 pxor %xmm0,%xmm12
3376 pshufb .rol8(%rip),%xmm12
3377 paddd %xmm12,%xmm8
3378 pxor %xmm8,%xmm4
3379 movdqa %xmm4,%xmm3
3380 pslld $7,%xmm3
3381 psrld $25,%xmm4
3382 pxor %xmm3,%xmm4
// palignr $12/$8/$4: undo the row rotation for state 0.
3383.byte 102,15,58,15,228,12
3384.byte 102,69,15,58,15,192,8
3385.byte 102,69,15,58,15,228,4
3386 paddd %xmm5,%xmm1
3387 pxor %xmm1,%xmm13
3388 pshufb .rol16(%rip),%xmm13
3389 paddd %xmm13,%xmm9
3390 pxor %xmm9,%xmm5
3391 movdqa %xmm5,%xmm3
3392 pslld $12,%xmm3
3393 psrld $20,%xmm5
3394 pxor %xmm3,%xmm5
3395 paddd %xmm5,%xmm1
3396 pxor %xmm1,%xmm13
3397 pshufb .rol8(%rip),%xmm13
3398 paddd %xmm13,%xmm9
3399 pxor %xmm9,%xmm5
3400 movdqa %xmm5,%xmm3
3401 pslld $7,%xmm3
3402 psrld $25,%xmm5
3403 pxor %xmm3,%xmm5
// palignr $12/$8/$4: undo the row rotation for state 1.
3404.byte 102,15,58,15,237,12
3405.byte 102,69,15,58,15,201,8
3406.byte 102,69,15,58,15,237,4
3407 paddd %xmm6,%xmm2
3408 pxor %xmm2,%xmm14
3409 pshufb .rol16(%rip),%xmm14
3410 paddd %xmm14,%xmm10
3411 pxor %xmm10,%xmm6
3412 movdqa %xmm6,%xmm3
3413 pslld $12,%xmm3
3414 psrld $20,%xmm6
3415 pxor %xmm3,%xmm6
3416 paddd %xmm6,%xmm2
3417 pxor %xmm2,%xmm14
3418 pshufb .rol8(%rip),%xmm14
3419 paddd %xmm14,%xmm10
3420 pxor %xmm10,%xmm6
3421 movdqa %xmm6,%xmm3
3422 pslld $7,%xmm3
3423 psrld $25,%xmm6
3424 pxor %xmm3,%xmm6
// palignr $12/$8/$4: undo the row rotation for state 2.
3425.byte 102,15,58,15,246,12
3426.byte 102,69,15,58,15,210,8
3427.byte 102,69,15,58,15,246,4
3428
3429 leaq 16(%rdi),%rdi
// %rcx counts passes that restart at label 1, %r8 passes that restart at 2.
3430 decq %rcx
3431 jg 1b
3432 decq %r8
3433 jge 2b
// Feed-forward: add the initial rows (and saved per-block counters) back.
3434 paddd .chacha20_consts(%rip),%xmm2
3435 paddd 48(%rbp),%xmm6
3436 paddd 64(%rbp),%xmm10
3437 paddd 128(%rbp),%xmm14
3438 paddd .chacha20_consts(%rip),%xmm1
3439 paddd 48(%rbp),%xmm5
3440 paddd 64(%rbp),%xmm9
3441 paddd 112(%rbp),%xmm13
3442 paddd .chacha20_consts(%rip),%xmm0
3443 paddd 48(%rbp),%xmm4
3444 paddd 64(%rbp),%xmm8
3445 paddd 96(%rbp),%xmm12
// XOR input bytes 0..63 with keystream block 2 and bytes 64..127 with
// block 1, storing to (%rdi). In each group the last row is combined into
// %xmm15 (the loaded input) rather than into the keystream register.
3446 movdqu 0 + 0(%rsi),%xmm3
3447 movdqu 16 + 0(%rsi),%xmm7
3448 movdqu 32 + 0(%rsi),%xmm11
3449 movdqu 48 + 0(%rsi),%xmm15
3450 pxor %xmm3,%xmm2
3451 pxor %xmm7,%xmm6
3452 pxor %xmm11,%xmm10
3453 pxor %xmm14,%xmm15
3454 movdqu %xmm2,0 + 0(%rdi)
3455 movdqu %xmm6,16 + 0(%rdi)
3456 movdqu %xmm10,32 + 0(%rdi)
3457 movdqu %xmm15,48 + 0(%rdi)
3458 movdqu 0 + 64(%rsi),%xmm3
3459 movdqu 16 + 64(%rsi),%xmm7
3460 movdqu 32 + 64(%rsi),%xmm11
3461 movdqu 48 + 64(%rsi),%xmm15
3462 pxor %xmm3,%xmm1
3463 pxor %xmm7,%xmm5
3464 pxor %xmm11,%xmm9
3465 pxor %xmm13,%xmm15
3466 movdqu %xmm1,0 + 64(%rdi)
3467 movdqu %xmm5,16 + 64(%rdi)
3468 movdqu %xmm9,32 + 64(%rdi)
3469 movdqu %xmm15,48 + 64(%rdi)
3470
// 128 bytes were just written at (%rdi) and still need hashing: pass that
// count in %rcx, consume 128 bytes of input, and fall through into
// seal_sse_128_seal_hash. Block 0 (xmm0/4/8/12) stays in registers.
3471 movq $128,%rcx
3472 subq $128,%rbx
3473 leaq 128(%rsi),%rsi
3474
// seal_sse_128_seal_hash: Poly1305-hash %rcx bytes starting at %rdi, 16 bytes
// per iteration, advancing %rdi. Once fewer than 16 bytes are left in %rcx,
// control goes to seal_sse_128_seal.
// Accumulator: %r10 (low), %r11, %r12 (top bits). Multiplier: the 128-bit
// value at 0(%rbp)/8(%rbp). Scratch: %rax, %rdx, %r9, %r13, %r14, %r15.
3475seal_sse_128_seal_hash:
3476 cmpq $16,%rcx
3477 jb seal_sse_128_seal
// acc += 16 bytes at (%rdi), plus the 2^128 pad bit.
3478 addq 0(%rdi),%r10
3479 adcq 8+0(%rdi),%r11
3480 adcq $1,%r12
// acc *= multiplier; partial products accumulate in %r13,%r14,%r15,%r9.
3481 movq 0+0(%rbp),%rax
3482 movq %rax,%r15
3483 mulq %r10
3484 movq %rax,%r13
3485 movq %rdx,%r14
3486 movq 0+0(%rbp),%rax
3487 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003488 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003489 addq %rax,%r14
3490 adcq %rdx,%r15
3491 movq 8+0(%rbp),%rax
3492 movq %rax,%r9
3493 mulq %r10
3494 addq %rax,%r14
3495 adcq $0,%rdx
3496 movq %rdx,%r10
3497 movq 8+0(%rbp),%rax
3498 mulq %r11
3499 addq %rax,%r15
3500 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003501 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003502 addq %r10,%r15
3503 adcq %rdx,%r9
// Reduce modulo 2^130 - 5: keep the low 130 bits in %r10:%r11:%r12, then add
// the bits above bit 130 once in place (4*c) and once shifted right by 2 (c).
3504 movq %r13,%r10
3505 movq %r14,%r11
3506 movq %r15,%r12
3507 andq $3,%r12
3508 movq %r15,%r13
3509 andq $-4,%r13
3510 movq %r9,%r14
3511 shrdq $2,%r9,%r15
3512 shrq $2,%r9
3513 addq %r13,%r10
3514 adcq %r14,%r11
3515 adcq $0,%r12
3516 addq %r15,%r10
3517 adcq %r9,%r11
3518 adcq $0,%r12
3519
// Consume the 16 bytes and loop.
3520 subq $16,%rcx
3521 leaq 16(%rdi),%rdi
3522 jmp seal_sse_128_seal_hash
3523
// seal_sse_128_seal: encrypt-and-hash loop for the remaining full 16-byte
// chunks. While %rbx >= 16: XOR 16 bytes from (%rsi) with the keystream in
// %xmm0, store the result at (%rdi), absorb that result into the Poly1305
// accumulator %r10:%r11:%r12, and advance both pointers. With fewer than 16
// bytes left, control goes to seal_sse_tail_16.
// Pending keystream is queued in xmm0, xmm4, xmm8, xmm12, xmm1, xmm5, xmm9,
// xmm13 and shifted down one slot per iteration.
3524seal_sse_128_seal:
3525 cmpq $16,%rbx
3526 jb seal_sse_tail_16
3527 subq $16,%rbx
3528
// out = in XOR keystream
3529 movdqu 0(%rsi),%xmm3
3530 pxor %xmm3,%xmm0
3531 movdqu %xmm0,0(%rdi)
3532
// Hash the 16 bytes just written (read back from the output buffer), with
// the 2^128 pad bit, then step both pointers.
3533 addq 0(%rdi),%r10
3534 adcq 8(%rdi),%r11
3535 adcq $1,%r12
3536 leaq 16(%rsi),%rsi
3537 leaq 16(%rdi),%rdi
// acc *= 128-bit value at 0(%rbp)/8(%rbp)
3538 movq 0+0(%rbp),%rax
3539 movq %rax,%r15
3540 mulq %r10
3541 movq %rax,%r13
3542 movq %rdx,%r14
3543 movq 0+0(%rbp),%rax
3544 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003545 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003546 addq %rax,%r14
3547 adcq %rdx,%r15
3548 movq 8+0(%rbp),%rax
3549 movq %rax,%r9
3550 mulq %r10
3551 addq %rax,%r14
3552 adcq $0,%rdx
3553 movq %rdx,%r10
3554 movq 8+0(%rbp),%rax
3555 mulq %r11
3556 addq %rax,%r15
3557 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003558 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003559 addq %r10,%r15
3560 adcq %rdx,%r9
// Reduce modulo 2^130 - 5 (fold the bits above bit 130 back in, times 5).
3561 movq %r13,%r10
3562 movq %r14,%r11
3563 movq %r15,%r12
3564 andq $3,%r12
3565 movq %r15,%r13
3566 andq $-4,%r13
3567 movq %r9,%r14
3568 shrdq $2,%r9,%r15
3569 shrq $2,%r9
3570 addq %r13,%r10
3571 adcq %r14,%r11
3572 adcq $0,%r12
3573 addq %r15,%r10
3574 adcq %r9,%r11
3575 adcq $0,%r12
3576
3577
// Shift the keystream queue so the next 16 unused bytes sit in %xmm0.
3578 movdqa %xmm4,%xmm0
3579 movdqa %xmm8,%xmm4
3580 movdqa %xmm12,%xmm8
3581 movdqa %xmm1,%xmm12
3582 movdqa %xmm5,%xmm1
3583 movdqa %xmm9,%xmm5
3584 movdqa %xmm13,%xmm9
3585 jmp seal_sse_128_seal
3586
// seal_sse_tail_16: handle a final partial chunk of %rbx bytes (fewer than
// 16 when reached from seal_sse_128_seal). Zero bytes left goes straight to
// seal_sse_finalize. Otherwise the bytes are gathered from (%rsi) one at a
// time, XORed with the keystream in %xmm0, written to (%rdi) one at a time,
// and the zero-padded result is absorbed into the Poly1305 accumulator.
3587seal_sse_tail_16:
3588 testq %rbx,%rbx
3589 jz seal_sse_finalize
3590
// %r8 = 16 * length, used below to index the 16-byte entries of .and_masks.
3591 movq %rbx,%r8
3592 shlq $4,%r8
3593 leaq .and_masks(%rip),%r13
3594 movq %rbx,%rcx
// Start at the last input byte and walk backwards, shifting %xmm15 left one
// byte and inserting at byte 0, so that byte i of %xmm15 ends up as input[i].
3595 leaq -1(%rsi,%rbx), %rsi
3596 pxor %xmm15,%xmm15
35971:
3598 pslldq $1,%xmm15
3599 pinsrb $0,(%rsi),%xmm15
3600 leaq -1(%rsi),%rsi
3601 decq %rcx
3602 jne 1b
3603
3604
// Apply the keystream. Bytes past the message length now hold raw keystream.
3605 pxor %xmm0,%xmm15
3606
3607
// Emit exactly %rbx output bytes, lowest byte first, using %xmm0 as a
// shifting copy so %xmm15 stays intact for hashing.
3608 movq %rbx,%rcx
3609 movdqu %xmm15,%xmm0
36102:
3611 pextrb $0,%xmm0,(%rdi)
3612 psrldq $1,%xmm0
3613 addq $1,%rdi
3614 subq $1,%rcx
3615 jnz 2b
3616
// Select mask entry (length - 1) to clear the bytes beyond the message, so
// the hashed block is the output zero-padded to 16 bytes.
3617 pand -16(%r13,%r8), %xmm15
// Hand-encoded movq %xmm15,%r13 (low 64 bits); pextrq fetches the high half.
3618.byte 102,77,15,126,253
3619 pextrq $1,%xmm15,%r14
// acc += padded block, plus the 2^128 pad bit; then multiply by the value at
// 0(%rbp)/8(%rbp) and reduce modulo 2^130 - 5.
3620 addq %r13,%r10
3621 adcq %r14,%r11
3622 adcq $1,%r12
3623 movq 0+0(%rbp),%rax
3624 movq %rax,%r15
3625 mulq %r10
3626 movq %rax,%r13
3627 movq %rdx,%r14
3628 movq 0+0(%rbp),%rax
3629 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003630 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003631 addq %rax,%r14
3632 adcq %rdx,%r15
3633 movq 8+0(%rbp),%rax
3634 movq %rax,%r9
3635 mulq %r10
3636 addq %rax,%r14
3637 adcq $0,%rdx
3638 movq %rdx,%r10
3639 movq 8+0(%rbp),%rax
3640 mulq %r11
3641 addq %rax,%r15
3642 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003643 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003644 addq %r10,%r15
3645 adcq %rdx,%r9
3646 movq %r13,%r10
3647 movq %r14,%r11
3648 movq %r15,%r12
3649 andq $3,%r12
3650 movq %r15,%r13
3651 andq $-4,%r13
3652 movq %r9,%r14
3653 shrdq $2,%r9,%r15
3654 shrq $2,%r9
3655 addq %r13,%r10
3656 adcq %r14,%r11
3657 adcq $0,%r12
3658 addq %r15,%r10
3659 adcq %r9,%r11
3660 adcq $0,%r12
// Falls through into seal_sse_finalize.
3661
// seal_sse_finalize: finish the Poly1305 computation, write the 16-byte
// result through a pointer popped from the stack, restore the callee-saved
// registers and return.
// The final absorbed block is the 16 bytes at 32(%rbp) - presumably the
// length block prepared earlier in the function; confirm against the
// prologue, which is outside this excerpt.
3662seal_sse_finalize:
3663 addq 32(%rbp),%r10
3664 adcq 8+32(%rbp),%r11
3665 adcq $1,%r12
// acc *= 128-bit value at 0(%rbp)/8(%rbp)
3666 movq 0+0(%rbp),%rax
3667 movq %rax,%r15
3668 mulq %r10
3669 movq %rax,%r13
3670 movq %rdx,%r14
3671 movq 0+0(%rbp),%rax
3672 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08003673 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05003674 addq %rax,%r14
3675 adcq %rdx,%r15
3676 movq 8+0(%rbp),%rax
3677 movq %rax,%r9
3678 mulq %r10
3679 addq %rax,%r14
3680 adcq $0,%rdx
3681 movq %rdx,%r10
3682 movq 8+0(%rbp),%rax
3683 mulq %r11
3684 addq %rax,%r15
3685 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08003686 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05003687 addq %r10,%r15
3688 adcq %rdx,%r9
// Partial reduction modulo 2^130 - 5 (fold high bits back in, times 5).
3689 movq %r13,%r10
3690 movq %r14,%r11
3691 movq %r15,%r12
3692 andq $3,%r12
3693 movq %r15,%r13
3694 andq $-4,%r13
3695 movq %r9,%r14
3696 shrdq $2,%r9,%r15
3697 shrq $2,%r9
3698 addq %r13,%r10
3699 adcq %r14,%r11
3700 adcq $0,%r12
3701 addq %r15,%r10
3702 adcq %r9,%r11
3703 adcq $0,%r12
3704
3705
// Final conditional subtraction, branch-free: compute acc - (2^130 - 5)
// (the three immediates are the limbs of that constant) and, if the
// subtraction borrowed, restore the saved copy via cmovc.
3706 movq %r10,%r13
3707 movq %r11,%r14
3708 movq %r12,%r15
3709 subq $-5,%r10
3710 sbbq $-1,%r11
3711 sbbq $3,%r12
3712 cmovcq %r13,%r10
3713 cmovcq %r14,%r11
3714 cmovcq %r15,%r12
3715
// Add the 128-bit value at 16(%rbp)/24(%rbp) (stored from %xmm6 by
// seal_sse_128); the carry out of bit 128 is discarded.
3716 addq 0+16(%rbp),%r10
3717 adcq 8+16(%rbp),%r11
3718
// Release 320 bytes of stack, then pop the saved pointer into %r9 and store
// the 16-byte result there.
3719 addq $288 + 32,%rsp
Robert Sloana94fe052017-02-21 08:49:28 -08003720
David Benjaminf31229b2017-01-25 14:08:15 -05003721 popq %r9
Robert Sloana94fe052017-02-21 08:49:28 -08003722
David Benjaminf31229b2017-01-25 14:08:15 -05003723 movq %r10,0(%r9)
3724 movq %r11,8(%r9)
3725
// Restore callee-saved registers.
3726 popq %r15
Robert Sloana94fe052017-02-21 08:49:28 -08003727
David Benjaminf31229b2017-01-25 14:08:15 -05003728 popq %r14
Robert Sloana94fe052017-02-21 08:49:28 -08003729
David Benjaminf31229b2017-01-25 14:08:15 -05003730 popq %r13
Robert Sloana94fe052017-02-21 08:49:28 -08003731
David Benjaminf31229b2017-01-25 14:08:15 -05003732 popq %r12
Robert Sloana94fe052017-02-21 08:49:28 -08003733
David Benjaminf31229b2017-01-25 14:08:15 -05003734 popq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08003735
David Benjaminf31229b2017-01-25 14:08:15 -05003736 popq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08003737
// 0xf3,0xc3 is the two-byte "rep ret" encoding of the return instruction.
David Benjaminf31229b2017-01-25 14:08:15 -05003738 .byte 0xf3,0xc3
Robert Sloana94fe052017-02-21 08:49:28 -08003739
David Benjaminf31229b2017-01-25 14:08:15 -05003740
// seal_sse_128: short-input seal path. Runs three ChaCha20 blocks in
// parallel from the 48 bytes of state at (%r9):
//   state 2 (xmm2/6/10/14): base counter; supplies the Poly1305 key halves
//   state 0 (xmm0/4/8/12):  base counter + 1 (keystream)
//   state 1 (xmm1/5/9/13):  base counter + 2 (keystream)
// xmm7, xmm11 and xmm15 hold copies of the initial rows for the feed-forward.
// After the rounds, the clamped first row of state 2 is stored at 0(%rbp),
// its second row at 16(%rbp), poly_hash_ad_internal is called, and control
// passes to seal_sse_128_seal with the keystream still in registers.
3741seal_sse_128:
3742 movdqu .chacha20_consts(%rip),%xmm0
3743 movdqa %xmm0,%xmm1
3744 movdqa %xmm0,%xmm2
3745 movdqu 0(%r9),%xmm4
3746 movdqa %xmm4,%xmm5
3747 movdqa %xmm4,%xmm6
3748 movdqu 16(%r9),%xmm8
3749 movdqa %xmm8,%xmm9
3750 movdqa %xmm8,%xmm10
3751 movdqu 32(%r9),%xmm14
3752 movdqa %xmm14,%xmm12
3753 paddd .sse_inc(%rip),%xmm12
3754 movdqa %xmm12,%xmm13
3755 paddd .sse_inc(%rip),%xmm13
3756 movdqa %xmm4,%xmm7
3757 movdqa %xmm8,%xmm11
3758 movdqa %xmm12,%xmm15
// %r10 counts 10 double rounds. It is reused as a plain loop counter here,
// before any Poly1305 accumulation takes place in this block.
3759 movq $10,%r10
// First half of the double round for states 0, 1, 2. %xmm3 is the scratch
// register for the 12- and 7-bit rotations; 16 and 8 use pshufb tables.
37601:
3761 paddd %xmm4,%xmm0
3762 pxor %xmm0,%xmm12
3763 pshufb .rol16(%rip),%xmm12
3764 paddd %xmm12,%xmm8
3765 pxor %xmm8,%xmm4
3766 movdqa %xmm4,%xmm3
3767 pslld $12,%xmm3
3768 psrld $20,%xmm4
3769 pxor %xmm3,%xmm4
3770 paddd %xmm4,%xmm0
3771 pxor %xmm0,%xmm12
3772 pshufb .rol8(%rip),%xmm12
3773 paddd %xmm12,%xmm8
3774 pxor %xmm8,%xmm4
3775 movdqa %xmm4,%xmm3
3776 pslld $7,%xmm3
3777 psrld $25,%xmm4
3778 pxor %xmm3,%xmm4
// Hand-encoded palignr $4/$8/$12 on xmm4/xmm8/xmm12 (diagonalize state 0).
3779.byte 102,15,58,15,228,4
3780.byte 102,69,15,58,15,192,8
3781.byte 102,69,15,58,15,228,12
3782 paddd %xmm5,%xmm1
3783 pxor %xmm1,%xmm13
3784 pshufb .rol16(%rip),%xmm13
3785 paddd %xmm13,%xmm9
3786 pxor %xmm9,%xmm5
3787 movdqa %xmm5,%xmm3
3788 pslld $12,%xmm3
3789 psrld $20,%xmm5
3790 pxor %xmm3,%xmm5
3791 paddd %xmm5,%xmm1
3792 pxor %xmm1,%xmm13
3793 pshufb .rol8(%rip),%xmm13
3794 paddd %xmm13,%xmm9
3795 pxor %xmm9,%xmm5
3796 movdqa %xmm5,%xmm3
3797 pslld $7,%xmm3
3798 psrld $25,%xmm5
3799 pxor %xmm3,%xmm5
// Hand-encoded palignr $4/$8/$12 on xmm5/xmm9/xmm13 (diagonalize state 1).
3800.byte 102,15,58,15,237,4
3801.byte 102,69,15,58,15,201,8
3802.byte 102,69,15,58,15,237,12
3803 paddd %xmm6,%xmm2
3804 pxor %xmm2,%xmm14
3805 pshufb .rol16(%rip),%xmm14
3806 paddd %xmm14,%xmm10
3807 pxor %xmm10,%xmm6
3808 movdqa %xmm6,%xmm3
3809 pslld $12,%xmm3
3810 psrld $20,%xmm6
3811 pxor %xmm3,%xmm6
3812 paddd %xmm6,%xmm2
3813 pxor %xmm2,%xmm14
3814 pshufb .rol8(%rip),%xmm14
3815 paddd %xmm14,%xmm10
3816 pxor %xmm10,%xmm6
3817 movdqa %xmm6,%xmm3
3818 pslld $7,%xmm3
3819 psrld $25,%xmm6
3820 pxor %xmm3,%xmm6
// Hand-encoded palignr $4/$8/$12 on xmm6/xmm10/xmm14 (diagonalize state 2).
3821.byte 102,15,58,15,246,4
3822.byte 102,69,15,58,15,210,8
3823.byte 102,69,15,58,15,246,12
// Second half of the double round (on the rotated rows), states 0, 1, 2.
3824 paddd %xmm4,%xmm0
3825 pxor %xmm0,%xmm12
3826 pshufb .rol16(%rip),%xmm12
3827 paddd %xmm12,%xmm8
3828 pxor %xmm8,%xmm4
3829 movdqa %xmm4,%xmm3
3830 pslld $12,%xmm3
3831 psrld $20,%xmm4
3832 pxor %xmm3,%xmm4
3833 paddd %xmm4,%xmm0
3834 pxor %xmm0,%xmm12
3835 pshufb .rol8(%rip),%xmm12
3836 paddd %xmm12,%xmm8
3837 pxor %xmm8,%xmm4
3838 movdqa %xmm4,%xmm3
3839 pslld $7,%xmm3
3840 psrld $25,%xmm4
3841 pxor %xmm3,%xmm4
// palignr $12/$8/$4: undo the row rotation for state 0.
3842.byte 102,15,58,15,228,12
3843.byte 102,69,15,58,15,192,8
3844.byte 102,69,15,58,15,228,4
3845 paddd %xmm5,%xmm1
3846 pxor %xmm1,%xmm13
3847 pshufb .rol16(%rip),%xmm13
3848 paddd %xmm13,%xmm9
3849 pxor %xmm9,%xmm5
3850 movdqa %xmm5,%xmm3
3851 pslld $12,%xmm3
3852 psrld $20,%xmm5
3853 pxor %xmm3,%xmm5
3854 paddd %xmm5,%xmm1
3855 pxor %xmm1,%xmm13
3856 pshufb .rol8(%rip),%xmm13
3857 paddd %xmm13,%xmm9
3858 pxor %xmm9,%xmm5
3859 movdqa %xmm5,%xmm3
3860 pslld $7,%xmm3
3861 psrld $25,%xmm5
3862 pxor %xmm3,%xmm5
// palignr $12/$8/$4: undo the row rotation for state 1.
3863.byte 102,15,58,15,237,12
3864.byte 102,69,15,58,15,201,8
3865.byte 102,69,15,58,15,237,4
3866 paddd %xmm6,%xmm2
3867 pxor %xmm2,%xmm14
3868 pshufb .rol16(%rip),%xmm14
3869 paddd %xmm14,%xmm10
3870 pxor %xmm10,%xmm6
3871 movdqa %xmm6,%xmm3
3872 pslld $12,%xmm3
3873 psrld $20,%xmm6
3874 pxor %xmm3,%xmm6
3875 paddd %xmm6,%xmm2
3876 pxor %xmm2,%xmm14
3877 pshufb .rol8(%rip),%xmm14
3878 paddd %xmm14,%xmm10
3879 pxor %xmm10,%xmm6
3880 movdqa %xmm6,%xmm3
3881 pslld $7,%xmm3
3882 psrld $25,%xmm6
3883 pxor %xmm3,%xmm6
// palignr $12/$8/$4: undo the row rotation for state 2.
3884.byte 102,15,58,15,246,12
3885.byte 102,69,15,58,15,210,8
3886.byte 102,69,15,58,15,246,4
3887
3888 decq %r10
3889 jnz 1b
// Feed-forward. State 2 only gets its first two rows (xmm2, xmm6) completed;
// xmm10 and xmm14 are not used after this point in the block. %xmm15 starts
// as state 0's counter and is bumped by .sse_inc to give state 1's counter.
3890 paddd .chacha20_consts(%rip),%xmm0
3891 paddd .chacha20_consts(%rip),%xmm1
3892 paddd .chacha20_consts(%rip),%xmm2
3893 paddd %xmm7,%xmm4
3894 paddd %xmm7,%xmm5
3895 paddd %xmm7,%xmm6
3896 paddd %xmm11,%xmm8
3897 paddd %xmm11,%xmm9
3898 paddd %xmm15,%xmm12
3899 paddd .sse_inc(%rip),%xmm15
3900 paddd %xmm15,%xmm13
3901
// Poly1305 key setup: clamp the first 16 bytes of state 2 with the first
// 16 bytes of .clamp and store them at 0(%rbp); store the next 16 bytes
// unmodified at 16(%rbp).
3902 pand .clamp(%rip),%xmm2
3903 movdqa %xmm2,0(%rbp)
3904 movdqa %xmm6,16(%rbp)
3905
// NOTE(review): movq %r8,%r8 is a no-op as written (same source and
// destination register).
3906 movq %r8,%r8
// poly_hash_ad_internal is defined elsewhere in the file; presumably it
// hashes the additional data before the message is processed - confirm.
3907 call poly_hash_ad_internal
3908 jmp seal_sse_128_seal
3909
3910
3911
3912
3913.p2align 6
# chacha20_poly1305_open_avx2 - AVX2 entry of the open path.
# Registers read here are prepared by code outside this view:
#   r9  - address of three 16-byte rows that are broadcast into both lanes of
#         ymm4/ymm8/ymm12 (presumably key and counter/nonce - confirm in caller)
#   rbx - byte count; compared against 192 and 320 to pick a short-input path
#   rbp - scratch area; every access is vmovdqa on a multiple of 32, so rbp
#         must be 32-byte aligned
#   r8  - left untouched for poly_hash_ad_internal
# Scratch slots written here: 0(%rbp) masked hash key, 64/96/160(%rbp) the
# initial rows 1/2/3 of the two-lane state.
3914chacha20_poly1305_open_avx2:
3915 vzeroupper
3916 vmovdqa .chacha20_consts(%rip),%ymm0
3917 vbroadcasti128 0(%r9),%ymm4
3918 vbroadcasti128 16(%r9),%ymm8
3919 vbroadcasti128 32(%r9),%ymm12
# The 32-byte operand starts at .avx2_init and runs into the adjacent .sse_inc
# constant, so dword 0 gets +0 in the low lane and +1 in the high lane: the two
# lanes hold consecutive blocks.
3920 vpaddd .avx2_init(%rip),%ymm12,%ymm12
3921 cmpq $192,%rbx
3922 jbe open_avx2_192
3923 cmpq $320,%rbx
3924 jbe open_avx2_320
3925
# Keep the initial rows 1..3 so they can be added back after the rounds and
# reused for later blocks.
3926 vmovdqa %ymm4,64(%rbp)
3927 vmovdqa %ymm8,96(%rbp)
3928 vmovdqa %ymm12,160(%rbp)
3929 movq $10,%r10
# 10 passes. Each pass runs the add/xor/rotate sequence (rotates by 16, 12, 8
# and 7 bits) on the rows, rotates rows 1..3 by 1/2/3 dwords with vpalignr,
# runs the sequence again and rotates the rows back.
39301:
3931 vpaddd %ymm4,%ymm0,%ymm0
3932 vpxor %ymm0,%ymm12,%ymm12
3933 vpshufb .rol16(%rip),%ymm12,%ymm12
3934 vpaddd %ymm12,%ymm8,%ymm8
3935 vpxor %ymm8,%ymm4,%ymm4
3936 vpsrld $20,%ymm4,%ymm3
3937 vpslld $12,%ymm4,%ymm4
3938 vpxor %ymm3,%ymm4,%ymm4
3939 vpaddd %ymm4,%ymm0,%ymm0
3940 vpxor %ymm0,%ymm12,%ymm12
3941 vpshufb .rol8(%rip),%ymm12,%ymm12
3942 vpaddd %ymm12,%ymm8,%ymm8
3943 vpxor %ymm8,%ymm4,%ymm4
3944 vpslld $7,%ymm4,%ymm3
3945 vpsrld $25,%ymm4,%ymm4
3946 vpxor %ymm3,%ymm4,%ymm4
3947 vpalignr $12,%ymm12,%ymm12,%ymm12
3948 vpalignr $8,%ymm8,%ymm8,%ymm8
3949 vpalignr $4,%ymm4,%ymm4,%ymm4
3950 vpaddd %ymm4,%ymm0,%ymm0
3951 vpxor %ymm0,%ymm12,%ymm12
3952 vpshufb .rol16(%rip),%ymm12,%ymm12
3953 vpaddd %ymm12,%ymm8,%ymm8
3954 vpxor %ymm8,%ymm4,%ymm4
3955 vpsrld $20,%ymm4,%ymm3
3956 vpslld $12,%ymm4,%ymm4
3957 vpxor %ymm3,%ymm4,%ymm4
3958 vpaddd %ymm4,%ymm0,%ymm0
3959 vpxor %ymm0,%ymm12,%ymm12
3960 vpshufb .rol8(%rip),%ymm12,%ymm12
3961 vpaddd %ymm12,%ymm8,%ymm8
3962 vpxor %ymm8,%ymm4,%ymm4
3963 vpslld $7,%ymm4,%ymm3
3964 vpsrld $25,%ymm4,%ymm4
3965 vpxor %ymm3,%ymm4,%ymm4
3966 vpalignr $4,%ymm12,%ymm12,%ymm12
3967 vpalignr $8,%ymm8,%ymm8,%ymm8
3968 vpalignr $12,%ymm4,%ymm4,%ymm4
3969
3970 decq %r10
3971 jne 1b
# Add the initial state back to obtain the keystream for both lanes.
3972 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
3973 vpaddd 64(%rbp),%ymm4,%ymm4
3974 vpaddd 96(%rbp),%ymm8,%ymm8
3975 vpaddd 160(%rbp),%ymm12,%ymm12
3976
# ymm3 = low lane of row 0 followed by low lane of row 1, i.e. the first
# 32 bytes of the low-lane block.
3977 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
3978
# .clamp masks bytes 0-15 and leaves bytes 16-31 unchanged (second half of the
# mask is all ones). The result at 0(%rbp) is presumably the Poly1305 key
# (clamped r, then s) - the multiply code later in this routine reads
# 0(%rbp) and 8(%rbp).
3979 vpand .clamp(%rip),%ymm3,%ymm3
3980 vmovdqa %ymm3,0(%rbp)
3981
# Collect the high-lane block: ymm0 = rows 0,1 and ymm4 = rows 2,3, giving
# 64 bytes of keystream in output order.
3982 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
3983 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
3984
# Moving r8 onto itself changes nothing; presumably generator residue for
# handing the AD length to the callee - confirm against poly_hash_ad_internal.
3985 movq %r8,%r8
3986 call poly_hash_ad_internal
# rcx = 0: byte offset for the hashing loop that follows.
3987 xorq %rcx,%rcx
3988
# Hash, then decrypt, the first 64 input bytes.
# Loop over rcx = 0, 16, 32, 48. Each pass:
#   - adds the 16 bytes at rsi+rcx into the accumulator r10:r11:r12; the
#     adcq $1 also adds the 2^128 bit above the 16-byte block,
#   - multiplies the accumulator by the 128-bit value at 0(%rbp)/8(%rbp),
#     leaving the product in r13:r14:r15:r9,
#   - folds the bits above 2^130 back in as 5 times their value (once shifted
#     left by 2 via the -4 mask, once shifted right by 2), which is a partial
#     reduction modulo 2^130-5.
39891:
3990 addq 0(%rsi,%rcx), %r10
3991 adcq 8+0(%rsi,%rcx), %r11
3992 adcq $1,%r12
3993 movq 0+0(%rbp),%rax
3994 movq %rax,%r15
3995 mulq %r10
3996 movq %rax,%r13
3997 movq %rdx,%r14
3998 movq 0+0(%rbp),%rax
3999 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004000 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004001 addq %rax,%r14
4002 adcq %rdx,%r15
4003 movq 8+0(%rbp),%rax
4004 movq %rax,%r9
4005 mulq %r10
4006 addq %rax,%r14
4007 adcq $0,%rdx
4008 movq %rdx,%r10
4009 movq 8+0(%rbp),%rax
4010 mulq %r11
4011 addq %rax,%r15
4012 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004013 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004014 addq %r10,%r15
4015 adcq %rdx,%r9
# Reduction: new accumulator = low 130 bits + 4*high + high.
4016 movq %r13,%r10
4017 movq %r14,%r11
4018 movq %r15,%r12
4019 andq $3,%r12
4020 movq %r15,%r13
4021 andq $-4,%r13
4022 movq %r9,%r14
4023 shrdq $2,%r9,%r15
4024 shrq $2,%r9
4025 addq %r13,%r10
4026 adcq %r14,%r11
4027 adcq $0,%r12
4028 addq %r15,%r10
4029 adcq %r9,%r11
4030 adcq $0,%r12
4031
4032 addq $16,%rcx
4033 cmpq $64,%rcx
4034 jne 1b
4035
# XOR the 64 bytes just hashed with the keystream held in ymm0/ymm4, write
# them to rdi, then advance both pointers and reduce the remaining count.
4036 vpxor 0(%rsi),%ymm0,%ymm0
4037 vpxor 32(%rsi),%ymm4,%ymm4
4038 vmovdqu %ymm0,0(%rdi)
4039 vmovdqu %ymm4,32(%rdi)
4040 leaq 64(%rsi),%rsi
4041 leaq 64(%rdi),%rdi
4042 subq $64,%rbx
# Main loop: while at least 512 bytes remain (rbx), generate 512 bytes of
# keystream with four two-lane states and hash the same 512 input bytes,
# then XOR input with keystream into rdi.
# Register layout: state k (k = 0..3) lives in ymm(k), ymm(4+k), ymm(8+k),
# ymm(12+k). Hash accumulator is r10:r11:r12; the multiplier is read from
# 0(%rbp)/8(%rbp).
40431:
4044
4045 cmpq $512,%rbx
4046 jb 3f
4047 vmovdqa .chacha20_consts(%rip),%ymm0
4048 vmovdqa 64(%rbp),%ymm4
4049 vmovdqa 96(%rbp),%ymm8
4050 vmovdqa %ymm0,%ymm1
4051 vmovdqa %ymm4,%ymm5
4052 vmovdqa %ymm8,%ymm9
4053 vmovdqa %ymm0,%ymm2
4054 vmovdqa %ymm4,%ymm6
4055 vmovdqa %ymm8,%ymm10
4056 vmovdqa %ymm0,%ymm3
4057 vmovdqa %ymm4,%ymm7
4058 vmovdqa %ymm8,%ymm11
# .avx2_inc adds 2 to dword 0 of each lane. Starting from the counter row saved
# at 160(%rbp): ymm15 = +2, ymm14 = +4, ymm13 = +6, ymm12 = +8. The rows are
# saved for the final add; the new ymm12 replaces the saved row at 160(%rbp).
4059 vmovdqa .avx2_inc(%rip),%ymm12
4060 vpaddd 160(%rbp),%ymm12,%ymm15
4061 vpaddd %ymm15,%ymm12,%ymm14
4062 vpaddd %ymm14,%ymm12,%ymm13
4063 vpaddd %ymm13,%ymm12,%ymm12
4064 vmovdqa %ymm15,256(%rbp)
4065 vmovdqa %ymm14,224(%rbp)
4066 vmovdqa %ymm13,192(%rbp)
4067 vmovdqa %ymm12,160(%rbp)
4068
4069 xorq %rcx,%rcx
# Inner loop: each pass interleaves one double round on all four states with
# three 16-byte hash updates (input at rsi+rcx, +16, +32) and then advances
# rcx by 48. It stops at rcx == 480, i.e. after 10 passes and 480 hashed bytes.
# ymm8 is also used as scratch for the rotate masks and shift temporaries, so
# the live ymm8 row is parked at 128(%rbp) around those uses.
40702:
4071 addq 0*8(%rsi,%rcx), %r10
4072 adcq 8+0*8(%rsi,%rcx), %r11
4073 adcq $1,%r12
4074 vmovdqa %ymm8,128(%rbp)
4075 vmovdqa .rol16(%rip),%ymm8
4076 vpaddd %ymm7,%ymm3,%ymm3
4077 vpaddd %ymm6,%ymm2,%ymm2
4078 vpaddd %ymm5,%ymm1,%ymm1
4079 vpaddd %ymm4,%ymm0,%ymm0
4080 vpxor %ymm3,%ymm15,%ymm15
4081 vpxor %ymm2,%ymm14,%ymm14
4082 vpxor %ymm1,%ymm13,%ymm13
4083 vpxor %ymm0,%ymm12,%ymm12
4084 movq 0+0(%rbp),%rdx
4085 movq %rdx,%r15
4086 mulxq %r10,%r13,%r14
4087 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004088 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004089 addq %rax,%r14
4090 adcq %rdx,%r15
4091 vpshufb %ymm8,%ymm15,%ymm15
4092 vpshufb %ymm8,%ymm14,%ymm14
4093 vpshufb %ymm8,%ymm13,%ymm13
4094 vpshufb %ymm8,%ymm12,%ymm12
4095 vmovdqa 128(%rbp),%ymm8
4096 vpaddd %ymm15,%ymm11,%ymm11
4097 vpaddd %ymm14,%ymm10,%ymm10
4098 vpaddd %ymm13,%ymm9,%ymm9
4099 vpaddd %ymm12,%ymm8,%ymm8
4100 movq 8+0(%rbp),%rdx
4101 mulxq %r10,%r10,%rax
4102 addq %r10,%r14
4103 mulxq %r11,%r11,%r9
4104 adcq %r11,%r15
4105 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004106 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004107 vpxor %ymm11,%ymm7,%ymm7
4108 vpxor %ymm10,%ymm6,%ymm6
4109 vpxor %ymm9,%ymm5,%ymm5
4110 vpxor %ymm8,%ymm4,%ymm4
4111 vmovdqa %ymm8,128(%rbp)
4112 vpsrld $20,%ymm7,%ymm8
4113 vpslld $32-20,%ymm7,%ymm7
4114 vpxor %ymm8,%ymm7,%ymm7
4115 vpsrld $20,%ymm6,%ymm8
4116 vpslld $32-20,%ymm6,%ymm6
4117 vpxor %ymm8,%ymm6,%ymm6
4118 vpsrld $20,%ymm5,%ymm8
4119 addq %rax,%r15
4120 adcq %rdx,%r9
4121 vpslld $32-20,%ymm5,%ymm5
4122 vpxor %ymm8,%ymm5,%ymm5
4123 vpsrld $20,%ymm4,%ymm8
4124 vpslld $32-20,%ymm4,%ymm4
4125 vpxor %ymm8,%ymm4,%ymm4
4126 vmovdqa .rol8(%rip),%ymm8
4127 vpaddd %ymm7,%ymm3,%ymm3
4128 vpaddd %ymm6,%ymm2,%ymm2
4129 vpaddd %ymm5,%ymm1,%ymm1
4130 vpaddd %ymm4,%ymm0,%ymm0
4131 movq %r13,%r10
4132 movq %r14,%r11
4133 movq %r15,%r12
4134 andq $3,%r12
4135 movq %r15,%r13
4136 andq $-4,%r13
4137 movq %r9,%r14
4138 shrdq $2,%r9,%r15
4139 shrq $2,%r9
4140 addq %r13,%r10
4141 adcq %r14,%r11
4142 adcq $0,%r12
4143 addq %r15,%r10
4144 adcq %r9,%r11
4145 adcq $0,%r12
4146 vpxor %ymm3,%ymm15,%ymm15
4147 vpxor %ymm2,%ymm14,%ymm14
4148 vpxor %ymm1,%ymm13,%ymm13
4149 vpxor %ymm0,%ymm12,%ymm12
4150 vpshufb %ymm8,%ymm15,%ymm15
4151 vpshufb %ymm8,%ymm14,%ymm14
4152 vpshufb %ymm8,%ymm13,%ymm13
4153 vpshufb %ymm8,%ymm12,%ymm12
4154 vmovdqa 128(%rbp),%ymm8
4155 addq 2*8(%rsi,%rcx), %r10
4156 adcq 8+2*8(%rsi,%rcx), %r11
4157 adcq $1,%r12
4158 vpaddd %ymm15,%ymm11,%ymm11
4159 vpaddd %ymm14,%ymm10,%ymm10
4160 vpaddd %ymm13,%ymm9,%ymm9
4161 vpaddd %ymm12,%ymm8,%ymm8
4162 vpxor %ymm11,%ymm7,%ymm7
4163 vpxor %ymm10,%ymm6,%ymm6
4164 vpxor %ymm9,%ymm5,%ymm5
4165 vpxor %ymm8,%ymm4,%ymm4
4166 movq 0+0(%rbp),%rdx
4167 movq %rdx,%r15
4168 mulxq %r10,%r13,%r14
4169 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004170 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004171 addq %rax,%r14
4172 adcq %rdx,%r15
4173 vmovdqa %ymm8,128(%rbp)
4174 vpsrld $25,%ymm7,%ymm8
4175 vpslld $32-25,%ymm7,%ymm7
4176 vpxor %ymm8,%ymm7,%ymm7
4177 vpsrld $25,%ymm6,%ymm8
4178 vpslld $32-25,%ymm6,%ymm6
4179 vpxor %ymm8,%ymm6,%ymm6
4180 vpsrld $25,%ymm5,%ymm8
4181 vpslld $32-25,%ymm5,%ymm5
4182 vpxor %ymm8,%ymm5,%ymm5
4183 vpsrld $25,%ymm4,%ymm8
4184 vpslld $32-25,%ymm4,%ymm4
4185 vpxor %ymm8,%ymm4,%ymm4
4186 vmovdqa 128(%rbp),%ymm8
4187 vpalignr $4,%ymm7,%ymm7,%ymm7
4188 vpalignr $8,%ymm11,%ymm11,%ymm11
4189 vpalignr $12,%ymm15,%ymm15,%ymm15
4190 vpalignr $4,%ymm6,%ymm6,%ymm6
4191 movq 8+0(%rbp),%rdx
4192 mulxq %r10,%r10,%rax
4193 addq %r10,%r14
4194 mulxq %r11,%r11,%r9
4195 adcq %r11,%r15
4196 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004197 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004198 vpalignr $8,%ymm10,%ymm10,%ymm10
4199 vpalignr $12,%ymm14,%ymm14,%ymm14
4200 vpalignr $4,%ymm5,%ymm5,%ymm5
4201 vpalignr $8,%ymm9,%ymm9,%ymm9
4202 vpalignr $12,%ymm13,%ymm13,%ymm13
4203 vpalignr $4,%ymm4,%ymm4,%ymm4
4204 vpalignr $8,%ymm8,%ymm8,%ymm8
4205 vpalignr $12,%ymm12,%ymm12,%ymm12
4206 vmovdqa %ymm8,128(%rbp)
4207 vmovdqa .rol16(%rip),%ymm8
4208 vpaddd %ymm7,%ymm3,%ymm3
4209 vpaddd %ymm6,%ymm2,%ymm2
4210 vpaddd %ymm5,%ymm1,%ymm1
4211 vpaddd %ymm4,%ymm0,%ymm0
4212 vpxor %ymm3,%ymm15,%ymm15
4213 vpxor %ymm2,%ymm14,%ymm14
4214 vpxor %ymm1,%ymm13,%ymm13
4215 vpxor %ymm0,%ymm12,%ymm12
4216 addq %rax,%r15
4217 adcq %rdx,%r9
4218 vpshufb %ymm8,%ymm15,%ymm15
4219 vpshufb %ymm8,%ymm14,%ymm14
4220 vpshufb %ymm8,%ymm13,%ymm13
4221 vpshufb %ymm8,%ymm12,%ymm12
4222 vmovdqa 128(%rbp),%ymm8
4223 vpaddd %ymm15,%ymm11,%ymm11
4224 vpaddd %ymm14,%ymm10,%ymm10
4225 vpaddd %ymm13,%ymm9,%ymm9
4226 vpaddd %ymm12,%ymm8,%ymm8
4227 movq %r13,%r10
4228 movq %r14,%r11
4229 movq %r15,%r12
4230 andq $3,%r12
4231 movq %r15,%r13
4232 andq $-4,%r13
4233 movq %r9,%r14
4234 shrdq $2,%r9,%r15
4235 shrq $2,%r9
4236 addq %r13,%r10
4237 adcq %r14,%r11
4238 adcq $0,%r12
4239 addq %r15,%r10
4240 adcq %r9,%r11
4241 adcq $0,%r12
4242 vpxor %ymm11,%ymm7,%ymm7
4243 vpxor %ymm10,%ymm6,%ymm6
4244 vpxor %ymm9,%ymm5,%ymm5
4245 vpxor %ymm8,%ymm4,%ymm4
4246 vmovdqa %ymm8,128(%rbp)
4247 vpsrld $20,%ymm7,%ymm8
4248 vpslld $32-20,%ymm7,%ymm7
4249 vpxor %ymm8,%ymm7,%ymm7
4250 addq 4*8(%rsi,%rcx), %r10
4251 adcq 8+4*8(%rsi,%rcx), %r11
4252 adcq $1,%r12
4253
# leaq keeps the flags intact while moving rcx past the three blocks consumed
# in this pass.
4254 leaq 48(%rcx),%rcx
4255 vpsrld $20,%ymm6,%ymm8
4256 vpslld $32-20,%ymm6,%ymm6
4257 vpxor %ymm8,%ymm6,%ymm6
4258 vpsrld $20,%ymm5,%ymm8
4259 vpslld $32-20,%ymm5,%ymm5
4260 vpxor %ymm8,%ymm5,%ymm5
4261 vpsrld $20,%ymm4,%ymm8
4262 vpslld $32-20,%ymm4,%ymm4
4263 vpxor %ymm8,%ymm4,%ymm4
4264 vmovdqa .rol8(%rip),%ymm8
4265 vpaddd %ymm7,%ymm3,%ymm3
4266 vpaddd %ymm6,%ymm2,%ymm2
4267 vpaddd %ymm5,%ymm1,%ymm1
4268 vpaddd %ymm4,%ymm0,%ymm0
4269 vpxor %ymm3,%ymm15,%ymm15
4270 vpxor %ymm2,%ymm14,%ymm14
4271 vpxor %ymm1,%ymm13,%ymm13
4272 vpxor %ymm0,%ymm12,%ymm12
4273 movq 0+0(%rbp),%rdx
4274 movq %rdx,%r15
4275 mulxq %r10,%r13,%r14
4276 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004277 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004278 addq %rax,%r14
4279 adcq %rdx,%r15
4280 vpshufb %ymm8,%ymm15,%ymm15
4281 vpshufb %ymm8,%ymm14,%ymm14
4282 vpshufb %ymm8,%ymm13,%ymm13
4283 vpshufb %ymm8,%ymm12,%ymm12
4284 vmovdqa 128(%rbp),%ymm8
4285 vpaddd %ymm15,%ymm11,%ymm11
4286 vpaddd %ymm14,%ymm10,%ymm10
4287 vpaddd %ymm13,%ymm9,%ymm9
4288 movq 8+0(%rbp),%rdx
4289 mulxq %r10,%r10,%rax
4290 addq %r10,%r14
4291 mulxq %r11,%r11,%r9
4292 adcq %r11,%r15
4293 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004294 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004295 vpaddd %ymm12,%ymm8,%ymm8
4296 vpxor %ymm11,%ymm7,%ymm7
4297 vpxor %ymm10,%ymm6,%ymm6
4298 vpxor %ymm9,%ymm5,%ymm5
4299 vpxor %ymm8,%ymm4,%ymm4
4300 vmovdqa %ymm8,128(%rbp)
4301 vpsrld $25,%ymm7,%ymm8
4302 vpslld $32-25,%ymm7,%ymm7
4303 addq %rax,%r15
4304 adcq %rdx,%r9
4305 vpxor %ymm8,%ymm7,%ymm7
4306 vpsrld $25,%ymm6,%ymm8
4307 vpslld $32-25,%ymm6,%ymm6
4308 vpxor %ymm8,%ymm6,%ymm6
4309 vpsrld $25,%ymm5,%ymm8
4310 vpslld $32-25,%ymm5,%ymm5
4311 vpxor %ymm8,%ymm5,%ymm5
4312 vpsrld $25,%ymm4,%ymm8
4313 vpslld $32-25,%ymm4,%ymm4
4314 vpxor %ymm8,%ymm4,%ymm4
4315 vmovdqa 128(%rbp),%ymm8
4316 vpalignr $12,%ymm7,%ymm7,%ymm7
4317 vpalignr $8,%ymm11,%ymm11,%ymm11
4318 vpalignr $4,%ymm15,%ymm15,%ymm15
4319 vpalignr $12,%ymm6,%ymm6,%ymm6
4320 vpalignr $8,%ymm10,%ymm10,%ymm10
4321 vpalignr $4,%ymm14,%ymm14,%ymm14
4322 vpalignr $12,%ymm5,%ymm5,%ymm5
4323 movq %r13,%r10
4324 movq %r14,%r11
4325 movq %r15,%r12
4326 andq $3,%r12
4327 movq %r15,%r13
4328 andq $-4,%r13
4329 movq %r9,%r14
4330 shrdq $2,%r9,%r15
4331 shrq $2,%r9
4332 addq %r13,%r10
4333 adcq %r14,%r11
4334 adcq $0,%r12
4335 addq %r15,%r10
4336 adcq %r9,%r11
4337 adcq $0,%r12
4338 vpalignr $8,%ymm9,%ymm9,%ymm9
4339 vpalignr $4,%ymm13,%ymm13,%ymm13
4340 vpalignr $12,%ymm4,%ymm4,%ymm4
4341 vpalignr $8,%ymm8,%ymm8,%ymm8
4342 vpalignr $4,%ymm12,%ymm12,%ymm12
4343
4344 cmpq $60*8,%rcx
4345 jne 2b
# Rounds done: add the saved initial rows (per-state counter rows come from
# 256/224/192/160(%rbp)) to turn the states into keystream.
4346 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
4347 vpaddd 64(%rbp),%ymm7,%ymm7
4348 vpaddd 96(%rbp),%ymm11,%ymm11
4349 vpaddd 256(%rbp),%ymm15,%ymm15
4350 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
4351 vpaddd 64(%rbp),%ymm6,%ymm6
4352 vpaddd 96(%rbp),%ymm10,%ymm10
4353 vpaddd 224(%rbp),%ymm14,%ymm14
4354 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
4355 vpaddd 64(%rbp),%ymm5,%ymm5
4356 vpaddd 96(%rbp),%ymm9,%ymm9
4357 vpaddd 192(%rbp),%ymm13,%ymm13
4358 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4359 vpaddd 64(%rbp),%ymm4,%ymm4
4360 vpaddd 96(%rbp),%ymm8,%ymm8
4361 vpaddd 160(%rbp),%ymm12,%ymm12
4362
# ymm0 is needed as a temporary while the first 128 output bytes are
# assembled, so its row is parked at 128(%rbp) until that is done.
# The two input blocks at offsets 480 and 496 have not been hashed yet; they
# are absorbed below, interleaved with the XOR/store of the 512 bytes.
4363 vmovdqa %ymm0,128(%rbp)
4364 addq 60*8(%rsi),%r10
4365 adcq 8+60*8(%rsi),%r11
4366 adcq $1,%r12
4367 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
4368 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
4369 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
4370 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
4371 vpxor 0+0(%rsi),%ymm0,%ymm0
4372 vpxor 32+0(%rsi),%ymm3,%ymm3
4373 vpxor 64+0(%rsi),%ymm7,%ymm7
4374 vpxor 96+0(%rsi),%ymm11,%ymm11
4375 vmovdqu %ymm0,0+0(%rdi)
4376 vmovdqu %ymm3,32+0(%rdi)
4377 vmovdqu %ymm7,64+0(%rdi)
4378 vmovdqu %ymm11,96+0(%rdi)
4379
4380 vmovdqa 128(%rbp),%ymm0
4381 movq 0+0(%rbp),%rax
4382 movq %rax,%r15
4383 mulq %r10
4384 movq %rax,%r13
4385 movq %rdx,%r14
4386 movq 0+0(%rbp),%rax
4387 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004388 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004389 addq %rax,%r14
4390 adcq %rdx,%r15
4391 movq 8+0(%rbp),%rax
4392 movq %rax,%r9
4393 mulq %r10
4394 addq %rax,%r14
4395 adcq $0,%rdx
4396 movq %rdx,%r10
4397 movq 8+0(%rbp),%rax
4398 mulq %r11
4399 addq %rax,%r15
4400 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004401 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004402 addq %r10,%r15
4403 adcq %rdx,%r9
4404 movq %r13,%r10
4405 movq %r14,%r11
4406 movq %r15,%r12
4407 andq $3,%r12
4408 movq %r15,%r13
4409 andq $-4,%r13
4410 movq %r9,%r14
4411 shrdq $2,%r9,%r15
4412 shrq $2,%r9
4413 addq %r13,%r10
4414 adcq %r14,%r11
4415 adcq $0,%r12
4416 addq %r15,%r10
4417 adcq %r9,%r11
4418 adcq $0,%r12
4419 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
4420 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
4421 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
4422 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
4423 vpxor 0+128(%rsi),%ymm3,%ymm3
4424 vpxor 32+128(%rsi),%ymm2,%ymm2
4425 vpxor 64+128(%rsi),%ymm6,%ymm6
4426 vpxor 96+128(%rsi),%ymm10,%ymm10
4427 vmovdqu %ymm3,0+128(%rdi)
4428 vmovdqu %ymm2,32+128(%rdi)
4429 vmovdqu %ymm6,64+128(%rdi)
4430 vmovdqu %ymm10,96+128(%rdi)
4431 addq 60*8+16(%rsi),%r10
4432 adcq 8+60*8+16(%rsi),%r11
4433 adcq $1,%r12
4434 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
4435 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
4436 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
4437 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
4438 vpxor 0+256(%rsi),%ymm3,%ymm3
4439 vpxor 32+256(%rsi),%ymm1,%ymm1
4440 vpxor 64+256(%rsi),%ymm5,%ymm5
4441 vpxor 96+256(%rsi),%ymm9,%ymm9
4442 vmovdqu %ymm3,0+256(%rdi)
4443 vmovdqu %ymm1,32+256(%rdi)
4444 vmovdqu %ymm5,64+256(%rdi)
4445 vmovdqu %ymm9,96+256(%rdi)
4446 movq 0+0(%rbp),%rax
4447 movq %rax,%r15
4448 mulq %r10
4449 movq %rax,%r13
4450 movq %rdx,%r14
4451 movq 0+0(%rbp),%rax
4452 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004453 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004454 addq %rax,%r14
4455 adcq %rdx,%r15
4456 movq 8+0(%rbp),%rax
4457 movq %rax,%r9
4458 mulq %r10
4459 addq %rax,%r14
4460 adcq $0,%rdx
4461 movq %rdx,%r10
4462 movq 8+0(%rbp),%rax
4463 mulq %r11
4464 addq %rax,%r15
4465 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004466 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004467 addq %r10,%r15
4468 adcq %rdx,%r9
4469 movq %r13,%r10
4470 movq %r14,%r11
4471 movq %r15,%r12
4472 andq $3,%r12
4473 movq %r15,%r13
4474 andq $-4,%r13
4475 movq %r9,%r14
4476 shrdq $2,%r9,%r15
4477 shrq $2,%r9
4478 addq %r13,%r10
4479 adcq %r14,%r11
4480 adcq $0,%r12
4481 addq %r15,%r10
4482 adcq %r9,%r11
4483 adcq $0,%r12
4484 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
4485 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
4486 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
4487 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
4488 vpxor 0+384(%rsi),%ymm3,%ymm3
4489 vpxor 32+384(%rsi),%ymm0,%ymm0
4490 vpxor 64+384(%rsi),%ymm4,%ymm4
4491 vpxor 96+384(%rsi),%ymm8,%ymm8
4492 vmovdqu %ymm3,0+384(%rdi)
4493 vmovdqu %ymm0,32+384(%rdi)
4494 vmovdqu %ymm4,64+384(%rdi)
4495 vmovdqu %ymm8,96+384(%rdi)
4496
# 512 bytes done: advance input/output pointers and try another full pass.
4497 leaq 512(%rsi),%rsi
4498 leaq 512(%rdi),%rdi
4499 subq $512,%rbx
4500 jmp 1b
# Reached when fewer than 512 bytes remain. vzeroupper leaves the flags
# untouched, so the je still acts on the testq result: when rbx is zero,
# control leaves for open_sse_finalize; otherwise it falls through to the
# size-specific tail code.
45013:
4502 testq %rbx,%rbx
4503 vzeroupper
4504 je open_sse_finalize
# Tail of at most 128 bytes (larger counts branch to the next section).
# One two-lane state is generated. r8 advances by 16 per double round and the
# loop ends at r8 == 160, i.e. 10 double rounds. While r8 < rcx, each double
# round is preceded by one 16-byte hash update of the input at rsi+r8, where
# rcx is rbx rounded down to a multiple of 16. A final partial block
# (rbx mod 16 bytes) is not hashed in this section.
# Nothing is XORed or stored here: the 128 keystream bytes are left, in output
# order, in ymm0, ymm4, ymm8, ymm12 for open_avx2_tail_loop (outside this view).
45053:
4506 cmpq $128,%rbx
4507 ja 3f
4508 vmovdqa .chacha20_consts(%rip),%ymm0
4509 vmovdqa 64(%rbp),%ymm4
4510 vmovdqa 96(%rbp),%ymm8
# Advance the saved counter row by 2 per lane and keep it for the final add.
4511 vmovdqa .avx2_inc(%rip),%ymm12
4512 vpaddd 160(%rbp),%ymm12,%ymm12
4513 vmovdqa %ymm12,160(%rbp)
4514
4515 xorq %r8,%r8
4516 movq %rbx,%rcx
4517 andq $-16,%rcx
# NOTE(review): andq already sets ZF from the same value, so this testq looks
# redundant; harmless.
4518 testq %rcx,%rcx
# No whole 16-byte block to hash: enter the loop at the rounds.
4519 je 2f
45201:
4521 addq 0*8(%rsi,%r8), %r10
4522 adcq 8+0*8(%rsi,%r8), %r11
4523 adcq $1,%r12
4524 movq 0+0(%rbp),%rax
4525 movq %rax,%r15
4526 mulq %r10
4527 movq %rax,%r13
4528 movq %rdx,%r14
4529 movq 0+0(%rbp),%rax
4530 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004531 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004532 addq %rax,%r14
4533 adcq %rdx,%r15
4534 movq 8+0(%rbp),%rax
4535 movq %rax,%r9
4536 mulq %r10
4537 addq %rax,%r14
4538 adcq $0,%rdx
4539 movq %rdx,%r10
4540 movq 8+0(%rbp),%rax
4541 mulq %r11
4542 addq %rax,%r15
4543 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004544 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05004545 addq %r10,%r15
4546 adcq %rdx,%r9
4547 movq %r13,%r10
4548 movq %r14,%r11
4549 movq %r15,%r12
4550 andq $3,%r12
4551 movq %r15,%r13
4552 andq $-4,%r13
4553 movq %r9,%r14
4554 shrdq $2,%r9,%r15
4555 shrq $2,%r9
4556 addq %r13,%r10
4557 adcq %r14,%r11
4558 adcq $0,%r12
4559 addq %r15,%r10
4560 adcq %r9,%r11
4561 adcq $0,%r12
4562
45632:
4564 addq $16,%r8
4565 vpaddd %ymm4,%ymm0,%ymm0
4566 vpxor %ymm0,%ymm12,%ymm12
4567 vpshufb .rol16(%rip),%ymm12,%ymm12
4568 vpaddd %ymm12,%ymm8,%ymm8
4569 vpxor %ymm8,%ymm4,%ymm4
4570 vpsrld $20,%ymm4,%ymm3
4571 vpslld $12,%ymm4,%ymm4
4572 vpxor %ymm3,%ymm4,%ymm4
4573 vpaddd %ymm4,%ymm0,%ymm0
4574 vpxor %ymm0,%ymm12,%ymm12
4575 vpshufb .rol8(%rip),%ymm12,%ymm12
4576 vpaddd %ymm12,%ymm8,%ymm8
4577 vpxor %ymm8,%ymm4,%ymm4
4578 vpslld $7,%ymm4,%ymm3
4579 vpsrld $25,%ymm4,%ymm4
4580 vpxor %ymm3,%ymm4,%ymm4
4581 vpalignr $12,%ymm12,%ymm12,%ymm12
4582 vpalignr $8,%ymm8,%ymm8,%ymm8
4583 vpalignr $4,%ymm4,%ymm4,%ymm4
4584 vpaddd %ymm4,%ymm0,%ymm0
4585 vpxor %ymm0,%ymm12,%ymm12
4586 vpshufb .rol16(%rip),%ymm12,%ymm12
4587 vpaddd %ymm12,%ymm8,%ymm8
4588 vpxor %ymm8,%ymm4,%ymm4
4589 vpsrld $20,%ymm4,%ymm3
4590 vpslld $12,%ymm4,%ymm4
4591 vpxor %ymm3,%ymm4,%ymm4
4592 vpaddd %ymm4,%ymm0,%ymm0
4593 vpxor %ymm0,%ymm12,%ymm12
4594 vpshufb .rol8(%rip),%ymm12,%ymm12
4595 vpaddd %ymm12,%ymm8,%ymm8
4596 vpxor %ymm8,%ymm4,%ymm4
4597 vpslld $7,%ymm4,%ymm3
4598 vpsrld $25,%ymm4,%ymm4
4599 vpxor %ymm3,%ymm4,%ymm4
4600 vpalignr $4,%ymm12,%ymm12,%ymm12
4601 vpalignr $8,%ymm8,%ymm8,%ymm8
4602 vpalignr $12,%ymm4,%ymm4,%ymm4
4603
# More whole blocks to hash -> back to the hash step; otherwise keep running
# rounds only until ten double rounds are complete.
4604 cmpq %rcx,%r8
4605 jb 1b
4606 cmpq $160,%r8
4607 jne 2b
4608 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4609 vpaddd 64(%rbp),%ymm4,%ymm4
4610 vpaddd 96(%rbp),%ymm8,%ymm8
4611 vpaddd 160(%rbp),%ymm12,%ymm12
# De-interleave the lanes: ymm0/ymm4 receive the low-lane block (rows 0,1 and
# rows 2,3), ymm8/ymm12 the high-lane block. ymm3 carries the high-lane
# rows 0,1 until ymm8 has been consumed.
4612 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
4613 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
4614 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
4615 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
4616 vmovdqa %ymm3,%ymm8
4617
4618 jmp open_avx2_tail_loop
# Tail of at most 256 bytes (larger counts branch to the next section).
# Two two-lane states are generated: state 1 (ymm1/5/9/13, counter row +2)
# covers the first 128 bytes and is XORed and stored here; state 0
# (ymm0/4/8/12, counter row +4) is handed, in output order, to
# open_avx2_tail_loop (outside this view).
# Hashing: rcx = min((rbx-128)/16, 10) blocks are absorbed one per double round
# inside the round loop; the remaining whole 16-byte blocks are absorbed by
# the loop that follows it. rbx is borrowed as the hash read pointer during the
# rounds, so the byte count is parked at 128(%rbp).
46193:
4620 cmpq $256,%rbx
4621 ja 3f
4622 vmovdqa .chacha20_consts(%rip),%ymm0
4623 vmovdqa 64(%rbp),%ymm4
4624 vmovdqa 96(%rbp),%ymm8
4625 vmovdqa %ymm0,%ymm1
4626 vmovdqa %ymm4,%ymm5
4627 vmovdqa %ymm8,%ymm9
4628 vmovdqa .avx2_inc(%rip),%ymm12
4629 vpaddd 160(%rbp),%ymm12,%ymm13
4630 vpaddd %ymm13,%ymm12,%ymm12
4631 vmovdqa %ymm12,160(%rbp)
4632 vmovdqa %ymm13,192(%rbp)
4633
4634 movq %rbx,128(%rbp)
4635 movq %rbx,%rcx
4636 subq $128,%rcx
4637 shrq $4,%rcx
# Cap the interleaved hash count at the number of double rounds (10).
4638 movq $10,%r8
4639 cmpq $10,%rcx
4640 cmovgq %r8,%rcx
4641 movq %rsi,%rbx
4642 xorq %r8,%r8
# Label 1: absorb the 16 bytes at rbx into the accumulator r10:r11:r12 and
# advance rbx. Label 2: one double round; r8 counts double rounds.
46431:
4644 addq 0(%rbx),%r10
4645 adcq 8+0(%rbx),%r11
4646 adcq $1,%r12
4647 movq 0+0(%rbp),%rdx
4648 movq %rdx,%r15
4649 mulxq %r10,%r13,%r14
4650 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004651 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004652 addq %rax,%r14
4653 adcq %rdx,%r15
4654 movq 8+0(%rbp),%rdx
4655 mulxq %r10,%r10,%rax
4656 addq %r10,%r14
4657 mulxq %r11,%r11,%r9
4658 adcq %r11,%r15
4659 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004660 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004661 addq %rax,%r15
4662 adcq %rdx,%r9
4663 movq %r13,%r10
4664 movq %r14,%r11
4665 movq %r15,%r12
4666 andq $3,%r12
4667 movq %r15,%r13
4668 andq $-4,%r13
4669 movq %r9,%r14
4670 shrdq $2,%r9,%r15
4671 shrq $2,%r9
4672 addq %r13,%r10
4673 adcq %r14,%r11
4674 adcq $0,%r12
4675 addq %r15,%r10
4676 adcq %r9,%r11
4677 adcq $0,%r12
4678
4679 leaq 16(%rbx),%rbx
46802:
4681 vpaddd %ymm4,%ymm0,%ymm0
4682 vpxor %ymm0,%ymm12,%ymm12
4683 vpshufb .rol16(%rip),%ymm12,%ymm12
4684 vpaddd %ymm12,%ymm8,%ymm8
4685 vpxor %ymm8,%ymm4,%ymm4
4686 vpsrld $20,%ymm4,%ymm3
4687 vpslld $12,%ymm4,%ymm4
4688 vpxor %ymm3,%ymm4,%ymm4
4689 vpaddd %ymm4,%ymm0,%ymm0
4690 vpxor %ymm0,%ymm12,%ymm12
4691 vpshufb .rol8(%rip),%ymm12,%ymm12
4692 vpaddd %ymm12,%ymm8,%ymm8
4693 vpxor %ymm8,%ymm4,%ymm4
4694 vpslld $7,%ymm4,%ymm3
4695 vpsrld $25,%ymm4,%ymm4
4696 vpxor %ymm3,%ymm4,%ymm4
4697 vpalignr $12,%ymm12,%ymm12,%ymm12
4698 vpalignr $8,%ymm8,%ymm8,%ymm8
4699 vpalignr $4,%ymm4,%ymm4,%ymm4
4700 vpaddd %ymm5,%ymm1,%ymm1
4701 vpxor %ymm1,%ymm13,%ymm13
4702 vpshufb .rol16(%rip),%ymm13,%ymm13
4703 vpaddd %ymm13,%ymm9,%ymm9
4704 vpxor %ymm9,%ymm5,%ymm5
4705 vpsrld $20,%ymm5,%ymm3
4706 vpslld $12,%ymm5,%ymm5
4707 vpxor %ymm3,%ymm5,%ymm5
4708 vpaddd %ymm5,%ymm1,%ymm1
4709 vpxor %ymm1,%ymm13,%ymm13
4710 vpshufb .rol8(%rip),%ymm13,%ymm13
4711 vpaddd %ymm13,%ymm9,%ymm9
4712 vpxor %ymm9,%ymm5,%ymm5
4713 vpslld $7,%ymm5,%ymm3
4714 vpsrld $25,%ymm5,%ymm5
4715 vpxor %ymm3,%ymm5,%ymm5
4716 vpalignr $12,%ymm13,%ymm13,%ymm13
4717 vpalignr $8,%ymm9,%ymm9,%ymm9
4718 vpalignr $4,%ymm5,%ymm5,%ymm5
4719
4720 incq %r8
4721 vpaddd %ymm4,%ymm0,%ymm0
4722 vpxor %ymm0,%ymm12,%ymm12
4723 vpshufb .rol16(%rip),%ymm12,%ymm12
4724 vpaddd %ymm12,%ymm8,%ymm8
4725 vpxor %ymm8,%ymm4,%ymm4
4726 vpsrld $20,%ymm4,%ymm3
4727 vpslld $12,%ymm4,%ymm4
4728 vpxor %ymm3,%ymm4,%ymm4
4729 vpaddd %ymm4,%ymm0,%ymm0
4730 vpxor %ymm0,%ymm12,%ymm12
4731 vpshufb .rol8(%rip),%ymm12,%ymm12
4732 vpaddd %ymm12,%ymm8,%ymm8
4733 vpxor %ymm8,%ymm4,%ymm4
4734 vpslld $7,%ymm4,%ymm3
4735 vpsrld $25,%ymm4,%ymm4
4736 vpxor %ymm3,%ymm4,%ymm4
4737 vpalignr $4,%ymm12,%ymm12,%ymm12
4738 vpalignr $8,%ymm8,%ymm8,%ymm8
4739 vpalignr $12,%ymm4,%ymm4,%ymm4
4740 vpaddd %ymm5,%ymm1,%ymm1
4741 vpxor %ymm1,%ymm13,%ymm13
4742 vpshufb .rol16(%rip),%ymm13,%ymm13
4743 vpaddd %ymm13,%ymm9,%ymm9
4744 vpxor %ymm9,%ymm5,%ymm5
4745 vpsrld $20,%ymm5,%ymm3
4746 vpslld $12,%ymm5,%ymm5
4747 vpxor %ymm3,%ymm5,%ymm5
4748 vpaddd %ymm5,%ymm1,%ymm1
4749 vpxor %ymm1,%ymm13,%ymm13
4750 vpshufb .rol8(%rip),%ymm13,%ymm13
4751 vpaddd %ymm13,%ymm9,%ymm9
4752 vpxor %ymm9,%ymm5,%ymm5
4753 vpslld $7,%ymm5,%ymm3
4754 vpsrld $25,%ymm5,%ymm5
4755 vpxor %ymm3,%ymm5,%ymm5
4756 vpalignr $4,%ymm13,%ymm13,%ymm13
4757 vpalignr $8,%ymm9,%ymm9,%ymm9
4758 vpalignr $12,%ymm5,%ymm5,%ymm5
# NOTE(review): the sequence below operates on ymm2/ymm6/ymm10/ymm14, which
# this section never initialises and never uses for output in the visible
# code. It looks like redundant work left by the generator (it only runs the
# second half of a double round). Confirm that open_avx2_tail_loop does not
# read these registers before changing the generator.
4759 vpaddd %ymm6,%ymm2,%ymm2
4760 vpxor %ymm2,%ymm14,%ymm14
4761 vpshufb .rol16(%rip),%ymm14,%ymm14
4762 vpaddd %ymm14,%ymm10,%ymm10
4763 vpxor %ymm10,%ymm6,%ymm6
4764 vpsrld $20,%ymm6,%ymm3
4765 vpslld $12,%ymm6,%ymm6
4766 vpxor %ymm3,%ymm6,%ymm6
4767 vpaddd %ymm6,%ymm2,%ymm2
4768 vpxor %ymm2,%ymm14,%ymm14
4769 vpshufb .rol8(%rip),%ymm14,%ymm14
4770 vpaddd %ymm14,%ymm10,%ymm10
4771 vpxor %ymm10,%ymm6,%ymm6
4772 vpslld $7,%ymm6,%ymm3
4773 vpsrld $25,%ymm6,%ymm6
4774 vpxor %ymm3,%ymm6,%ymm6
4775 vpalignr $4,%ymm14,%ymm14,%ymm14
4776 vpalignr $8,%ymm10,%ymm10,%ymm10
4777 vpalignr $12,%ymm6,%ymm6,%ymm6
4778
4779 cmpq %rcx,%r8
4780 jb 1b
4781 cmpq $10,%r8
4782 jne 2b
# Rounds finished. r8 = hash read pointer, rcx = bytes hashed so far
# (pointer minus rsi), rbx = byte count restored from 128(%rbp).
4783 movq %rbx,%r8
4784 subq %rsi,%rbx
4785 movq %rbx,%rcx
4786 movq 128(%rbp),%rbx
# Absorb every remaining whole 16-byte block: stop as soon as the next block
# would end past rbx.
47871:
4788 addq $16,%rcx
4789 cmpq %rbx,%rcx
4790 jg 1f
4791 addq 0(%r8),%r10
4792 adcq 8+0(%r8),%r11
4793 adcq $1,%r12
4794 movq 0+0(%rbp),%rdx
4795 movq %rdx,%r15
4796 mulxq %r10,%r13,%r14
4797 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004798 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004799 addq %rax,%r14
4800 adcq %rdx,%r15
4801 movq 8+0(%rbp),%rdx
4802 mulxq %r10,%r10,%rax
4803 addq %r10,%r14
4804 mulxq %r11,%r11,%r9
4805 adcq %r11,%r15
4806 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004807 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004808 addq %rax,%r15
4809 adcq %rdx,%r9
4810 movq %r13,%r10
4811 movq %r14,%r11
4812 movq %r15,%r12
4813 andq $3,%r12
4814 movq %r15,%r13
4815 andq $-4,%r13
4816 movq %r9,%r14
4817 shrdq $2,%r9,%r15
4818 shrq $2,%r9
4819 addq %r13,%r10
4820 adcq %r14,%r11
4821 adcq $0,%r12
4822 addq %r15,%r10
4823 adcq %r9,%r11
4824 adcq $0,%r12
4825
4826 leaq 16(%r8),%r8
4827 jmp 1b
48281:
# Add the saved initial rows to both states to obtain the keystream.
4829 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
4830 vpaddd 64(%rbp),%ymm5,%ymm5
4831 vpaddd 96(%rbp),%ymm9,%ymm9
4832 vpaddd 192(%rbp),%ymm13,%ymm13
4833 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
4834 vpaddd 64(%rbp),%ymm4,%ymm4
4835 vpaddd 96(%rbp),%ymm8,%ymm8
4836 vpaddd 160(%rbp),%ymm12,%ymm12
# State 1: de-interleave the lanes into output order, XOR with the first
# 128 input bytes and store them.
4837 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
4838 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
4839 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
4840 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
4841 vpxor 0+0(%rsi),%ymm3,%ymm3
4842 vpxor 32+0(%rsi),%ymm1,%ymm1
4843 vpxor 64+0(%rsi),%ymm5,%ymm5
4844 vpxor 96+0(%rsi),%ymm9,%ymm9
4845 vmovdqu %ymm3,0+0(%rdi)
4846 vmovdqu %ymm1,32+0(%rdi)
4847 vmovdqu %ymm5,64+0(%rdi)
4848 vmovdqu %ymm9,96+0(%rdi)
# State 0: leave its 128 keystream bytes in ymm0, ymm4, ymm8, ymm12 (in that
# order) for the code at open_avx2_tail_loop.
4849 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
4850 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
4851 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
4852 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
4853 vmovdqa %ymm3,%ymm8
4854
4855 leaq 128(%rsi),%rsi
4856 leaq 128(%rdi),%rdi
4857 subq $128,%rbx
4858 jmp open_avx2_tail_loop
48593:
4860 cmpq $384,%rbx
4861 ja 3f
4862 vmovdqa .chacha20_consts(%rip),%ymm0
4863 vmovdqa 64(%rbp),%ymm4
4864 vmovdqa 96(%rbp),%ymm8
4865 vmovdqa %ymm0,%ymm1
4866 vmovdqa %ymm4,%ymm5
4867 vmovdqa %ymm8,%ymm9
4868 vmovdqa %ymm0,%ymm2
4869 vmovdqa %ymm4,%ymm6
4870 vmovdqa %ymm8,%ymm10
4871 vmovdqa .avx2_inc(%rip),%ymm12
4872 vpaddd 160(%rbp),%ymm12,%ymm14
4873 vpaddd %ymm14,%ymm12,%ymm13
4874 vpaddd %ymm13,%ymm12,%ymm12
4875 vmovdqa %ymm12,160(%rbp)
4876 vmovdqa %ymm13,192(%rbp)
4877 vmovdqa %ymm14,224(%rbp)
4878
4879 movq %rbx,128(%rbp)
4880 movq %rbx,%rcx
4881 subq $256,%rcx
4882 shrq $4,%rcx
4883 addq $6,%rcx
4884 movq $10,%r8
4885 cmpq $10,%rcx
4886 cmovgq %r8,%rcx
4887 movq %rsi,%rbx
4888 xorq %r8,%r8
48891:
4890 addq 0(%rbx),%r10
4891 adcq 8+0(%rbx),%r11
4892 adcq $1,%r12
4893 movq 0+0(%rbp),%rdx
4894 movq %rdx,%r15
4895 mulxq %r10,%r13,%r14
4896 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08004897 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004898 addq %rax,%r14
4899 adcq %rdx,%r15
4900 movq 8+0(%rbp),%rdx
4901 mulxq %r10,%r10,%rax
4902 addq %r10,%r14
4903 mulxq %r11,%r11,%r9
4904 adcq %r11,%r15
4905 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08004906 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05004907 addq %rax,%r15
4908 adcq %rdx,%r9
4909 movq %r13,%r10
4910 movq %r14,%r11
4911 movq %r15,%r12
4912 andq $3,%r12
4913 movq %r15,%r13
4914 andq $-4,%r13
4915 movq %r9,%r14
4916 shrdq $2,%r9,%r15
4917 shrq $2,%r9
4918 addq %r13,%r10
4919 adcq %r14,%r11
4920 adcq $0,%r12
4921 addq %r15,%r10
4922 adcq %r9,%r11
4923 adcq $0,%r12
4924
4925 leaq 16(%rbx),%rbx
49262:
4927 vpaddd %ymm6,%ymm2,%ymm2
4928 vpxor %ymm2,%ymm14,%ymm14
4929 vpshufb .rol16(%rip),%ymm14,%ymm14
4930 vpaddd %ymm14,%ymm10,%ymm10
4931 vpxor %ymm10,%ymm6,%ymm6
4932 vpsrld $20,%ymm6,%ymm3
4933 vpslld $12,%ymm6,%ymm6
4934 vpxor %ymm3,%ymm6,%ymm6
4935 vpaddd %ymm6,%ymm2,%ymm2
4936 vpxor %ymm2,%ymm14,%ymm14
4937 vpshufb .rol8(%rip),%ymm14,%ymm14
4938 vpaddd %ymm14,%ymm10,%ymm10
4939 vpxor %ymm10,%ymm6,%ymm6
4940 vpslld $7,%ymm6,%ymm3
4941 vpsrld $25,%ymm6,%ymm6
4942 vpxor %ymm3,%ymm6,%ymm6
4943 vpalignr $12,%ymm14,%ymm14,%ymm14
4944 vpalignr $8,%ymm10,%ymm10,%ymm10
4945 vpalignr $4,%ymm6,%ymm6,%ymm6
4946 vpaddd %ymm5,%ymm1,%ymm1
4947 vpxor %ymm1,%ymm13,%ymm13
4948 vpshufb .rol16(%rip),%ymm13,%ymm13
4949 vpaddd %ymm13,%ymm9,%ymm9
4950 vpxor %ymm9,%ymm5,%ymm5
4951 vpsrld $20,%ymm5,%ymm3
4952 vpslld $12,%ymm5,%ymm5
4953 vpxor %ymm3,%ymm5,%ymm5
4954 vpaddd %ymm5,%ymm1,%ymm1
4955 vpxor %ymm1,%ymm13,%ymm13
4956 vpshufb .rol8(%rip),%ymm13,%ymm13
4957 vpaddd %ymm13,%ymm9,%ymm9
4958 vpxor %ymm9,%ymm5,%ymm5
4959 vpslld $7,%ymm5,%ymm3
4960 vpsrld $25,%ymm5,%ymm5
4961 vpxor %ymm3,%ymm5,%ymm5
4962 vpalignr $12,%ymm13,%ymm13,%ymm13
4963 vpalignr $8,%ymm9,%ymm9,%ymm9
4964 vpalignr $4,%ymm5,%ymm5,%ymm5
4965 vpaddd %ymm4,%ymm0,%ymm0
4966 vpxor %ymm0,%ymm12,%ymm12
4967 vpshufb .rol16(%rip),%ymm12,%ymm12
4968 vpaddd %ymm12,%ymm8,%ymm8
4969 vpxor %ymm8,%ymm4,%ymm4
4970 vpsrld $20,%ymm4,%ymm3
4971 vpslld $12,%ymm4,%ymm4
4972 vpxor %ymm3,%ymm4,%ymm4
4973 vpaddd %ymm4,%ymm0,%ymm0
4974 vpxor %ymm0,%ymm12,%ymm12
4975 vpshufb .rol8(%rip),%ymm12,%ymm12
4976 vpaddd %ymm12,%ymm8,%ymm8
4977 vpxor %ymm8,%ymm4,%ymm4
4978 vpslld $7,%ymm4,%ymm3
4979 vpsrld $25,%ymm4,%ymm4
4980 vpxor %ymm3,%ymm4,%ymm4
4981 vpalignr $12,%ymm12,%ymm12,%ymm12
4982 vpalignr $8,%ymm8,%ymm8,%ymm8
4983 vpalignr $4,%ymm4,%ymm4,%ymm4
4984 addq 0(%rbx),%r10
4985 adcq 8+0(%rbx),%r11
4986 adcq $1,%r12
4987 movq 0+0(%rbp),%rax
4988 movq %rax,%r15
4989 mulq %r10
4990 movq %rax,%r13
4991 movq %rdx,%r14
4992 movq 0+0(%rbp),%rax
4993 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08004994 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05004995 addq %rax,%r14
4996 adcq %rdx,%r15
4997 movq 8+0(%rbp),%rax
4998 movq %rax,%r9
4999 mulq %r10
5000 addq %rax,%r14
5001 adcq $0,%rdx
5002 movq %rdx,%r10
5003 movq 8+0(%rbp),%rax
5004 mulq %r11
5005 addq %rax,%r15
5006 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005007 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005008 addq %r10,%r15
5009 adcq %rdx,%r9
5010 movq %r13,%r10
5011 movq %r14,%r11
5012 movq %r15,%r12
5013 andq $3,%r12
5014 movq %r15,%r13
5015 andq $-4,%r13
5016 movq %r9,%r14
5017 shrdq $2,%r9,%r15
5018 shrq $2,%r9
5019 addq %r13,%r10
5020 adcq %r14,%r11
5021 adcq $0,%r12
5022 addq %r15,%r10
5023 adcq %r9,%r11
5024 adcq $0,%r12
5025
5026 leaq 16(%rbx),%rbx
5027 incq %r8
5028 vpaddd %ymm6,%ymm2,%ymm2
5029 vpxor %ymm2,%ymm14,%ymm14
5030 vpshufb .rol16(%rip),%ymm14,%ymm14
5031 vpaddd %ymm14,%ymm10,%ymm10
5032 vpxor %ymm10,%ymm6,%ymm6
5033 vpsrld $20,%ymm6,%ymm3
5034 vpslld $12,%ymm6,%ymm6
5035 vpxor %ymm3,%ymm6,%ymm6
5036 vpaddd %ymm6,%ymm2,%ymm2
5037 vpxor %ymm2,%ymm14,%ymm14
5038 vpshufb .rol8(%rip),%ymm14,%ymm14
5039 vpaddd %ymm14,%ymm10,%ymm10
5040 vpxor %ymm10,%ymm6,%ymm6
5041 vpslld $7,%ymm6,%ymm3
5042 vpsrld $25,%ymm6,%ymm6
5043 vpxor %ymm3,%ymm6,%ymm6
5044 vpalignr $4,%ymm14,%ymm14,%ymm14
5045 vpalignr $8,%ymm10,%ymm10,%ymm10
5046 vpalignr $12,%ymm6,%ymm6,%ymm6
5047 vpaddd %ymm5,%ymm1,%ymm1
5048 vpxor %ymm1,%ymm13,%ymm13
5049 vpshufb .rol16(%rip),%ymm13,%ymm13
5050 vpaddd %ymm13,%ymm9,%ymm9
5051 vpxor %ymm9,%ymm5,%ymm5
5052 vpsrld $20,%ymm5,%ymm3
5053 vpslld $12,%ymm5,%ymm5
5054 vpxor %ymm3,%ymm5,%ymm5
5055 vpaddd %ymm5,%ymm1,%ymm1
5056 vpxor %ymm1,%ymm13,%ymm13
5057 vpshufb .rol8(%rip),%ymm13,%ymm13
5058 vpaddd %ymm13,%ymm9,%ymm9
5059 vpxor %ymm9,%ymm5,%ymm5
5060 vpslld $7,%ymm5,%ymm3
5061 vpsrld $25,%ymm5,%ymm5
5062 vpxor %ymm3,%ymm5,%ymm5
5063 vpalignr $4,%ymm13,%ymm13,%ymm13
5064 vpalignr $8,%ymm9,%ymm9,%ymm9
5065 vpalignr $12,%ymm5,%ymm5,%ymm5
5066 vpaddd %ymm4,%ymm0,%ymm0
5067 vpxor %ymm0,%ymm12,%ymm12
5068 vpshufb .rol16(%rip),%ymm12,%ymm12
5069 vpaddd %ymm12,%ymm8,%ymm8
5070 vpxor %ymm8,%ymm4,%ymm4
5071 vpsrld $20,%ymm4,%ymm3
5072 vpslld $12,%ymm4,%ymm4
5073 vpxor %ymm3,%ymm4,%ymm4
5074 vpaddd %ymm4,%ymm0,%ymm0
5075 vpxor %ymm0,%ymm12,%ymm12
5076 vpshufb .rol8(%rip),%ymm12,%ymm12
5077 vpaddd %ymm12,%ymm8,%ymm8
5078 vpxor %ymm8,%ymm4,%ymm4
5079 vpslld $7,%ymm4,%ymm3
5080 vpsrld $25,%ymm4,%ymm4
5081 vpxor %ymm3,%ymm4,%ymm4
5082 vpalignr $4,%ymm12,%ymm12,%ymm12
5083 vpalignr $8,%ymm8,%ymm8,%ymm8
5084 vpalignr $12,%ymm4,%ymm4,%ymm4
5085
5086 cmpq %rcx,%r8
5087 jb 1b
5088 cmpq $10,%r8
5089 jne 2b
5090 movq %rbx,%r8
5091 subq %rsi,%rbx
5092 movq %rbx,%rcx
5093 movq 128(%rbp),%rbx
50941:
5095 addq $16,%rcx
5096 cmpq %rbx,%rcx
5097 jg 1f
5098 addq 0(%r8),%r10
5099 adcq 8+0(%r8),%r11
5100 adcq $1,%r12
5101 movq 0+0(%rbp),%rdx
5102 movq %rdx,%r15
5103 mulxq %r10,%r13,%r14
5104 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005105 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005106 addq %rax,%r14
5107 adcq %rdx,%r15
5108 movq 8+0(%rbp),%rdx
5109 mulxq %r10,%r10,%rax
5110 addq %r10,%r14
5111 mulxq %r11,%r11,%r9
5112 adcq %r11,%r15
5113 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005114 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005115 addq %rax,%r15
5116 adcq %rdx,%r9
5117 movq %r13,%r10
5118 movq %r14,%r11
5119 movq %r15,%r12
5120 andq $3,%r12
5121 movq %r15,%r13
5122 andq $-4,%r13
5123 movq %r9,%r14
5124 shrdq $2,%r9,%r15
5125 shrq $2,%r9
5126 addq %r13,%r10
5127 adcq %r14,%r11
5128 adcq $0,%r12
5129 addq %r15,%r10
5130 adcq %r9,%r11
5131 adcq $0,%r12
5132
5133 leaq 16(%r8),%r8
5134 jmp 1b
51351:
5136 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
5137 vpaddd 64(%rbp),%ymm6,%ymm6
5138 vpaddd 96(%rbp),%ymm10,%ymm10
5139 vpaddd 224(%rbp),%ymm14,%ymm14
5140 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
5141 vpaddd 64(%rbp),%ymm5,%ymm5
5142 vpaddd 96(%rbp),%ymm9,%ymm9
5143 vpaddd 192(%rbp),%ymm13,%ymm13
5144 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
5145 vpaddd 64(%rbp),%ymm4,%ymm4
5146 vpaddd 96(%rbp),%ymm8,%ymm8
5147 vpaddd 160(%rbp),%ymm12,%ymm12
5148 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
5149 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
5150 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
5151 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
5152 vpxor 0+0(%rsi),%ymm3,%ymm3
5153 vpxor 32+0(%rsi),%ymm2,%ymm2
5154 vpxor 64+0(%rsi),%ymm6,%ymm6
5155 vpxor 96+0(%rsi),%ymm10,%ymm10
5156 vmovdqu %ymm3,0+0(%rdi)
5157 vmovdqu %ymm2,32+0(%rdi)
5158 vmovdqu %ymm6,64+0(%rdi)
5159 vmovdqu %ymm10,96+0(%rdi)
5160 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
5161 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
5162 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
5163 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
5164 vpxor 0+128(%rsi),%ymm3,%ymm3
5165 vpxor 32+128(%rsi),%ymm1,%ymm1
5166 vpxor 64+128(%rsi),%ymm5,%ymm5
5167 vpxor 96+128(%rsi),%ymm9,%ymm9
5168 vmovdqu %ymm3,0+128(%rdi)
5169 vmovdqu %ymm1,32+128(%rdi)
5170 vmovdqu %ymm5,64+128(%rdi)
5171 vmovdqu %ymm9,96+128(%rdi)
5172 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
5173 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
5174 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
5175 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
5176 vmovdqa %ymm3,%ymm8
5177
5178 leaq 256(%rsi),%rsi
5179 leaq 256(%rdi),%rdi
5180 subq $256,%rbx
5181 jmp open_avx2_tail_loop
51823:
5183 vmovdqa .chacha20_consts(%rip),%ymm0
5184 vmovdqa 64(%rbp),%ymm4
5185 vmovdqa 96(%rbp),%ymm8
5186 vmovdqa %ymm0,%ymm1
5187 vmovdqa %ymm4,%ymm5
5188 vmovdqa %ymm8,%ymm9
5189 vmovdqa %ymm0,%ymm2
5190 vmovdqa %ymm4,%ymm6
5191 vmovdqa %ymm8,%ymm10
5192 vmovdqa %ymm0,%ymm3
5193 vmovdqa %ymm4,%ymm7
5194 vmovdqa %ymm8,%ymm11
5195 vmovdqa .avx2_inc(%rip),%ymm12
5196 vpaddd 160(%rbp),%ymm12,%ymm15
5197 vpaddd %ymm15,%ymm12,%ymm14
5198 vpaddd %ymm14,%ymm12,%ymm13
5199 vpaddd %ymm13,%ymm12,%ymm12
5200 vmovdqa %ymm15,256(%rbp)
5201 vmovdqa %ymm14,224(%rbp)
5202 vmovdqa %ymm13,192(%rbp)
5203 vmovdqa %ymm12,160(%rbp)
5204
5205 xorq %rcx,%rcx
5206 movq %rsi,%r8
52071:
5208 addq 0(%r8),%r10
5209 adcq 8+0(%r8),%r11
5210 adcq $1,%r12
5211 movq 0+0(%rbp),%rax
5212 movq %rax,%r15
5213 mulq %r10
5214 movq %rax,%r13
5215 movq %rdx,%r14
5216 movq 0+0(%rbp),%rax
5217 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005218 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005219 addq %rax,%r14
5220 adcq %rdx,%r15
5221 movq 8+0(%rbp),%rax
5222 movq %rax,%r9
5223 mulq %r10
5224 addq %rax,%r14
5225 adcq $0,%rdx
5226 movq %rdx,%r10
5227 movq 8+0(%rbp),%rax
5228 mulq %r11
5229 addq %rax,%r15
5230 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005231 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005232 addq %r10,%r15
5233 adcq %rdx,%r9
5234 movq %r13,%r10
5235 movq %r14,%r11
5236 movq %r15,%r12
5237 andq $3,%r12
5238 movq %r15,%r13
5239 andq $-4,%r13
5240 movq %r9,%r14
5241 shrdq $2,%r9,%r15
5242 shrq $2,%r9
5243 addq %r13,%r10
5244 adcq %r14,%r11
5245 adcq $0,%r12
5246 addq %r15,%r10
5247 adcq %r9,%r11
5248 adcq $0,%r12
5249
5250 leaq 16(%r8),%r8
52512:
5252 vmovdqa %ymm8,128(%rbp)
5253 vmovdqa .rol16(%rip),%ymm8
5254 vpaddd %ymm7,%ymm3,%ymm3
5255 vpaddd %ymm6,%ymm2,%ymm2
5256 vpaddd %ymm5,%ymm1,%ymm1
5257 vpaddd %ymm4,%ymm0,%ymm0
5258 vpxor %ymm3,%ymm15,%ymm15
5259 vpxor %ymm2,%ymm14,%ymm14
5260 vpxor %ymm1,%ymm13,%ymm13
5261 vpxor %ymm0,%ymm12,%ymm12
5262 vpshufb %ymm8,%ymm15,%ymm15
5263 vpshufb %ymm8,%ymm14,%ymm14
5264 vpshufb %ymm8,%ymm13,%ymm13
5265 vpshufb %ymm8,%ymm12,%ymm12
5266 vmovdqa 128(%rbp),%ymm8
5267 vpaddd %ymm15,%ymm11,%ymm11
5268 vpaddd %ymm14,%ymm10,%ymm10
5269 vpaddd %ymm13,%ymm9,%ymm9
5270 vpaddd %ymm12,%ymm8,%ymm8
5271 vpxor %ymm11,%ymm7,%ymm7
5272 vpxor %ymm10,%ymm6,%ymm6
5273 vpxor %ymm9,%ymm5,%ymm5
5274 vpxor %ymm8,%ymm4,%ymm4
5275 vmovdqa %ymm8,128(%rbp)
5276 vpsrld $20,%ymm7,%ymm8
5277 vpslld $32-20,%ymm7,%ymm7
5278 vpxor %ymm8,%ymm7,%ymm7
5279 vpsrld $20,%ymm6,%ymm8
5280 vpslld $32-20,%ymm6,%ymm6
5281 vpxor %ymm8,%ymm6,%ymm6
5282 vpsrld $20,%ymm5,%ymm8
5283 vpslld $32-20,%ymm5,%ymm5
5284 vpxor %ymm8,%ymm5,%ymm5
5285 vpsrld $20,%ymm4,%ymm8
5286 vpslld $32-20,%ymm4,%ymm4
5287 vpxor %ymm8,%ymm4,%ymm4
5288 vmovdqa .rol8(%rip),%ymm8
5289 addq 0(%r8),%r10
5290 adcq 8+0(%r8),%r11
5291 adcq $1,%r12
5292 movq 0+0(%rbp),%rdx
5293 movq %rdx,%r15
5294 mulxq %r10,%r13,%r14
5295 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005296 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005297 addq %rax,%r14
5298 adcq %rdx,%r15
5299 movq 8+0(%rbp),%rdx
5300 mulxq %r10,%r10,%rax
5301 addq %r10,%r14
5302 mulxq %r11,%r11,%r9
5303 adcq %r11,%r15
5304 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005305 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005306 addq %rax,%r15
5307 adcq %rdx,%r9
5308 movq %r13,%r10
5309 movq %r14,%r11
5310 movq %r15,%r12
5311 andq $3,%r12
5312 movq %r15,%r13
5313 andq $-4,%r13
5314 movq %r9,%r14
5315 shrdq $2,%r9,%r15
5316 shrq $2,%r9
5317 addq %r13,%r10
5318 adcq %r14,%r11
5319 adcq $0,%r12
5320 addq %r15,%r10
5321 adcq %r9,%r11
5322 adcq $0,%r12
5323 vpaddd %ymm7,%ymm3,%ymm3
5324 vpaddd %ymm6,%ymm2,%ymm2
5325 vpaddd %ymm5,%ymm1,%ymm1
5326 vpaddd %ymm4,%ymm0,%ymm0
5327 vpxor %ymm3,%ymm15,%ymm15
5328 vpxor %ymm2,%ymm14,%ymm14
5329 vpxor %ymm1,%ymm13,%ymm13
5330 vpxor %ymm0,%ymm12,%ymm12
5331 vpshufb %ymm8,%ymm15,%ymm15
5332 vpshufb %ymm8,%ymm14,%ymm14
5333 vpshufb %ymm8,%ymm13,%ymm13
5334 vpshufb %ymm8,%ymm12,%ymm12
5335 vmovdqa 128(%rbp),%ymm8
5336 vpaddd %ymm15,%ymm11,%ymm11
5337 vpaddd %ymm14,%ymm10,%ymm10
5338 vpaddd %ymm13,%ymm9,%ymm9
5339 vpaddd %ymm12,%ymm8,%ymm8
5340 vpxor %ymm11,%ymm7,%ymm7
5341 vpxor %ymm10,%ymm6,%ymm6
5342 vpxor %ymm9,%ymm5,%ymm5
5343 vpxor %ymm8,%ymm4,%ymm4
5344 vmovdqa %ymm8,128(%rbp)
5345 vpsrld $25,%ymm7,%ymm8
5346 vpslld $32-25,%ymm7,%ymm7
5347 vpxor %ymm8,%ymm7,%ymm7
5348 vpsrld $25,%ymm6,%ymm8
5349 vpslld $32-25,%ymm6,%ymm6
5350 vpxor %ymm8,%ymm6,%ymm6
5351 vpsrld $25,%ymm5,%ymm8
5352 vpslld $32-25,%ymm5,%ymm5
5353 vpxor %ymm8,%ymm5,%ymm5
5354 vpsrld $25,%ymm4,%ymm8
5355 vpslld $32-25,%ymm4,%ymm4
5356 vpxor %ymm8,%ymm4,%ymm4
5357 vmovdqa 128(%rbp),%ymm8
5358 vpalignr $4,%ymm7,%ymm7,%ymm7
5359 vpalignr $8,%ymm11,%ymm11,%ymm11
5360 vpalignr $12,%ymm15,%ymm15,%ymm15
5361 vpalignr $4,%ymm6,%ymm6,%ymm6
5362 vpalignr $8,%ymm10,%ymm10,%ymm10
5363 vpalignr $12,%ymm14,%ymm14,%ymm14
5364 vpalignr $4,%ymm5,%ymm5,%ymm5
5365 vpalignr $8,%ymm9,%ymm9,%ymm9
5366 vpalignr $12,%ymm13,%ymm13,%ymm13
5367 vpalignr $4,%ymm4,%ymm4,%ymm4
5368 vpalignr $8,%ymm8,%ymm8,%ymm8
5369 vpalignr $12,%ymm12,%ymm12,%ymm12
5370 vmovdqa %ymm8,128(%rbp)
5371 addq 16(%r8),%r10
5372 adcq 8+16(%r8),%r11
5373 adcq $1,%r12
5374 movq 0+0(%rbp),%rdx
5375 movq %rdx,%r15
5376 mulxq %r10,%r13,%r14
5377 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005378 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005379 addq %rax,%r14
5380 adcq %rdx,%r15
5381 movq 8+0(%rbp),%rdx
5382 mulxq %r10,%r10,%rax
5383 addq %r10,%r14
5384 mulxq %r11,%r11,%r9
5385 adcq %r11,%r15
5386 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005387 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005388 addq %rax,%r15
5389 adcq %rdx,%r9
5390 movq %r13,%r10
5391 movq %r14,%r11
5392 movq %r15,%r12
5393 andq $3,%r12
5394 movq %r15,%r13
5395 andq $-4,%r13
5396 movq %r9,%r14
5397 shrdq $2,%r9,%r15
5398 shrq $2,%r9
5399 addq %r13,%r10
5400 adcq %r14,%r11
5401 adcq $0,%r12
5402 addq %r15,%r10
5403 adcq %r9,%r11
5404 adcq $0,%r12
5405
5406 leaq 32(%r8),%r8
5407 vmovdqa .rol16(%rip),%ymm8
5408 vpaddd %ymm7,%ymm3,%ymm3
5409 vpaddd %ymm6,%ymm2,%ymm2
5410 vpaddd %ymm5,%ymm1,%ymm1
5411 vpaddd %ymm4,%ymm0,%ymm0
5412 vpxor %ymm3,%ymm15,%ymm15
5413 vpxor %ymm2,%ymm14,%ymm14
5414 vpxor %ymm1,%ymm13,%ymm13
5415 vpxor %ymm0,%ymm12,%ymm12
5416 vpshufb %ymm8,%ymm15,%ymm15
5417 vpshufb %ymm8,%ymm14,%ymm14
5418 vpshufb %ymm8,%ymm13,%ymm13
5419 vpshufb %ymm8,%ymm12,%ymm12
5420 vmovdqa 128(%rbp),%ymm8
5421 vpaddd %ymm15,%ymm11,%ymm11
5422 vpaddd %ymm14,%ymm10,%ymm10
5423 vpaddd %ymm13,%ymm9,%ymm9
5424 vpaddd %ymm12,%ymm8,%ymm8
5425 vpxor %ymm11,%ymm7,%ymm7
5426 vpxor %ymm10,%ymm6,%ymm6
5427 vpxor %ymm9,%ymm5,%ymm5
5428 vpxor %ymm8,%ymm4,%ymm4
5429 vmovdqa %ymm8,128(%rbp)
5430 vpsrld $20,%ymm7,%ymm8
5431 vpslld $32-20,%ymm7,%ymm7
5432 vpxor %ymm8,%ymm7,%ymm7
5433 vpsrld $20,%ymm6,%ymm8
5434 vpslld $32-20,%ymm6,%ymm6
5435 vpxor %ymm8,%ymm6,%ymm6
5436 vpsrld $20,%ymm5,%ymm8
5437 vpslld $32-20,%ymm5,%ymm5
5438 vpxor %ymm8,%ymm5,%ymm5
5439 vpsrld $20,%ymm4,%ymm8
5440 vpslld $32-20,%ymm4,%ymm4
5441 vpxor %ymm8,%ymm4,%ymm4
5442 vmovdqa .rol8(%rip),%ymm8
5443 vpaddd %ymm7,%ymm3,%ymm3
5444 vpaddd %ymm6,%ymm2,%ymm2
5445 vpaddd %ymm5,%ymm1,%ymm1
5446 vpaddd %ymm4,%ymm0,%ymm0
5447 vpxor %ymm3,%ymm15,%ymm15
5448 vpxor %ymm2,%ymm14,%ymm14
5449 vpxor %ymm1,%ymm13,%ymm13
5450 vpxor %ymm0,%ymm12,%ymm12
5451 vpshufb %ymm8,%ymm15,%ymm15
5452 vpshufb %ymm8,%ymm14,%ymm14
5453 vpshufb %ymm8,%ymm13,%ymm13
5454 vpshufb %ymm8,%ymm12,%ymm12
5455 vmovdqa 128(%rbp),%ymm8
5456 vpaddd %ymm15,%ymm11,%ymm11
5457 vpaddd %ymm14,%ymm10,%ymm10
5458 vpaddd %ymm13,%ymm9,%ymm9
5459 vpaddd %ymm12,%ymm8,%ymm8
5460 vpxor %ymm11,%ymm7,%ymm7
5461 vpxor %ymm10,%ymm6,%ymm6
5462 vpxor %ymm9,%ymm5,%ymm5
5463 vpxor %ymm8,%ymm4,%ymm4
5464 vmovdqa %ymm8,128(%rbp)
5465 vpsrld $25,%ymm7,%ymm8
5466 vpslld $32-25,%ymm7,%ymm7
5467 vpxor %ymm8,%ymm7,%ymm7
5468 vpsrld $25,%ymm6,%ymm8
5469 vpslld $32-25,%ymm6,%ymm6
5470 vpxor %ymm8,%ymm6,%ymm6
5471 vpsrld $25,%ymm5,%ymm8
5472 vpslld $32-25,%ymm5,%ymm5
5473 vpxor %ymm8,%ymm5,%ymm5
5474 vpsrld $25,%ymm4,%ymm8
5475 vpslld $32-25,%ymm4,%ymm4
5476 vpxor %ymm8,%ymm4,%ymm4
5477 vmovdqa 128(%rbp),%ymm8
5478 vpalignr $12,%ymm7,%ymm7,%ymm7
5479 vpalignr $8,%ymm11,%ymm11,%ymm11
5480 vpalignr $4,%ymm15,%ymm15,%ymm15
5481 vpalignr $12,%ymm6,%ymm6,%ymm6
5482 vpalignr $8,%ymm10,%ymm10,%ymm10
5483 vpalignr $4,%ymm14,%ymm14,%ymm14
5484 vpalignr $12,%ymm5,%ymm5,%ymm5
5485 vpalignr $8,%ymm9,%ymm9,%ymm9
5486 vpalignr $4,%ymm13,%ymm13,%ymm13
5487 vpalignr $12,%ymm4,%ymm4,%ymm4
5488 vpalignr $8,%ymm8,%ymm8,%ymm8
5489 vpalignr $4,%ymm12,%ymm12,%ymm12
5490
5491 incq %rcx
5492 cmpq $4,%rcx
5493 jl 1b
5494 cmpq $10,%rcx
5495 jne 2b
5496 movq %rbx,%rcx
5497 subq $384,%rcx
5498 andq $-16,%rcx
54991:
5500 testq %rcx,%rcx
5501 je 1f
5502 addq 0(%r8),%r10
5503 adcq 8+0(%r8),%r11
5504 adcq $1,%r12
5505 movq 0+0(%rbp),%rdx
5506 movq %rdx,%r15
5507 mulxq %r10,%r13,%r14
5508 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005509 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005510 addq %rax,%r14
5511 adcq %rdx,%r15
5512 movq 8+0(%rbp),%rdx
5513 mulxq %r10,%r10,%rax
5514 addq %r10,%r14
5515 mulxq %r11,%r11,%r9
5516 adcq %r11,%r15
5517 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08005518 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05005519 addq %rax,%r15
5520 adcq %rdx,%r9
5521 movq %r13,%r10
5522 movq %r14,%r11
5523 movq %r15,%r12
5524 andq $3,%r12
5525 movq %r15,%r13
5526 andq $-4,%r13
5527 movq %r9,%r14
5528 shrdq $2,%r9,%r15
5529 shrq $2,%r9
5530 addq %r13,%r10
5531 adcq %r14,%r11
5532 adcq $0,%r12
5533 addq %r15,%r10
5534 adcq %r9,%r11
5535 adcq $0,%r12
5536
5537 leaq 16(%r8),%r8
5538 subq $16,%rcx
5539 jmp 1b
55401:
5541 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
5542 vpaddd 64(%rbp),%ymm7,%ymm7
5543 vpaddd 96(%rbp),%ymm11,%ymm11
5544 vpaddd 256(%rbp),%ymm15,%ymm15
5545 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
5546 vpaddd 64(%rbp),%ymm6,%ymm6
5547 vpaddd 96(%rbp),%ymm10,%ymm10
5548 vpaddd 224(%rbp),%ymm14,%ymm14
5549 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
5550 vpaddd 64(%rbp),%ymm5,%ymm5
5551 vpaddd 96(%rbp),%ymm9,%ymm9
5552 vpaddd 192(%rbp),%ymm13,%ymm13
5553 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
5554 vpaddd 64(%rbp),%ymm4,%ymm4
5555 vpaddd 96(%rbp),%ymm8,%ymm8
5556 vpaddd 160(%rbp),%ymm12,%ymm12
5557
5558 vmovdqa %ymm0,128(%rbp)
5559 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
5560 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
5561 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
5562 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
5563 vpxor 0+0(%rsi),%ymm0,%ymm0
5564 vpxor 32+0(%rsi),%ymm3,%ymm3
5565 vpxor 64+0(%rsi),%ymm7,%ymm7
5566 vpxor 96+0(%rsi),%ymm11,%ymm11
5567 vmovdqu %ymm0,0+0(%rdi)
5568 vmovdqu %ymm3,32+0(%rdi)
5569 vmovdqu %ymm7,64+0(%rdi)
5570 vmovdqu %ymm11,96+0(%rdi)
5571
5572 vmovdqa 128(%rbp),%ymm0
5573 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
5574 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
5575 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
5576 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
5577 vpxor 0+128(%rsi),%ymm3,%ymm3
5578 vpxor 32+128(%rsi),%ymm2,%ymm2
5579 vpxor 64+128(%rsi),%ymm6,%ymm6
5580 vpxor 96+128(%rsi),%ymm10,%ymm10
5581 vmovdqu %ymm3,0+128(%rdi)
5582 vmovdqu %ymm2,32+128(%rdi)
5583 vmovdqu %ymm6,64+128(%rdi)
5584 vmovdqu %ymm10,96+128(%rdi)
5585 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
5586 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
5587 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
5588 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
5589 vpxor 0+256(%rsi),%ymm3,%ymm3
5590 vpxor 32+256(%rsi),%ymm1,%ymm1
5591 vpxor 64+256(%rsi),%ymm5,%ymm5
5592 vpxor 96+256(%rsi),%ymm9,%ymm9
5593 vmovdqu %ymm3,0+256(%rdi)
5594 vmovdqu %ymm1,32+256(%rdi)
5595 vmovdqu %ymm5,64+256(%rdi)
5596 vmovdqu %ymm9,96+256(%rdi)
5597 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
5598 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
5599 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
5600 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
5601 vmovdqa %ymm3,%ymm8
5602
5603 leaq 384(%rsi),%rsi
5604 leaq 384(%rdi),%rdi
5605 subq $384,%rbx
/*
 * open_avx2_tail_loop: XOR the input with already-computed 32-byte chunks
 * held in registers, 32 bytes per iteration.
 *
 * Registers as used here:
 *   rbx  - byte count; decremented by 32 each pass, loop exits when < 32
 *   rsi  - source pointer (read), advanced by 32
 *   rdi  - destination pointer (written), advanced by 32
 *   ymm0 - chunk consumed this pass; ymm4, ymm8, ymm12 are the following
 *          chunks and are shifted down one slot after each pass
 * NOTE(review): the chunks are presumably ChaCha20 keystream prepared by
 * the code that jumps/falls into this label - confirm against the callers.
 */
5606open_avx2_tail_loop:
5607 cmpq $32,%rbx
/* Fewer than 32 bytes left: hand over to the 16-byte step. */
5608 jb open_avx2_tail
5609 subq $32,%rbx
5610 vpxor (%rsi),%ymm0,%ymm0
5611 vmovdqu %ymm0,(%rdi)
5612 leaq 32(%rsi),%rsi
5613 leaq 32(%rdi),%rdi
/* Shift the register queue: ymm4 -> ymm0, ymm8 -> ymm4, ymm12 -> ymm8. */
5614 vmovdqa %ymm4,%ymm0
5615 vmovdqa %ymm8,%ymm4
5616 vmovdqa %ymm12,%ymm8
5617 jmp open_avx2_tail_loop
/*
 * open_avx2_tail: handle at most one more 16-byte piece using the low lane
 * of ymm0, then leave AVX2 code and continue at open_sse_tail_16.
 *
 * On exit xmm1 holds the next unused 16 bytes of ymm0: the low lane when
 * rbx < 16 on entry, otherwise the high lane.
 * NOTE(review): open_sse_tail_16 is defined outside this section; it
 * presumably finishes the last (< 16) bytes using xmm1 - confirm there.
 */
5618open_avx2_tail:
5619 cmpq $16,%rbx
/* Placed between cmpq and jb; this register move does not alter the flags
   that jb tests. */
5620 vmovdqa %xmm0,%xmm1
5621 jb 1f
5622 subq $16,%rbx
5623
5624 vpxor (%rsi),%xmm0,%xmm1
5625 vmovdqu %xmm1,(%rdi)
5626 leaq 16(%rsi),%rsi
5627 leaq 16(%rdi),%rdi
/* Selector 0x11 copies the high 128-bit lane of ymm0 into both lanes, so
   the following move puts the former high lane into xmm1. */
5628 vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
5629 vmovdqa %xmm0,%xmm1
56301:
/* Clear the upper ymm halves before continuing in the SSE tail code. */
5631 vzeroupper
5632 jmp open_sse_tail_16
5633
/*
 * open_avx2_192: generate 2 x 2 ChaCha20 blocks (two ymm register sets,
 * each ymm holding the same row of two blocks in its two 128-bit lanes),
 * derive the Poly1305 key from the first 32 bytes, and leave 192 bytes of
 * output queued in registers. Falls through into open_avx2_short.
 *
 * Entry state (as read by this code): ymm0, ymm4, ymm8, ymm12 hold the
 * four state rows. NOTE(review): the caller is outside this section;
 * presumably ymm0 = constants, ymm4/ymm8 = key, ymm12 = counter/nonce -
 * confirm at the jump site.
 *
 * Register use:
 *   ymm0/4/8/12  - working state, first pair of blocks
 *   ymm1/5/9/13  - working state, second pair (row 3 = ymm12 + .avx2_inc)
 *   ymm2/6/10    - saved copies of rows 0-2 for the final feed-forward
 *   ymm11, ymm15 - saved row 3 of the first / second pair
 *   ymm3         - scratch for the shift-based rotates
 *   r10          - round-loop counter (10 iterations)
 */
5634open_avx2_192:
5635 vmovdqa %ymm0,%ymm1
5636 vmovdqa %ymm0,%ymm2
5637 vmovdqa %ymm4,%ymm5
5638 vmovdqa %ymm4,%ymm6
5639 vmovdqa %ymm8,%ymm9
5640 vmovdqa %ymm8,%ymm10
5641 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
5642 vmovdqa %ymm12,%ymm11
5643 vmovdqa %ymm13,%ymm15
5644 movq $10,%r10
/* Each iteration is one double round: a quarter-round step on each set,
   lane rotation via vpalignr (12/8/4), a second quarter-round step, then
   the inverse lane rotation (4/8/12). Rotate-by-16 and rotate-by-8 use the
   .rol16/.rol8 byte-shuffle tables; 12 and 7 use shift pairs through ymm3. */
56451:
/* First half, set 0 (ymm0/4/8/12). */
5646 vpaddd %ymm4,%ymm0,%ymm0
5647 vpxor %ymm0,%ymm12,%ymm12
5648 vpshufb .rol16(%rip),%ymm12,%ymm12
5649 vpaddd %ymm12,%ymm8,%ymm8
5650 vpxor %ymm8,%ymm4,%ymm4
5651 vpsrld $20,%ymm4,%ymm3
5652 vpslld $12,%ymm4,%ymm4
5653 vpxor %ymm3,%ymm4,%ymm4
5654 vpaddd %ymm4,%ymm0,%ymm0
5655 vpxor %ymm0,%ymm12,%ymm12
5656 vpshufb .rol8(%rip),%ymm12,%ymm12
5657 vpaddd %ymm12,%ymm8,%ymm8
5658 vpxor %ymm8,%ymm4,%ymm4
5659 vpslld $7,%ymm4,%ymm3
5660 vpsrld $25,%ymm4,%ymm4
5661 vpxor %ymm3,%ymm4,%ymm4
5662 vpalignr $12,%ymm12,%ymm12,%ymm12
5663 vpalignr $8,%ymm8,%ymm8,%ymm8
5664 vpalignr $4,%ymm4,%ymm4,%ymm4
/* First half, set 1 (ymm1/5/9/13). */
5665 vpaddd %ymm5,%ymm1,%ymm1
5666 vpxor %ymm1,%ymm13,%ymm13
5667 vpshufb .rol16(%rip),%ymm13,%ymm13
5668 vpaddd %ymm13,%ymm9,%ymm9
5669 vpxor %ymm9,%ymm5,%ymm5
5670 vpsrld $20,%ymm5,%ymm3
5671 vpslld $12,%ymm5,%ymm5
5672 vpxor %ymm3,%ymm5,%ymm5
5673 vpaddd %ymm5,%ymm1,%ymm1
5674 vpxor %ymm1,%ymm13,%ymm13
5675 vpshufb .rol8(%rip),%ymm13,%ymm13
5676 vpaddd %ymm13,%ymm9,%ymm9
5677 vpxor %ymm9,%ymm5,%ymm5
5678 vpslld $7,%ymm5,%ymm3
5679 vpsrld $25,%ymm5,%ymm5
5680 vpxor %ymm3,%ymm5,%ymm5
5681 vpalignr $12,%ymm13,%ymm13,%ymm13
5682 vpalignr $8,%ymm9,%ymm9,%ymm9
5683 vpalignr $4,%ymm5,%ymm5,%ymm5
/* Second half, set 0; the closing vpalignr triple undoes the rotation. */
5684 vpaddd %ymm4,%ymm0,%ymm0
5685 vpxor %ymm0,%ymm12,%ymm12
5686 vpshufb .rol16(%rip),%ymm12,%ymm12
5687 vpaddd %ymm12,%ymm8,%ymm8
5688 vpxor %ymm8,%ymm4,%ymm4
5689 vpsrld $20,%ymm4,%ymm3
5690 vpslld $12,%ymm4,%ymm4
5691 vpxor %ymm3,%ymm4,%ymm4
5692 vpaddd %ymm4,%ymm0,%ymm0
5693 vpxor %ymm0,%ymm12,%ymm12
5694 vpshufb .rol8(%rip),%ymm12,%ymm12
5695 vpaddd %ymm12,%ymm8,%ymm8
5696 vpxor %ymm8,%ymm4,%ymm4
5697 vpslld $7,%ymm4,%ymm3
5698 vpsrld $25,%ymm4,%ymm4
5699 vpxor %ymm3,%ymm4,%ymm4
5700 vpalignr $4,%ymm12,%ymm12,%ymm12
5701 vpalignr $8,%ymm8,%ymm8,%ymm8
5702 vpalignr $12,%ymm4,%ymm4,%ymm4
/* Second half, set 1. */
5703 vpaddd %ymm5,%ymm1,%ymm1
5704 vpxor %ymm1,%ymm13,%ymm13
5705 vpshufb .rol16(%rip),%ymm13,%ymm13
5706 vpaddd %ymm13,%ymm9,%ymm9
5707 vpxor %ymm9,%ymm5,%ymm5
5708 vpsrld $20,%ymm5,%ymm3
5709 vpslld $12,%ymm5,%ymm5
5710 vpxor %ymm3,%ymm5,%ymm5
5711 vpaddd %ymm5,%ymm1,%ymm1
5712 vpxor %ymm1,%ymm13,%ymm13
5713 vpshufb .rol8(%rip),%ymm13,%ymm13
5714 vpaddd %ymm13,%ymm9,%ymm9
5715 vpxor %ymm9,%ymm5,%ymm5
5716 vpslld $7,%ymm5,%ymm3
5717 vpsrld $25,%ymm5,%ymm5
5718 vpxor %ymm3,%ymm5,%ymm5
5719 vpalignr $4,%ymm13,%ymm13,%ymm13
5720 vpalignr $8,%ymm9,%ymm9,%ymm9
5721 vpalignr $12,%ymm5,%ymm5,%ymm5
5722
5723 decq %r10
5724 jne 1b
/* Feed-forward: add the saved input rows back into both working sets. */
5725 vpaddd %ymm2,%ymm0,%ymm0
5726 vpaddd %ymm2,%ymm1,%ymm1
5727 vpaddd %ymm6,%ymm4,%ymm4
5728 vpaddd %ymm6,%ymm5,%ymm5
5729 vpaddd %ymm10,%ymm8,%ymm8
5730 vpaddd %ymm10,%ymm9,%ymm9
5731 vpaddd %ymm11,%ymm12,%ymm12
5732 vpaddd %ymm15,%ymm13,%ymm13
/* ymm3 = low lanes of rows 0 and 1 of set 0, i.e. the first 32 output
   bytes of the first block. */
5733 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
5734
/* Mask with .clamp (clamps the low 16 bytes, keeps the high 16 bytes) and
   store at 0(%rbp), where the Poly1305 multiply steps later read their
   multiplier from. */
5735 vpand .clamp(%rip),%ymm3,%ymm3
5736 vmovdqa %ymm3,0(%rbp)
5737
/* Regroup lanes into contiguous 32-byte chunks, in the order the consumer
   loop drains them: ymm0, ymm4, ymm8, ymm12, ymm1, ymm5 (6 x 32 = 192).
   The low lanes of rows 2 and 3 of set 0 (rest of the first block) are
   not carried forward. */
5738 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
5739 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
5740 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
5741 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
5742 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
5743 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
/*
 * open_avx2_short: entry for the short-input paths (reached by fall-through
 * from open_avx2_192 and by jump from open_avx2_320). Calls
 * poly_hash_ad_internal, then runs the hash-and-xor loop below.
 * NOTE(review): poly_hash_ad_internal is defined outside this section;
 * presumably it absorbs the additional data into the Poly1305 accumulator
 * and takes its length in r8 - confirm at its definition.
 */
5744open_avx2_short:
/* Self-move: leaves r8 and the flags unchanged. */
5745 movq %r8,%r8
5746 call poly_hash_ad_internal
/*
 * open_avx2_hash_and_xor_loop: per iteration, absorb 32 input bytes (two
 * 16-byte Poly1305 blocks) into the accumulator and then XOR those same 32
 * bytes with the chunk in ymm0, writing the result to (rdi).
 *
 *   rbx           - byte count, decremented by 32; exits when < 32
 *   rsi / rdi     - source / destination pointers, advanced by 32
 *   r10:r11:r12   - accumulator limbs (low, middle, top)
 *   0(%rbp), 8(%rbp) - the two 64-bit halves of the multiplier
 *   r13,r14,r15,r9,rax,rdx - product / scratch
 * The input is hashed before the output store, so the bytes read for the
 * hash are the original source bytes.
 */
5747open_avx2_hash_and_xor_loop:
5748 cmpq $32,%rbx
5749 jb open_avx2_short_tail_32
5750 subq $32,%rbx
/* Block 1: acc += 16 bytes at 0(%rsi), plus 1 in the top limb (2^128). */
5751 addq 0(%rsi),%r10
5752 adcq 8+0(%rsi),%r11
5753 adcq $1,%r12
/* acc *= multiplier; partial products are gathered in r13,r14,r15,r9. */
5754 movq 0+0(%rbp),%rax
5755 movq %rax,%r15
5756 mulq %r10
5757 movq %rax,%r13
5758 movq %rdx,%r14
5759 movq 0+0(%rbp),%rax
5760 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005761 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005762 addq %rax,%r14
5763 adcq %rdx,%r15
5764 movq 8+0(%rbp),%rax
5765 movq %rax,%r9
5766 mulq %r10
5767 addq %rax,%r14
5768 adcq $0,%rdx
5769 movq %rdx,%r10
5770 movq 8+0(%rbp),%rax
5771 mulq %r11
5772 addq %rax,%r15
5773 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005774 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005775 addq %r10,%r15
5776 adcq %rdx,%r9
/* Partial reduction: keep the low 130 bits in r10:r11:r12 and fold the
   bits above 2^130 back in as 4*hi + hi (r15 & -4 : r9, then r9:r15 >> 2),
   i.e. multiplied by 5. */
5777 movq %r13,%r10
5778 movq %r14,%r11
5779 movq %r15,%r12
5780 andq $3,%r12
5781 movq %r15,%r13
5782 andq $-4,%r13
5783 movq %r9,%r14
5784 shrdq $2,%r9,%r15
5785 shrq $2,%r9
5786 addq %r13,%r10
5787 adcq %r14,%r11
5788 adcq $0,%r12
5789 addq %r15,%r10
5790 adcq %r9,%r11
5791 adcq $0,%r12
/* Block 2: same add / multiply / reduce sequence for 16(%rsi). */
5792 addq 16(%rsi),%r10
5793 adcq 8+16(%rsi),%r11
5794 adcq $1,%r12
5795 movq 0+0(%rbp),%rax
5796 movq %rax,%r15
5797 mulq %r10
5798 movq %rax,%r13
5799 movq %rdx,%r14
5800 movq 0+0(%rbp),%rax
5801 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005802 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005803 addq %rax,%r14
5804 adcq %rdx,%r15
5805 movq 8+0(%rbp),%rax
5806 movq %rax,%r9
5807 mulq %r10
5808 addq %rax,%r14
5809 adcq $0,%rdx
5810 movq %rdx,%r10
5811 movq 8+0(%rbp),%rax
5812 mulq %r11
5813 addq %rax,%r15
5814 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005815 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005816 addq %r10,%r15
5817 adcq %rdx,%r9
5818 movq %r13,%r10
5819 movq %r14,%r11
5820 movq %r15,%r12
5821 andq $3,%r12
5822 movq %r15,%r13
5823 andq $-4,%r13
5824 movq %r9,%r14
5825 shrdq $2,%r9,%r15
5826 shrq $2,%r9
5827 addq %r13,%r10
5828 adcq %r14,%r11
5829 adcq $0,%r12
5830 addq %r15,%r10
5831 adcq %r9,%r11
5832 adcq $0,%r12
5833
5834
/* XOR the 32 bytes just hashed with the current chunk and store them. */
5835 vpxor (%rsi),%ymm0,%ymm0
5836 vmovdqu %ymm0,(%rdi)
5837 leaq 32(%rsi),%rsi
5838 leaq 32(%rdi),%rdi
5839
/* Shift the chunk queue down one slot:
   ymm0 <- ymm4 <- ymm8 <- ymm12 <- ymm1 <- ymm5 <- ymm9 <- ymm13 <- ymm2 <- ymm6. */
5840 vmovdqa %ymm4,%ymm0
5841 vmovdqa %ymm8,%ymm4
5842 vmovdqa %ymm12,%ymm8
5843 vmovdqa %ymm1,%ymm12
5844 vmovdqa %ymm5,%ymm1
5845 vmovdqa %ymm9,%ymm5
5846 vmovdqa %ymm13,%ymm9
5847 vmovdqa %ymm2,%ymm13
5848 vmovdqa %ymm6,%ymm2
5849 jmp open_avx2_hash_and_xor_loop
/*
 * open_avx2_short_tail_32: reached from open_avx2_hash_and_xor_loop when
 * fewer than 32 bytes remain. If at least 16 remain, absorb one 16-byte
 * Poly1305 block from (rsi), XOR it with the low lane of ymm0 and store it;
 * then leave AVX2 code and continue at open_sse_tail_16.
 *
 * On exit xmm1 holds the next unused 16 bytes of ymm0: the low lane when
 * rbx < 16 on entry, otherwise the high lane.
 * NOTE(review): open_sse_tail_16 is defined outside this section;
 * presumably it processes the final (< 16) bytes with xmm1 - confirm there.
 */
5850open_avx2_short_tail_32:
5851 cmpq $16,%rbx
/* Placed between cmpq and jb; this register move does not alter the flags
   that jb tests. */
5852 vmovdqa %xmm0,%xmm1
5853 jb 1f
5854 subq $16,%rbx
/* acc (r10:r11:r12) += 16 input bytes, plus 1 in the top limb (2^128). */
5855 addq 0(%rsi),%r10
5856 adcq 8+0(%rsi),%r11
5857 adcq $1,%r12
/* acc *= multiplier stored at 0(%rbp)/8(%rbp); product in r13,r14,r15,r9. */
5858 movq 0+0(%rbp),%rax
5859 movq %rax,%r15
5860 mulq %r10
5861 movq %rax,%r13
5862 movq %rdx,%r14
5863 movq 0+0(%rbp),%rax
5864 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08005865 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05005866 addq %rax,%r14
5867 adcq %rdx,%r15
5868 movq 8+0(%rbp),%rax
5869 movq %rax,%r9
5870 mulq %r10
5871 addq %rax,%r14
5872 adcq $0,%rdx
5873 movq %rdx,%r10
5874 movq 8+0(%rbp),%rax
5875 mulq %r11
5876 addq %rax,%r15
5877 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08005878 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05005879 addq %r10,%r15
5880 adcq %rdx,%r9
/* Partial reduction: keep the low 130 bits, fold the bits above 2^130 back
   in as 4*hi + hi. */
5881 movq %r13,%r10
5882 movq %r14,%r11
5883 movq %r15,%r12
5884 andq $3,%r12
5885 movq %r15,%r13
5886 andq $-4,%r13
5887 movq %r9,%r14
5888 shrdq $2,%r9,%r15
5889 shrq $2,%r9
5890 addq %r13,%r10
5891 adcq %r14,%r11
5892 adcq $0,%r12
5893 addq %r15,%r10
5894 adcq %r9,%r11
5895 adcq $0,%r12
5896
/* XOR via xmm3 so ymm0 stays intact for the lane extraction below. */
5897 vpxor (%rsi),%xmm0,%xmm3
5898 vmovdqu %xmm3,(%rdi)
5899 leaq 16(%rsi),%rsi
5900 leaq 16(%rdi),%rdi
/* xmm1 = high 128-bit lane of ymm0. */
5901 vextracti128 $1,%ymm0,%xmm1
59021:
/* Clear the upper ymm halves before continuing in the SSE tail code. */
5903 vzeroupper
5904 jmp open_sse_tail_16
5905
/*
 * open_avx2_320: generate 3 x 2 ChaCha20 blocks (three ymm register sets,
 * each ymm holding the same row of two blocks in its two 128-bit lanes),
 * derive the Poly1305 key from the first 32 bytes, leave 320 bytes of
 * output queued in registers, and jump to open_avx2_short.
 *
 * Entry state (as read by this code): ymm0, ymm4, ymm8, ymm12 hold the
 * four state rows. NOTE(review): the final feed-forward adds
 * .chacha20_consts to row 0, so ymm0 is expected to equal
 * .chacha20_consts on entry - confirm at the jump site.
 *
 * Register / stack use:
 *   ymm0/4/8/12, ymm1/5/9/13, ymm2/6/10/14 - the three working sets
 *   ymm7, ymm11  - saved copies of rows 1 and 2 for the feed-forward
 *   160/192/224(%rbp) - saved row 3 of sets 0/1/2 (each + .avx2_inc)
 *   ymm3         - scratch for the shift-based rotates
 *   r10          - round-loop counter (10 iterations)
 */
5906open_avx2_320:
5907 vmovdqa %ymm0,%ymm1
5908 vmovdqa %ymm0,%ymm2
5909 vmovdqa %ymm4,%ymm5
5910 vmovdqa %ymm4,%ymm6
5911 vmovdqa %ymm8,%ymm9
5912 vmovdqa %ymm8,%ymm10
5913 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
5914 vpaddd .avx2_inc(%rip),%ymm13,%ymm14
5915 vmovdqa %ymm4,%ymm7
5916 vmovdqa %ymm8,%ymm11
5917 vmovdqa %ymm12,160(%rbp)
5918 vmovdqa %ymm13,192(%rbp)
5919 vmovdqa %ymm14,224(%rbp)
5920 movq $10,%r10
/* Each iteration is one double round over all three sets: quarter-round
   step + vpalignr 12/8/4 lane rotation per set, then a second quarter-round
   step + the inverse 4/8/12 rotation per set. */
59211:
/* First half, set 0 (ymm0/4/8/12). */
5922 vpaddd %ymm4,%ymm0,%ymm0
5923 vpxor %ymm0,%ymm12,%ymm12
5924 vpshufb .rol16(%rip),%ymm12,%ymm12
5925 vpaddd %ymm12,%ymm8,%ymm8
5926 vpxor %ymm8,%ymm4,%ymm4
5927 vpsrld $20,%ymm4,%ymm3
5928 vpslld $12,%ymm4,%ymm4
5929 vpxor %ymm3,%ymm4,%ymm4
5930 vpaddd %ymm4,%ymm0,%ymm0
5931 vpxor %ymm0,%ymm12,%ymm12
5932 vpshufb .rol8(%rip),%ymm12,%ymm12
5933 vpaddd %ymm12,%ymm8,%ymm8
5934 vpxor %ymm8,%ymm4,%ymm4
5935 vpslld $7,%ymm4,%ymm3
5936 vpsrld $25,%ymm4,%ymm4
5937 vpxor %ymm3,%ymm4,%ymm4
5938 vpalignr $12,%ymm12,%ymm12,%ymm12
5939 vpalignr $8,%ymm8,%ymm8,%ymm8
5940 vpalignr $4,%ymm4,%ymm4,%ymm4
/* First half, set 1 (ymm1/5/9/13). */
5941 vpaddd %ymm5,%ymm1,%ymm1
5942 vpxor %ymm1,%ymm13,%ymm13
5943 vpshufb .rol16(%rip),%ymm13,%ymm13
5944 vpaddd %ymm13,%ymm9,%ymm9
5945 vpxor %ymm9,%ymm5,%ymm5
5946 vpsrld $20,%ymm5,%ymm3
5947 vpslld $12,%ymm5,%ymm5
5948 vpxor %ymm3,%ymm5,%ymm5
5949 vpaddd %ymm5,%ymm1,%ymm1
5950 vpxor %ymm1,%ymm13,%ymm13
5951 vpshufb .rol8(%rip),%ymm13,%ymm13
5952 vpaddd %ymm13,%ymm9,%ymm9
5953 vpxor %ymm9,%ymm5,%ymm5
5954 vpslld $7,%ymm5,%ymm3
5955 vpsrld $25,%ymm5,%ymm5
5956 vpxor %ymm3,%ymm5,%ymm5
5957 vpalignr $12,%ymm13,%ymm13,%ymm13
5958 vpalignr $8,%ymm9,%ymm9,%ymm9
5959 vpalignr $4,%ymm5,%ymm5,%ymm5
/* First half, set 2 (ymm2/6/10/14). */
5960 vpaddd %ymm6,%ymm2,%ymm2
5961 vpxor %ymm2,%ymm14,%ymm14
5962 vpshufb .rol16(%rip),%ymm14,%ymm14
5963 vpaddd %ymm14,%ymm10,%ymm10
5964 vpxor %ymm10,%ymm6,%ymm6
5965 vpsrld $20,%ymm6,%ymm3
5966 vpslld $12,%ymm6,%ymm6
5967 vpxor %ymm3,%ymm6,%ymm6
5968 vpaddd %ymm6,%ymm2,%ymm2
5969 vpxor %ymm2,%ymm14,%ymm14
5970 vpshufb .rol8(%rip),%ymm14,%ymm14
5971 vpaddd %ymm14,%ymm10,%ymm10
5972 vpxor %ymm10,%ymm6,%ymm6
5973 vpslld $7,%ymm6,%ymm3
5974 vpsrld $25,%ymm6,%ymm6
5975 vpxor %ymm3,%ymm6,%ymm6
5976 vpalignr $12,%ymm14,%ymm14,%ymm14
5977 vpalignr $8,%ymm10,%ymm10,%ymm10
5978 vpalignr $4,%ymm6,%ymm6,%ymm6
/* Second half, set 0; the closing vpalignr triple undoes the rotation. */
5979 vpaddd %ymm4,%ymm0,%ymm0
5980 vpxor %ymm0,%ymm12,%ymm12
5981 vpshufb .rol16(%rip),%ymm12,%ymm12
5982 vpaddd %ymm12,%ymm8,%ymm8
5983 vpxor %ymm8,%ymm4,%ymm4
5984 vpsrld $20,%ymm4,%ymm3
5985 vpslld $12,%ymm4,%ymm4
5986 vpxor %ymm3,%ymm4,%ymm4
5987 vpaddd %ymm4,%ymm0,%ymm0
5988 vpxor %ymm0,%ymm12,%ymm12
5989 vpshufb .rol8(%rip),%ymm12,%ymm12
5990 vpaddd %ymm12,%ymm8,%ymm8
5991 vpxor %ymm8,%ymm4,%ymm4
5992 vpslld $7,%ymm4,%ymm3
5993 vpsrld $25,%ymm4,%ymm4
5994 vpxor %ymm3,%ymm4,%ymm4
5995 vpalignr $4,%ymm12,%ymm12,%ymm12
5996 vpalignr $8,%ymm8,%ymm8,%ymm8
5997 vpalignr $12,%ymm4,%ymm4,%ymm4
/* Second half, set 1. */
5998 vpaddd %ymm5,%ymm1,%ymm1
5999 vpxor %ymm1,%ymm13,%ymm13
6000 vpshufb .rol16(%rip),%ymm13,%ymm13
6001 vpaddd %ymm13,%ymm9,%ymm9
6002 vpxor %ymm9,%ymm5,%ymm5
6003 vpsrld $20,%ymm5,%ymm3
6004 vpslld $12,%ymm5,%ymm5
6005 vpxor %ymm3,%ymm5,%ymm5
6006 vpaddd %ymm5,%ymm1,%ymm1
6007 vpxor %ymm1,%ymm13,%ymm13
6008 vpshufb .rol8(%rip),%ymm13,%ymm13
6009 vpaddd %ymm13,%ymm9,%ymm9
6010 vpxor %ymm9,%ymm5,%ymm5
6011 vpslld $7,%ymm5,%ymm3
6012 vpsrld $25,%ymm5,%ymm5
6013 vpxor %ymm3,%ymm5,%ymm5
6014 vpalignr $4,%ymm13,%ymm13,%ymm13
6015 vpalignr $8,%ymm9,%ymm9,%ymm9
6016 vpalignr $12,%ymm5,%ymm5,%ymm5
/* Second half, set 2. */
6017 vpaddd %ymm6,%ymm2,%ymm2
6018 vpxor %ymm2,%ymm14,%ymm14
6019 vpshufb .rol16(%rip),%ymm14,%ymm14
6020 vpaddd %ymm14,%ymm10,%ymm10
6021 vpxor %ymm10,%ymm6,%ymm6
6022 vpsrld $20,%ymm6,%ymm3
6023 vpslld $12,%ymm6,%ymm6
6024 vpxor %ymm3,%ymm6,%ymm6
6025 vpaddd %ymm6,%ymm2,%ymm2
6026 vpxor %ymm2,%ymm14,%ymm14
6027 vpshufb .rol8(%rip),%ymm14,%ymm14
6028 vpaddd %ymm14,%ymm10,%ymm10
6029 vpxor %ymm10,%ymm6,%ymm6
6030 vpslld $7,%ymm6,%ymm3
6031 vpsrld $25,%ymm6,%ymm6
6032 vpxor %ymm3,%ymm6,%ymm6
6033 vpalignr $4,%ymm14,%ymm14,%ymm14
6034 vpalignr $8,%ymm10,%ymm10,%ymm10
6035 vpalignr $12,%ymm6,%ymm6,%ymm6
6036
6037 decq %r10
6038 jne 1b
/* Feed-forward: row 0 += .chacha20_consts, rows 1/2 += saved ymm7/ymm11,
   row 3 += the per-set values saved on the stack. */
6039 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6040 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6041 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6042 vpaddd %ymm7,%ymm4,%ymm4
6043 vpaddd %ymm7,%ymm5,%ymm5
6044 vpaddd %ymm7,%ymm6,%ymm6
6045 vpaddd %ymm11,%ymm8,%ymm8
6046 vpaddd %ymm11,%ymm9,%ymm9
6047 vpaddd %ymm11,%ymm10,%ymm10
6048 vpaddd 160(%rbp),%ymm12,%ymm12
6049 vpaddd 192(%rbp),%ymm13,%ymm13
6050 vpaddd 224(%rbp),%ymm14,%ymm14
/* ymm3 = low lanes of rows 0 and 1 of set 0, i.e. the first 32 output
   bytes of the first block. */
6051 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
6052
/* Mask with .clamp (clamps the low 16 bytes, keeps the high 16 bytes) and
   store at 0(%rbp), where the Poly1305 multiply steps later read their
   multiplier from. */
6053 vpand .clamp(%rip),%ymm3,%ymm3
6054 vmovdqa %ymm3,0(%rbp)
6055
/* Regroup lanes into contiguous 32-byte chunks, in the order the consumer
   loop drains them: ymm0, ymm4, ymm8, ymm12, ymm1, ymm5, ymm9, ymm13,
   ymm2, ymm6 (10 x 32 = 320). The low lanes of rows 2 and 3 of set 0
   (rest of the first block) are not carried forward. */
6056 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
6057 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
6058 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
6059 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
6060 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
6061 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
6062 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
6063 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
6064 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
6065 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
6066 jmp open_avx2_short
6067
6068
6069
6070
6071.p2align 6
6072chacha20_poly1305_seal_avx2:
6073 vzeroupper
6074 vmovdqa .chacha20_consts(%rip),%ymm0
6075 vbroadcasti128 0(%r9),%ymm4
6076 vbroadcasti128 16(%r9),%ymm8
6077 vbroadcasti128 32(%r9),%ymm12
6078 vpaddd .avx2_init(%rip),%ymm12,%ymm12
6079 cmpq $192,%rbx
6080 jbe seal_avx2_192
6081 cmpq $320,%rbx
6082 jbe seal_avx2_320
6083 vmovdqa %ymm0,%ymm1
6084 vmovdqa %ymm0,%ymm2
6085 vmovdqa %ymm0,%ymm3
6086 vmovdqa %ymm4,%ymm5
6087 vmovdqa %ymm4,%ymm6
6088 vmovdqa %ymm4,%ymm7
6089 vmovdqa %ymm4,64(%rbp)
6090 vmovdqa %ymm8,%ymm9
6091 vmovdqa %ymm8,%ymm10
6092 vmovdqa %ymm8,%ymm11
6093 vmovdqa %ymm8,96(%rbp)
6094 vmovdqa %ymm12,%ymm15
6095 vpaddd .avx2_inc(%rip),%ymm15,%ymm14
6096 vpaddd .avx2_inc(%rip),%ymm14,%ymm13
6097 vpaddd .avx2_inc(%rip),%ymm13,%ymm12
6098 vmovdqa %ymm12,160(%rbp)
6099 vmovdqa %ymm13,192(%rbp)
6100 vmovdqa %ymm14,224(%rbp)
6101 vmovdqa %ymm15,256(%rbp)
6102 movq $10,%r10
61031:
6104 vmovdqa %ymm8,128(%rbp)
6105 vmovdqa .rol16(%rip),%ymm8
6106 vpaddd %ymm7,%ymm3,%ymm3
6107 vpaddd %ymm6,%ymm2,%ymm2
6108 vpaddd %ymm5,%ymm1,%ymm1
6109 vpaddd %ymm4,%ymm0,%ymm0
6110 vpxor %ymm3,%ymm15,%ymm15
6111 vpxor %ymm2,%ymm14,%ymm14
6112 vpxor %ymm1,%ymm13,%ymm13
6113 vpxor %ymm0,%ymm12,%ymm12
6114 vpshufb %ymm8,%ymm15,%ymm15
6115 vpshufb %ymm8,%ymm14,%ymm14
6116 vpshufb %ymm8,%ymm13,%ymm13
6117 vpshufb %ymm8,%ymm12,%ymm12
6118 vmovdqa 128(%rbp),%ymm8
6119 vpaddd %ymm15,%ymm11,%ymm11
6120 vpaddd %ymm14,%ymm10,%ymm10
6121 vpaddd %ymm13,%ymm9,%ymm9
6122 vpaddd %ymm12,%ymm8,%ymm8
6123 vpxor %ymm11,%ymm7,%ymm7
6124 vpxor %ymm10,%ymm6,%ymm6
6125 vpxor %ymm9,%ymm5,%ymm5
6126 vpxor %ymm8,%ymm4,%ymm4
6127 vmovdqa %ymm8,128(%rbp)
6128 vpsrld $20,%ymm7,%ymm8
6129 vpslld $32-20,%ymm7,%ymm7
6130 vpxor %ymm8,%ymm7,%ymm7
6131 vpsrld $20,%ymm6,%ymm8
6132 vpslld $32-20,%ymm6,%ymm6
6133 vpxor %ymm8,%ymm6,%ymm6
6134 vpsrld $20,%ymm5,%ymm8
6135 vpslld $32-20,%ymm5,%ymm5
6136 vpxor %ymm8,%ymm5,%ymm5
6137 vpsrld $20,%ymm4,%ymm8
6138 vpslld $32-20,%ymm4,%ymm4
6139 vpxor %ymm8,%ymm4,%ymm4
6140 vmovdqa .rol8(%rip),%ymm8
6141 vpaddd %ymm7,%ymm3,%ymm3
6142 vpaddd %ymm6,%ymm2,%ymm2
6143 vpaddd %ymm5,%ymm1,%ymm1
6144 vpaddd %ymm4,%ymm0,%ymm0
6145 vpxor %ymm3,%ymm15,%ymm15
6146 vpxor %ymm2,%ymm14,%ymm14
6147 vpxor %ymm1,%ymm13,%ymm13
6148 vpxor %ymm0,%ymm12,%ymm12
6149 vpshufb %ymm8,%ymm15,%ymm15
6150 vpshufb %ymm8,%ymm14,%ymm14
6151 vpshufb %ymm8,%ymm13,%ymm13
6152 vpshufb %ymm8,%ymm12,%ymm12
6153 vmovdqa 128(%rbp),%ymm8
6154 vpaddd %ymm15,%ymm11,%ymm11
6155 vpaddd %ymm14,%ymm10,%ymm10
6156 vpaddd %ymm13,%ymm9,%ymm9
6157 vpaddd %ymm12,%ymm8,%ymm8
6158 vpxor %ymm11,%ymm7,%ymm7
6159 vpxor %ymm10,%ymm6,%ymm6
6160 vpxor %ymm9,%ymm5,%ymm5
6161 vpxor %ymm8,%ymm4,%ymm4
6162 vmovdqa %ymm8,128(%rbp)
6163 vpsrld $25,%ymm7,%ymm8
6164 vpslld $32-25,%ymm7,%ymm7
6165 vpxor %ymm8,%ymm7,%ymm7
6166 vpsrld $25,%ymm6,%ymm8
6167 vpslld $32-25,%ymm6,%ymm6
6168 vpxor %ymm8,%ymm6,%ymm6
6169 vpsrld $25,%ymm5,%ymm8
6170 vpslld $32-25,%ymm5,%ymm5
6171 vpxor %ymm8,%ymm5,%ymm5
6172 vpsrld $25,%ymm4,%ymm8
6173 vpslld $32-25,%ymm4,%ymm4
6174 vpxor %ymm8,%ymm4,%ymm4
6175 vmovdqa 128(%rbp),%ymm8
6176 vpalignr $4,%ymm7,%ymm7,%ymm7
6177 vpalignr $8,%ymm11,%ymm11,%ymm11
6178 vpalignr $12,%ymm15,%ymm15,%ymm15
6179 vpalignr $4,%ymm6,%ymm6,%ymm6
6180 vpalignr $8,%ymm10,%ymm10,%ymm10
6181 vpalignr $12,%ymm14,%ymm14,%ymm14
6182 vpalignr $4,%ymm5,%ymm5,%ymm5
6183 vpalignr $8,%ymm9,%ymm9,%ymm9
6184 vpalignr $12,%ymm13,%ymm13,%ymm13
6185 vpalignr $4,%ymm4,%ymm4,%ymm4
6186 vpalignr $8,%ymm8,%ymm8,%ymm8
6187 vpalignr $12,%ymm12,%ymm12,%ymm12
6188 vmovdqa %ymm8,128(%rbp)
6189 vmovdqa .rol16(%rip),%ymm8
6190 vpaddd %ymm7,%ymm3,%ymm3
6191 vpaddd %ymm6,%ymm2,%ymm2
6192 vpaddd %ymm5,%ymm1,%ymm1
6193 vpaddd %ymm4,%ymm0,%ymm0
6194 vpxor %ymm3,%ymm15,%ymm15
6195 vpxor %ymm2,%ymm14,%ymm14
6196 vpxor %ymm1,%ymm13,%ymm13
6197 vpxor %ymm0,%ymm12,%ymm12
6198 vpshufb %ymm8,%ymm15,%ymm15
6199 vpshufb %ymm8,%ymm14,%ymm14
6200 vpshufb %ymm8,%ymm13,%ymm13
6201 vpshufb %ymm8,%ymm12,%ymm12
6202 vmovdqa 128(%rbp),%ymm8
6203 vpaddd %ymm15,%ymm11,%ymm11
6204 vpaddd %ymm14,%ymm10,%ymm10
6205 vpaddd %ymm13,%ymm9,%ymm9
6206 vpaddd %ymm12,%ymm8,%ymm8
6207 vpxor %ymm11,%ymm7,%ymm7
6208 vpxor %ymm10,%ymm6,%ymm6
6209 vpxor %ymm9,%ymm5,%ymm5
6210 vpxor %ymm8,%ymm4,%ymm4
6211 vmovdqa %ymm8,128(%rbp)
6212 vpsrld $20,%ymm7,%ymm8
6213 vpslld $32-20,%ymm7,%ymm7
6214 vpxor %ymm8,%ymm7,%ymm7
6215 vpsrld $20,%ymm6,%ymm8
6216 vpslld $32-20,%ymm6,%ymm6
6217 vpxor %ymm8,%ymm6,%ymm6
6218 vpsrld $20,%ymm5,%ymm8
6219 vpslld $32-20,%ymm5,%ymm5
6220 vpxor %ymm8,%ymm5,%ymm5
6221 vpsrld $20,%ymm4,%ymm8
6222 vpslld $32-20,%ymm4,%ymm4
6223 vpxor %ymm8,%ymm4,%ymm4
6224 vmovdqa .rol8(%rip),%ymm8
6225 vpaddd %ymm7,%ymm3,%ymm3
6226 vpaddd %ymm6,%ymm2,%ymm2
6227 vpaddd %ymm5,%ymm1,%ymm1
6228 vpaddd %ymm4,%ymm0,%ymm0
6229 vpxor %ymm3,%ymm15,%ymm15
6230 vpxor %ymm2,%ymm14,%ymm14
6231 vpxor %ymm1,%ymm13,%ymm13
6232 vpxor %ymm0,%ymm12,%ymm12
6233 vpshufb %ymm8,%ymm15,%ymm15
6234 vpshufb %ymm8,%ymm14,%ymm14
6235 vpshufb %ymm8,%ymm13,%ymm13
6236 vpshufb %ymm8,%ymm12,%ymm12
6237 vmovdqa 128(%rbp),%ymm8
6238 vpaddd %ymm15,%ymm11,%ymm11
6239 vpaddd %ymm14,%ymm10,%ymm10
6240 vpaddd %ymm13,%ymm9,%ymm9
6241 vpaddd %ymm12,%ymm8,%ymm8
6242 vpxor %ymm11,%ymm7,%ymm7
6243 vpxor %ymm10,%ymm6,%ymm6
6244 vpxor %ymm9,%ymm5,%ymm5
6245 vpxor %ymm8,%ymm4,%ymm4
6246 vmovdqa %ymm8,128(%rbp)
6247 vpsrld $25,%ymm7,%ymm8
6248 vpslld $32-25,%ymm7,%ymm7
6249 vpxor %ymm8,%ymm7,%ymm7
6250 vpsrld $25,%ymm6,%ymm8
6251 vpslld $32-25,%ymm6,%ymm6
6252 vpxor %ymm8,%ymm6,%ymm6
6253 vpsrld $25,%ymm5,%ymm8
6254 vpslld $32-25,%ymm5,%ymm5
6255 vpxor %ymm8,%ymm5,%ymm5
6256 vpsrld $25,%ymm4,%ymm8
6257 vpslld $32-25,%ymm4,%ymm4
6258 vpxor %ymm8,%ymm4,%ymm4
6259 vmovdqa 128(%rbp),%ymm8
6260 vpalignr $12,%ymm7,%ymm7,%ymm7
6261 vpalignr $8,%ymm11,%ymm11,%ymm11
6262 vpalignr $4,%ymm15,%ymm15,%ymm15
6263 vpalignr $12,%ymm6,%ymm6,%ymm6
6264 vpalignr $8,%ymm10,%ymm10,%ymm10
6265 vpalignr $4,%ymm14,%ymm14,%ymm14
6266 vpalignr $12,%ymm5,%ymm5,%ymm5
6267 vpalignr $8,%ymm9,%ymm9,%ymm9
6268 vpalignr $4,%ymm13,%ymm13,%ymm13
6269 vpalignr $12,%ymm4,%ymm4,%ymm4
6270 vpalignr $8,%ymm8,%ymm8,%ymm8
6271 vpalignr $4,%ymm12,%ymm12,%ymm12
6272
6273 decq %r10
6274 jnz 1b
6275 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
6276 vpaddd 64(%rbp),%ymm7,%ymm7
6277 vpaddd 96(%rbp),%ymm11,%ymm11
6278 vpaddd 256(%rbp),%ymm15,%ymm15
6279 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6280 vpaddd 64(%rbp),%ymm6,%ymm6
6281 vpaddd 96(%rbp),%ymm10,%ymm10
6282 vpaddd 224(%rbp),%ymm14,%ymm14
6283 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6284 vpaddd 64(%rbp),%ymm5,%ymm5
6285 vpaddd 96(%rbp),%ymm9,%ymm9
6286 vpaddd 192(%rbp),%ymm13,%ymm13
6287 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6288 vpaddd 64(%rbp),%ymm4,%ymm4
6289 vpaddd 96(%rbp),%ymm8,%ymm8
6290 vpaddd 160(%rbp),%ymm12,%ymm12
6291
6292 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
6293 vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
6294 vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
6295 vpand .clamp(%rip),%ymm15,%ymm15
6296 vmovdqa %ymm15,0(%rbp)
6297 movq %r8,%r8
6298 call poly_hash_ad_internal
6299
6300 vpxor 0(%rsi),%ymm3,%ymm3
6301 vpxor 32(%rsi),%ymm11,%ymm11
6302 vmovdqu %ymm3,0(%rdi)
6303 vmovdqu %ymm11,32(%rdi)
6304 vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
6305 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
6306 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
6307 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
6308 vpxor 0+64(%rsi),%ymm15,%ymm15
6309 vpxor 32+64(%rsi),%ymm2,%ymm2
6310 vpxor 64+64(%rsi),%ymm6,%ymm6
6311 vpxor 96+64(%rsi),%ymm10,%ymm10
6312 vmovdqu %ymm15,0+64(%rdi)
6313 vmovdqu %ymm2,32+64(%rdi)
6314 vmovdqu %ymm6,64+64(%rdi)
6315 vmovdqu %ymm10,96+64(%rdi)
6316 vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
6317 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
6318 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
6319 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
6320 vpxor 0+192(%rsi),%ymm15,%ymm15
6321 vpxor 32+192(%rsi),%ymm1,%ymm1
6322 vpxor 64+192(%rsi),%ymm5,%ymm5
6323 vpxor 96+192(%rsi),%ymm9,%ymm9
6324 vmovdqu %ymm15,0+192(%rdi)
6325 vmovdqu %ymm1,32+192(%rdi)
6326 vmovdqu %ymm5,64+192(%rdi)
6327 vmovdqu %ymm9,96+192(%rdi)
6328 vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
6329 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
6330 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
6331 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
6332 vmovdqa %ymm15,%ymm8
6333
6334 leaq 320(%rsi),%rsi
6335 subq $320,%rbx
6336 movq $320,%rcx
6337 cmpq $128,%rbx
6338 jbe seal_avx2_hash
6339 vpxor 0(%rsi),%ymm0,%ymm0
6340 vpxor 32(%rsi),%ymm4,%ymm4
6341 vpxor 64(%rsi),%ymm8,%ymm8
6342 vpxor 96(%rsi),%ymm12,%ymm12
6343 vmovdqu %ymm0,320(%rdi)
6344 vmovdqu %ymm4,352(%rdi)
6345 vmovdqu %ymm8,384(%rdi)
6346 vmovdqu %ymm12,416(%rdi)
6347 leaq 128(%rsi),%rsi
6348 subq $128,%rbx
6349 movq $8,%rcx
6350 movq $2,%r8
6351 cmpq $128,%rbx
6352 jbe seal_avx2_tail_128
6353 cmpq $256,%rbx
6354 jbe seal_avx2_tail_256
6355 cmpq $384,%rbx
6356 jbe seal_avx2_tail_384
6357 cmpq $512,%rbx
6358 jbe seal_avx2_tail_512
6359 vmovdqa .chacha20_consts(%rip),%ymm0
6360 vmovdqa 64(%rbp),%ymm4
6361 vmovdqa 96(%rbp),%ymm8
6362 vmovdqa %ymm0,%ymm1
6363 vmovdqa %ymm4,%ymm5
6364 vmovdqa %ymm8,%ymm9
6365 vmovdqa %ymm0,%ymm2
6366 vmovdqa %ymm4,%ymm6
6367 vmovdqa %ymm8,%ymm10
6368 vmovdqa %ymm0,%ymm3
6369 vmovdqa %ymm4,%ymm7
6370 vmovdqa %ymm8,%ymm11
6371 vmovdqa .avx2_inc(%rip),%ymm12
6372 vpaddd 160(%rbp),%ymm12,%ymm15
6373 vpaddd %ymm15,%ymm12,%ymm14
6374 vpaddd %ymm14,%ymm12,%ymm13
6375 vpaddd %ymm13,%ymm12,%ymm12
6376 vmovdqa %ymm15,256(%rbp)
6377 vmovdqa %ymm14,224(%rbp)
6378 vmovdqa %ymm13,192(%rbp)
6379 vmovdqa %ymm12,160(%rbp)
6380 vmovdqa %ymm8,128(%rbp)
6381 vmovdqa .rol16(%rip),%ymm8
6382 vpaddd %ymm7,%ymm3,%ymm3
6383 vpaddd %ymm6,%ymm2,%ymm2
6384 vpaddd %ymm5,%ymm1,%ymm1
6385 vpaddd %ymm4,%ymm0,%ymm0
6386 vpxor %ymm3,%ymm15,%ymm15
6387 vpxor %ymm2,%ymm14,%ymm14
6388 vpxor %ymm1,%ymm13,%ymm13
6389 vpxor %ymm0,%ymm12,%ymm12
6390 vpshufb %ymm8,%ymm15,%ymm15
6391 vpshufb %ymm8,%ymm14,%ymm14
6392 vpshufb %ymm8,%ymm13,%ymm13
6393 vpshufb %ymm8,%ymm12,%ymm12
6394 vmovdqa 128(%rbp),%ymm8
6395 vpaddd %ymm15,%ymm11,%ymm11
6396 vpaddd %ymm14,%ymm10,%ymm10
6397 vpaddd %ymm13,%ymm9,%ymm9
6398 vpaddd %ymm12,%ymm8,%ymm8
6399 vpxor %ymm11,%ymm7,%ymm7
6400 vpxor %ymm10,%ymm6,%ymm6
6401 vpxor %ymm9,%ymm5,%ymm5
6402 vpxor %ymm8,%ymm4,%ymm4
6403 vmovdqa %ymm8,128(%rbp)
6404 vpsrld $20,%ymm7,%ymm8
6405 vpslld $32-20,%ymm7,%ymm7
6406 vpxor %ymm8,%ymm7,%ymm7
6407 vpsrld $20,%ymm6,%ymm8
6408 vpslld $32-20,%ymm6,%ymm6
6409 vpxor %ymm8,%ymm6,%ymm6
6410 vpsrld $20,%ymm5,%ymm8
6411 vpslld $32-20,%ymm5,%ymm5
6412 vpxor %ymm8,%ymm5,%ymm5
6413 vpsrld $20,%ymm4,%ymm8
6414 vpslld $32-20,%ymm4,%ymm4
6415 vpxor %ymm8,%ymm4,%ymm4
6416 vmovdqa .rol8(%rip),%ymm8
6417 vpaddd %ymm7,%ymm3,%ymm3
6418 vpaddd %ymm6,%ymm2,%ymm2
6419 vpaddd %ymm5,%ymm1,%ymm1
6420 vpaddd %ymm4,%ymm0,%ymm0
6421 vpxor %ymm3,%ymm15,%ymm15
6422 vpxor %ymm2,%ymm14,%ymm14
6423 vpxor %ymm1,%ymm13,%ymm13
6424 vpxor %ymm0,%ymm12,%ymm12
6425 vpshufb %ymm8,%ymm15,%ymm15
6426 vpshufb %ymm8,%ymm14,%ymm14
6427 vpshufb %ymm8,%ymm13,%ymm13
6428 vpshufb %ymm8,%ymm12,%ymm12
6429 vmovdqa 128(%rbp),%ymm8
6430 vpaddd %ymm15,%ymm11,%ymm11
6431 vpaddd %ymm14,%ymm10,%ymm10
6432 vpaddd %ymm13,%ymm9,%ymm9
6433 vpaddd %ymm12,%ymm8,%ymm8
6434 vpxor %ymm11,%ymm7,%ymm7
6435 vpxor %ymm10,%ymm6,%ymm6
6436 vpxor %ymm9,%ymm5,%ymm5
6437 vpxor %ymm8,%ymm4,%ymm4
6438 vmovdqa %ymm8,128(%rbp)
6439 vpsrld $25,%ymm7,%ymm8
6440 vpslld $32-25,%ymm7,%ymm7
6441 vpxor %ymm8,%ymm7,%ymm7
6442 vpsrld $25,%ymm6,%ymm8
6443 vpslld $32-25,%ymm6,%ymm6
6444 vpxor %ymm8,%ymm6,%ymm6
6445 vpsrld $25,%ymm5,%ymm8
6446 vpslld $32-25,%ymm5,%ymm5
6447 vpxor %ymm8,%ymm5,%ymm5
6448 vpsrld $25,%ymm4,%ymm8
6449 vpslld $32-25,%ymm4,%ymm4
6450 vpxor %ymm8,%ymm4,%ymm4
6451 vmovdqa 128(%rbp),%ymm8
6452 vpalignr $4,%ymm7,%ymm7,%ymm7
6453 vpalignr $8,%ymm11,%ymm11,%ymm11
6454 vpalignr $12,%ymm15,%ymm15,%ymm15
6455 vpalignr $4,%ymm6,%ymm6,%ymm6
6456 vpalignr $8,%ymm10,%ymm10,%ymm10
6457 vpalignr $12,%ymm14,%ymm14,%ymm14
6458 vpalignr $4,%ymm5,%ymm5,%ymm5
6459 vpalignr $8,%ymm9,%ymm9,%ymm9
6460 vpalignr $12,%ymm13,%ymm13,%ymm13
6461 vpalignr $4,%ymm4,%ymm4,%ymm4
6462 vpalignr $8,%ymm8,%ymm8,%ymm8
6463 vpalignr $12,%ymm12,%ymm12,%ymm12
6464 vmovdqa %ymm8,128(%rbp)
6465 vmovdqa .rol16(%rip),%ymm8
6466 vpaddd %ymm7,%ymm3,%ymm3
6467 vpaddd %ymm6,%ymm2,%ymm2
6468 vpaddd %ymm5,%ymm1,%ymm1
6469 vpaddd %ymm4,%ymm0,%ymm0
6470 vpxor %ymm3,%ymm15,%ymm15
6471 vpxor %ymm2,%ymm14,%ymm14
6472 vpxor %ymm1,%ymm13,%ymm13
6473 vpxor %ymm0,%ymm12,%ymm12
6474 vpshufb %ymm8,%ymm15,%ymm15
6475 vpshufb %ymm8,%ymm14,%ymm14
6476 vpshufb %ymm8,%ymm13,%ymm13
6477 vpshufb %ymm8,%ymm12,%ymm12
6478 vmovdqa 128(%rbp),%ymm8
6479 vpaddd %ymm15,%ymm11,%ymm11
6480 vpaddd %ymm14,%ymm10,%ymm10
6481 vpaddd %ymm13,%ymm9,%ymm9
6482 vpaddd %ymm12,%ymm8,%ymm8
6483 vpxor %ymm11,%ymm7,%ymm7
6484 vpxor %ymm10,%ymm6,%ymm6
6485 vpxor %ymm9,%ymm5,%ymm5
6486 vpxor %ymm8,%ymm4,%ymm4
6487 vmovdqa %ymm8,128(%rbp)
6488 vpsrld $20,%ymm7,%ymm8
6489 vpslld $32-20,%ymm7,%ymm7
6490 vpxor %ymm8,%ymm7,%ymm7
6491 vpsrld $20,%ymm6,%ymm8
6492 vpslld $32-20,%ymm6,%ymm6
6493 vpxor %ymm8,%ymm6,%ymm6
6494 vpsrld $20,%ymm5,%ymm8
6495 vpslld $32-20,%ymm5,%ymm5
6496 vpxor %ymm8,%ymm5,%ymm5
6497 vpsrld $20,%ymm4,%ymm8
6498 vpslld $32-20,%ymm4,%ymm4
6499 vpxor %ymm8,%ymm4,%ymm4
6500 vmovdqa .rol8(%rip),%ymm8
6501 vpaddd %ymm7,%ymm3,%ymm3
6502 vpaddd %ymm6,%ymm2,%ymm2
6503 vpaddd %ymm5,%ymm1,%ymm1
6504 vpaddd %ymm4,%ymm0,%ymm0
6505 vpxor %ymm3,%ymm15,%ymm15
6506 vpxor %ymm2,%ymm14,%ymm14
6507 vpxor %ymm1,%ymm13,%ymm13
6508 vpxor %ymm0,%ymm12,%ymm12
6509 vpshufb %ymm8,%ymm15,%ymm15
6510 vpshufb %ymm8,%ymm14,%ymm14
6511 vpshufb %ymm8,%ymm13,%ymm13
6512 vpshufb %ymm8,%ymm12,%ymm12
6513 vmovdqa 128(%rbp),%ymm8
6514 vpaddd %ymm15,%ymm11,%ymm11
6515 vpaddd %ymm14,%ymm10,%ymm10
6516 vpaddd %ymm13,%ymm9,%ymm9
6517 vpaddd %ymm12,%ymm8,%ymm8
6518 vpxor %ymm11,%ymm7,%ymm7
6519 vpxor %ymm10,%ymm6,%ymm6
6520 vpxor %ymm9,%ymm5,%ymm5
6521 vpxor %ymm8,%ymm4,%ymm4
6522 vmovdqa %ymm8,128(%rbp)
6523 vpsrld $25,%ymm7,%ymm8
6524 vpslld $32-25,%ymm7,%ymm7
6525 vpxor %ymm8,%ymm7,%ymm7
6526 vpsrld $25,%ymm6,%ymm8
6527 vpslld $32-25,%ymm6,%ymm6
6528 vpxor %ymm8,%ymm6,%ymm6
6529 vpsrld $25,%ymm5,%ymm8
6530 vpslld $32-25,%ymm5,%ymm5
6531 vpxor %ymm8,%ymm5,%ymm5
6532 vpsrld $25,%ymm4,%ymm8
6533 vpslld $32-25,%ymm4,%ymm4
6534 vpxor %ymm8,%ymm4,%ymm4
6535 vmovdqa 128(%rbp),%ymm8
6536 vpalignr $12,%ymm7,%ymm7,%ymm7
6537 vpalignr $8,%ymm11,%ymm11,%ymm11
6538 vpalignr $4,%ymm15,%ymm15,%ymm15
6539 vpalignr $12,%ymm6,%ymm6,%ymm6
6540 vpalignr $8,%ymm10,%ymm10,%ymm10
6541 vpalignr $4,%ymm14,%ymm14,%ymm14
6542 vpalignr $12,%ymm5,%ymm5,%ymm5
6543 vpalignr $8,%ymm9,%ymm9,%ymm9
6544 vpalignr $4,%ymm13,%ymm13,%ymm13
6545 vpalignr $12,%ymm4,%ymm4,%ymm4
6546 vpalignr $8,%ymm8,%ymm8,%ymm8
6547 vpalignr $4,%ymm12,%ymm12,%ymm12
6548 vmovdqa %ymm8,128(%rbp)
6549 vmovdqa .rol16(%rip),%ymm8
6550 vpaddd %ymm7,%ymm3,%ymm3
6551 vpaddd %ymm6,%ymm2,%ymm2
6552 vpaddd %ymm5,%ymm1,%ymm1
6553 vpaddd %ymm4,%ymm0,%ymm0
6554 vpxor %ymm3,%ymm15,%ymm15
6555 vpxor %ymm2,%ymm14,%ymm14
6556 vpxor %ymm1,%ymm13,%ymm13
6557 vpxor %ymm0,%ymm12,%ymm12
6558 vpshufb %ymm8,%ymm15,%ymm15
6559 vpshufb %ymm8,%ymm14,%ymm14
6560 vpshufb %ymm8,%ymm13,%ymm13
6561 vpshufb %ymm8,%ymm12,%ymm12
6562 vmovdqa 128(%rbp),%ymm8
6563 vpaddd %ymm15,%ymm11,%ymm11
6564 vpaddd %ymm14,%ymm10,%ymm10
6565 vpaddd %ymm13,%ymm9,%ymm9
6566 vpaddd %ymm12,%ymm8,%ymm8
6567 vpxor %ymm11,%ymm7,%ymm7
6568 vpxor %ymm10,%ymm6,%ymm6
6569 vpxor %ymm9,%ymm5,%ymm5
6570 vpxor %ymm8,%ymm4,%ymm4
6571 vmovdqa %ymm8,128(%rbp)
6572 vpsrld $20,%ymm7,%ymm8
6573 vpslld $32-20,%ymm7,%ymm7
6574 vpxor %ymm8,%ymm7,%ymm7
6575 vpsrld $20,%ymm6,%ymm8
6576 vpslld $32-20,%ymm6,%ymm6
6577 vpxor %ymm8,%ymm6,%ymm6
6578 vpsrld $20,%ymm5,%ymm8
6579 vpslld $32-20,%ymm5,%ymm5
6580 vpxor %ymm8,%ymm5,%ymm5
6581 vpsrld $20,%ymm4,%ymm8
6582 vpslld $32-20,%ymm4,%ymm4
6583 vpxor %ymm8,%ymm4,%ymm4
6584 vmovdqa .rol8(%rip),%ymm8
6585 vpaddd %ymm7,%ymm3,%ymm3
6586 vpaddd %ymm6,%ymm2,%ymm2
6587 vpaddd %ymm5,%ymm1,%ymm1
6588 vpaddd %ymm4,%ymm0,%ymm0
6589
6590 subq $16,%rdi
6591 movq $9,%rcx
6592 jmp 4f
65931:
6594 vmovdqa .chacha20_consts(%rip),%ymm0
6595 vmovdqa 64(%rbp),%ymm4
6596 vmovdqa 96(%rbp),%ymm8
6597 vmovdqa %ymm0,%ymm1
6598 vmovdqa %ymm4,%ymm5
6599 vmovdqa %ymm8,%ymm9
6600 vmovdqa %ymm0,%ymm2
6601 vmovdqa %ymm4,%ymm6
6602 vmovdqa %ymm8,%ymm10
6603 vmovdqa %ymm0,%ymm3
6604 vmovdqa %ymm4,%ymm7
6605 vmovdqa %ymm8,%ymm11
6606 vmovdqa .avx2_inc(%rip),%ymm12
6607 vpaddd 160(%rbp),%ymm12,%ymm15
6608 vpaddd %ymm15,%ymm12,%ymm14
6609 vpaddd %ymm14,%ymm12,%ymm13
6610 vpaddd %ymm13,%ymm12,%ymm12
6611 vmovdqa %ymm15,256(%rbp)
6612 vmovdqa %ymm14,224(%rbp)
6613 vmovdqa %ymm13,192(%rbp)
6614 vmovdqa %ymm12,160(%rbp)
6615
6616 movq $10,%rcx
66172:
6618 addq 0(%rdi),%r10
6619 adcq 8+0(%rdi),%r11
6620 adcq $1,%r12
6621 vmovdqa %ymm8,128(%rbp)
6622 vmovdqa .rol16(%rip),%ymm8
6623 vpaddd %ymm7,%ymm3,%ymm3
6624 vpaddd %ymm6,%ymm2,%ymm2
6625 vpaddd %ymm5,%ymm1,%ymm1
6626 vpaddd %ymm4,%ymm0,%ymm0
6627 vpxor %ymm3,%ymm15,%ymm15
6628 vpxor %ymm2,%ymm14,%ymm14
6629 vpxor %ymm1,%ymm13,%ymm13
6630 vpxor %ymm0,%ymm12,%ymm12
6631 movq 0+0(%rbp),%rdx
6632 movq %rdx,%r15
6633 mulxq %r10,%r13,%r14
6634 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006635 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006636 addq %rax,%r14
6637 adcq %rdx,%r15
6638 vpshufb %ymm8,%ymm15,%ymm15
6639 vpshufb %ymm8,%ymm14,%ymm14
6640 vpshufb %ymm8,%ymm13,%ymm13
6641 vpshufb %ymm8,%ymm12,%ymm12
6642 vmovdqa 128(%rbp),%ymm8
6643 vpaddd %ymm15,%ymm11,%ymm11
6644 vpaddd %ymm14,%ymm10,%ymm10
6645 vpaddd %ymm13,%ymm9,%ymm9
6646 vpaddd %ymm12,%ymm8,%ymm8
6647 movq 8+0(%rbp),%rdx
6648 mulxq %r10,%r10,%rax
6649 addq %r10,%r14
6650 mulxq %r11,%r11,%r9
6651 adcq %r11,%r15
6652 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006653 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05006654 vpxor %ymm11,%ymm7,%ymm7
6655 vpxor %ymm10,%ymm6,%ymm6
6656 vpxor %ymm9,%ymm5,%ymm5
6657 vpxor %ymm8,%ymm4,%ymm4
6658 vmovdqa %ymm8,128(%rbp)
6659 vpsrld $20,%ymm7,%ymm8
6660 vpslld $32-20,%ymm7,%ymm7
6661 vpxor %ymm8,%ymm7,%ymm7
6662 vpsrld $20,%ymm6,%ymm8
6663 vpslld $32-20,%ymm6,%ymm6
6664 vpxor %ymm8,%ymm6,%ymm6
6665 vpsrld $20,%ymm5,%ymm8
6666 addq %rax,%r15
6667 adcq %rdx,%r9
6668 vpslld $32-20,%ymm5,%ymm5
6669 vpxor %ymm8,%ymm5,%ymm5
6670 vpsrld $20,%ymm4,%ymm8
6671 vpslld $32-20,%ymm4,%ymm4
6672 vpxor %ymm8,%ymm4,%ymm4
6673 vmovdqa .rol8(%rip),%ymm8
6674 vpaddd %ymm7,%ymm3,%ymm3
6675 vpaddd %ymm6,%ymm2,%ymm2
6676 vpaddd %ymm5,%ymm1,%ymm1
6677 vpaddd %ymm4,%ymm0,%ymm0
6678 movq %r13,%r10
6679 movq %r14,%r11
6680 movq %r15,%r12
6681 andq $3,%r12
6682 movq %r15,%r13
6683 andq $-4,%r13
6684 movq %r9,%r14
6685 shrdq $2,%r9,%r15
6686 shrq $2,%r9
6687 addq %r13,%r10
6688 adcq %r14,%r11
6689 adcq $0,%r12
6690 addq %r15,%r10
6691 adcq %r9,%r11
6692 adcq $0,%r12
6693
66944:
6695 vpxor %ymm3,%ymm15,%ymm15
6696 vpxor %ymm2,%ymm14,%ymm14
6697 vpxor %ymm1,%ymm13,%ymm13
6698 vpxor %ymm0,%ymm12,%ymm12
6699 vpshufb %ymm8,%ymm15,%ymm15
6700 vpshufb %ymm8,%ymm14,%ymm14
6701 vpshufb %ymm8,%ymm13,%ymm13
6702 vpshufb %ymm8,%ymm12,%ymm12
6703 vmovdqa 128(%rbp),%ymm8
6704 addq 16(%rdi),%r10
6705 adcq 8+16(%rdi),%r11
6706 adcq $1,%r12
6707 vpaddd %ymm15,%ymm11,%ymm11
6708 vpaddd %ymm14,%ymm10,%ymm10
6709 vpaddd %ymm13,%ymm9,%ymm9
6710 vpaddd %ymm12,%ymm8,%ymm8
6711 vpxor %ymm11,%ymm7,%ymm7
6712 vpxor %ymm10,%ymm6,%ymm6
6713 vpxor %ymm9,%ymm5,%ymm5
6714 vpxor %ymm8,%ymm4,%ymm4
6715 movq 0+0(%rbp),%rdx
6716 movq %rdx,%r15
6717 mulxq %r10,%r13,%r14
6718 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006719 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006720 addq %rax,%r14
6721 adcq %rdx,%r15
6722 vmovdqa %ymm8,128(%rbp)
6723 vpsrld $25,%ymm7,%ymm8
6724 vpslld $32-25,%ymm7,%ymm7
6725 vpxor %ymm8,%ymm7,%ymm7
6726 vpsrld $25,%ymm6,%ymm8
6727 vpslld $32-25,%ymm6,%ymm6
6728 vpxor %ymm8,%ymm6,%ymm6
6729 vpsrld $25,%ymm5,%ymm8
6730 vpslld $32-25,%ymm5,%ymm5
6731 vpxor %ymm8,%ymm5,%ymm5
6732 vpsrld $25,%ymm4,%ymm8
6733 vpslld $32-25,%ymm4,%ymm4
6734 vpxor %ymm8,%ymm4,%ymm4
6735 vmovdqa 128(%rbp),%ymm8
6736 vpalignr $4,%ymm7,%ymm7,%ymm7
6737 vpalignr $8,%ymm11,%ymm11,%ymm11
6738 vpalignr $12,%ymm15,%ymm15,%ymm15
6739 vpalignr $4,%ymm6,%ymm6,%ymm6
6740 movq 8+0(%rbp),%rdx
6741 mulxq %r10,%r10,%rax
6742 addq %r10,%r14
6743 mulxq %r11,%r11,%r9
6744 adcq %r11,%r15
6745 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006746 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05006747 vpalignr $8,%ymm10,%ymm10,%ymm10
6748 vpalignr $12,%ymm14,%ymm14,%ymm14
6749 vpalignr $4,%ymm5,%ymm5,%ymm5
6750 vpalignr $8,%ymm9,%ymm9,%ymm9
6751 vpalignr $12,%ymm13,%ymm13,%ymm13
6752 vpalignr $4,%ymm4,%ymm4,%ymm4
6753 vpalignr $8,%ymm8,%ymm8,%ymm8
6754 vpalignr $12,%ymm12,%ymm12,%ymm12
6755 vmovdqa %ymm8,128(%rbp)
6756 vmovdqa .rol16(%rip),%ymm8
6757 vpaddd %ymm7,%ymm3,%ymm3
6758 vpaddd %ymm6,%ymm2,%ymm2
6759 vpaddd %ymm5,%ymm1,%ymm1
6760 vpaddd %ymm4,%ymm0,%ymm0
6761 vpxor %ymm3,%ymm15,%ymm15
6762 vpxor %ymm2,%ymm14,%ymm14
6763 vpxor %ymm1,%ymm13,%ymm13
6764 vpxor %ymm0,%ymm12,%ymm12
6765 addq %rax,%r15
6766 adcq %rdx,%r9
6767 vpshufb %ymm8,%ymm15,%ymm15
6768 vpshufb %ymm8,%ymm14,%ymm14
6769 vpshufb %ymm8,%ymm13,%ymm13
6770 vpshufb %ymm8,%ymm12,%ymm12
6771 vmovdqa 128(%rbp),%ymm8
6772 vpaddd %ymm15,%ymm11,%ymm11
6773 vpaddd %ymm14,%ymm10,%ymm10
6774 vpaddd %ymm13,%ymm9,%ymm9
6775 vpaddd %ymm12,%ymm8,%ymm8
6776 movq %r13,%r10
6777 movq %r14,%r11
6778 movq %r15,%r12
6779 andq $3,%r12
6780 movq %r15,%r13
6781 andq $-4,%r13
6782 movq %r9,%r14
6783 shrdq $2,%r9,%r15
6784 shrq $2,%r9
6785 addq %r13,%r10
6786 adcq %r14,%r11
6787 adcq $0,%r12
6788 addq %r15,%r10
6789 adcq %r9,%r11
6790 adcq $0,%r12
6791 vpxor %ymm11,%ymm7,%ymm7
6792 vpxor %ymm10,%ymm6,%ymm6
6793 vpxor %ymm9,%ymm5,%ymm5
6794 vpxor %ymm8,%ymm4,%ymm4
6795 vmovdqa %ymm8,128(%rbp)
6796 vpsrld $20,%ymm7,%ymm8
6797 vpslld $32-20,%ymm7,%ymm7
6798 vpxor %ymm8,%ymm7,%ymm7
6799 addq 32(%rdi),%r10
6800 adcq 8+32(%rdi),%r11
6801 adcq $1,%r12
6802
6803 leaq 48(%rdi),%rdi
6804 vpsrld $20,%ymm6,%ymm8
6805 vpslld $32-20,%ymm6,%ymm6
6806 vpxor %ymm8,%ymm6,%ymm6
6807 vpsrld $20,%ymm5,%ymm8
6808 vpslld $32-20,%ymm5,%ymm5
6809 vpxor %ymm8,%ymm5,%ymm5
6810 vpsrld $20,%ymm4,%ymm8
6811 vpslld $32-20,%ymm4,%ymm4
6812 vpxor %ymm8,%ymm4,%ymm4
6813 vmovdqa .rol8(%rip),%ymm8
6814 vpaddd %ymm7,%ymm3,%ymm3
6815 vpaddd %ymm6,%ymm2,%ymm2
6816 vpaddd %ymm5,%ymm1,%ymm1
6817 vpaddd %ymm4,%ymm0,%ymm0
6818 vpxor %ymm3,%ymm15,%ymm15
6819 vpxor %ymm2,%ymm14,%ymm14
6820 vpxor %ymm1,%ymm13,%ymm13
6821 vpxor %ymm0,%ymm12,%ymm12
6822 movq 0+0(%rbp),%rdx
6823 movq %rdx,%r15
6824 mulxq %r10,%r13,%r14
6825 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006826 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006827 addq %rax,%r14
6828 adcq %rdx,%r15
6829 vpshufb %ymm8,%ymm15,%ymm15
6830 vpshufb %ymm8,%ymm14,%ymm14
6831 vpshufb %ymm8,%ymm13,%ymm13
6832 vpshufb %ymm8,%ymm12,%ymm12
6833 vmovdqa 128(%rbp),%ymm8
6834 vpaddd %ymm15,%ymm11,%ymm11
6835 vpaddd %ymm14,%ymm10,%ymm10
6836 vpaddd %ymm13,%ymm9,%ymm9
6837 movq 8+0(%rbp),%rdx
6838 mulxq %r10,%r10,%rax
6839 addq %r10,%r14
6840 mulxq %r11,%r11,%r9
6841 adcq %r11,%r15
6842 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08006843 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05006844 vpaddd %ymm12,%ymm8,%ymm8
6845 vpxor %ymm11,%ymm7,%ymm7
6846 vpxor %ymm10,%ymm6,%ymm6
6847 vpxor %ymm9,%ymm5,%ymm5
6848 vpxor %ymm8,%ymm4,%ymm4
6849 vmovdqa %ymm8,128(%rbp)
6850 vpsrld $25,%ymm7,%ymm8
6851 vpslld $32-25,%ymm7,%ymm7
6852 addq %rax,%r15
6853 adcq %rdx,%r9
6854 vpxor %ymm8,%ymm7,%ymm7
6855 vpsrld $25,%ymm6,%ymm8
6856 vpslld $32-25,%ymm6,%ymm6
6857 vpxor %ymm8,%ymm6,%ymm6
6858 vpsrld $25,%ymm5,%ymm8
6859 vpslld $32-25,%ymm5,%ymm5
6860 vpxor %ymm8,%ymm5,%ymm5
6861 vpsrld $25,%ymm4,%ymm8
6862 vpslld $32-25,%ymm4,%ymm4
6863 vpxor %ymm8,%ymm4,%ymm4
6864 vmovdqa 128(%rbp),%ymm8
6865 vpalignr $12,%ymm7,%ymm7,%ymm7
6866 vpalignr $8,%ymm11,%ymm11,%ymm11
6867 vpalignr $4,%ymm15,%ymm15,%ymm15
6868 vpalignr $12,%ymm6,%ymm6,%ymm6
6869 vpalignr $8,%ymm10,%ymm10,%ymm10
6870 vpalignr $4,%ymm14,%ymm14,%ymm14
6871 vpalignr $12,%ymm5,%ymm5,%ymm5
6872 movq %r13,%r10
6873 movq %r14,%r11
6874 movq %r15,%r12
6875 andq $3,%r12
6876 movq %r15,%r13
6877 andq $-4,%r13
6878 movq %r9,%r14
6879 shrdq $2,%r9,%r15
6880 shrq $2,%r9
6881 addq %r13,%r10
6882 adcq %r14,%r11
6883 adcq $0,%r12
6884 addq %r15,%r10
6885 adcq %r9,%r11
6886 adcq $0,%r12
6887 vpalignr $8,%ymm9,%ymm9,%ymm9
6888 vpalignr $4,%ymm13,%ymm13,%ymm13
6889 vpalignr $12,%ymm4,%ymm4,%ymm4
6890 vpalignr $8,%ymm8,%ymm8,%ymm8
6891 vpalignr $4,%ymm12,%ymm12,%ymm12
6892
6893 decq %rcx
6894 jne 2b
6895 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
6896 vpaddd 64(%rbp),%ymm7,%ymm7
6897 vpaddd 96(%rbp),%ymm11,%ymm11
6898 vpaddd 256(%rbp),%ymm15,%ymm15
6899 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
6900 vpaddd 64(%rbp),%ymm6,%ymm6
6901 vpaddd 96(%rbp),%ymm10,%ymm10
6902 vpaddd 224(%rbp),%ymm14,%ymm14
6903 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
6904 vpaddd 64(%rbp),%ymm5,%ymm5
6905 vpaddd 96(%rbp),%ymm9,%ymm9
6906 vpaddd 192(%rbp),%ymm13,%ymm13
6907 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
6908 vpaddd 64(%rbp),%ymm4,%ymm4
6909 vpaddd 96(%rbp),%ymm8,%ymm8
6910 vpaddd 160(%rbp),%ymm12,%ymm12
6911
6912 leaq 32(%rdi),%rdi
6913 vmovdqa %ymm0,128(%rbp)
6914 addq -32(%rdi),%r10
6915 adcq 8+-32(%rdi),%r11
6916 adcq $1,%r12
6917 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
6918 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
6919 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
6920 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
6921 vpxor 0+0(%rsi),%ymm0,%ymm0
6922 vpxor 32+0(%rsi),%ymm3,%ymm3
6923 vpxor 64+0(%rsi),%ymm7,%ymm7
6924 vpxor 96+0(%rsi),%ymm11,%ymm11
6925 vmovdqu %ymm0,0+0(%rdi)
6926 vmovdqu %ymm3,32+0(%rdi)
6927 vmovdqu %ymm7,64+0(%rdi)
6928 vmovdqu %ymm11,96+0(%rdi)
6929
6930 vmovdqa 128(%rbp),%ymm0
6931 movq 0+0(%rbp),%rax
6932 movq %rax,%r15
6933 mulq %r10
6934 movq %rax,%r13
6935 movq %rdx,%r14
6936 movq 0+0(%rbp),%rax
6937 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08006938 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05006939 addq %rax,%r14
6940 adcq %rdx,%r15
6941 movq 8+0(%rbp),%rax
6942 movq %rax,%r9
6943 mulq %r10
6944 addq %rax,%r14
6945 adcq $0,%rdx
6946 movq %rdx,%r10
6947 movq 8+0(%rbp),%rax
6948 mulq %r11
6949 addq %rax,%r15
6950 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08006951 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05006952 addq %r10,%r15
6953 adcq %rdx,%r9
6954 movq %r13,%r10
6955 movq %r14,%r11
6956 movq %r15,%r12
6957 andq $3,%r12
6958 movq %r15,%r13
6959 andq $-4,%r13
6960 movq %r9,%r14
6961 shrdq $2,%r9,%r15
6962 shrq $2,%r9
6963 addq %r13,%r10
6964 adcq %r14,%r11
6965 adcq $0,%r12
6966 addq %r15,%r10
6967 adcq %r9,%r11
6968 adcq $0,%r12
6969 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
6970 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
6971 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
6972 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
6973 vpxor 0+128(%rsi),%ymm3,%ymm3
6974 vpxor 32+128(%rsi),%ymm2,%ymm2
6975 vpxor 64+128(%rsi),%ymm6,%ymm6
6976 vpxor 96+128(%rsi),%ymm10,%ymm10
6977 vmovdqu %ymm3,0+128(%rdi)
6978 vmovdqu %ymm2,32+128(%rdi)
6979 vmovdqu %ymm6,64+128(%rdi)
6980 vmovdqu %ymm10,96+128(%rdi)
6981 addq -16(%rdi),%r10
6982 adcq 8+-16(%rdi),%r11
6983 adcq $1,%r12
6984 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
6985 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
6986 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
6987 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
6988 vpxor 0+256(%rsi),%ymm3,%ymm3
6989 vpxor 32+256(%rsi),%ymm1,%ymm1
6990 vpxor 64+256(%rsi),%ymm5,%ymm5
6991 vpxor 96+256(%rsi),%ymm9,%ymm9
6992 vmovdqu %ymm3,0+256(%rdi)
6993 vmovdqu %ymm1,32+256(%rdi)
6994 vmovdqu %ymm5,64+256(%rdi)
6995 vmovdqu %ymm9,96+256(%rdi)
6996 movq 0+0(%rbp),%rax
6997 movq %rax,%r15
6998 mulq %r10
6999 movq %rax,%r13
7000 movq %rdx,%r14
7001 movq 0+0(%rbp),%rax
7002 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007003 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007004 addq %rax,%r14
7005 adcq %rdx,%r15
7006 movq 8+0(%rbp),%rax
7007 movq %rax,%r9
7008 mulq %r10
7009 addq %rax,%r14
7010 adcq $0,%rdx
7011 movq %rdx,%r10
7012 movq 8+0(%rbp),%rax
7013 mulq %r11
7014 addq %rax,%r15
7015 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007016 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007017 addq %r10,%r15
7018 adcq %rdx,%r9
7019 movq %r13,%r10
7020 movq %r14,%r11
7021 movq %r15,%r12
7022 andq $3,%r12
7023 movq %r15,%r13
7024 andq $-4,%r13
7025 movq %r9,%r14
7026 shrdq $2,%r9,%r15
7027 shrq $2,%r9
7028 addq %r13,%r10
7029 adcq %r14,%r11
7030 adcq $0,%r12
7031 addq %r15,%r10
7032 adcq %r9,%r11
7033 adcq $0,%r12
7034 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
7035 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
7036 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
7037 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
7038 vpxor 0+384(%rsi),%ymm3,%ymm3
7039 vpxor 32+384(%rsi),%ymm0,%ymm0
7040 vpxor 64+384(%rsi),%ymm4,%ymm4
7041 vpxor 96+384(%rsi),%ymm8,%ymm8
7042 vmovdqu %ymm3,0+384(%rdi)
7043 vmovdqu %ymm0,32+384(%rdi)
7044 vmovdqu %ymm4,64+384(%rdi)
7045 vmovdqu %ymm8,96+384(%rdi)
7046
7047 leaq 512(%rsi),%rsi
7048 subq $512,%rbx
7049 cmpq $512,%rbx
7050 jg 1b
7051 addq 0(%rdi),%r10
7052 adcq 8+0(%rdi),%r11
7053 adcq $1,%r12
7054 movq 0+0(%rbp),%rax
7055 movq %rax,%r15
7056 mulq %r10
7057 movq %rax,%r13
7058 movq %rdx,%r14
7059 movq 0+0(%rbp),%rax
7060 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007061 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007062 addq %rax,%r14
7063 adcq %rdx,%r15
7064 movq 8+0(%rbp),%rax
7065 movq %rax,%r9
7066 mulq %r10
7067 addq %rax,%r14
7068 adcq $0,%rdx
7069 movq %rdx,%r10
7070 movq 8+0(%rbp),%rax
7071 mulq %r11
7072 addq %rax,%r15
7073 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007074 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007075 addq %r10,%r15
7076 adcq %rdx,%r9
7077 movq %r13,%r10
7078 movq %r14,%r11
7079 movq %r15,%r12
7080 andq $3,%r12
7081 movq %r15,%r13
7082 andq $-4,%r13
7083 movq %r9,%r14
7084 shrdq $2,%r9,%r15
7085 shrq $2,%r9
7086 addq %r13,%r10
7087 adcq %r14,%r11
7088 adcq $0,%r12
7089 addq %r15,%r10
7090 adcq %r9,%r11
7091 adcq $0,%r12
7092 addq 16(%rdi),%r10
7093 adcq 8+16(%rdi),%r11
7094 adcq $1,%r12
7095 movq 0+0(%rbp),%rax
7096 movq %rax,%r15
7097 mulq %r10
7098 movq %rax,%r13
7099 movq %rdx,%r14
7100 movq 0+0(%rbp),%rax
7101 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007102 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007103 addq %rax,%r14
7104 adcq %rdx,%r15
7105 movq 8+0(%rbp),%rax
7106 movq %rax,%r9
7107 mulq %r10
7108 addq %rax,%r14
7109 adcq $0,%rdx
7110 movq %rdx,%r10
7111 movq 8+0(%rbp),%rax
7112 mulq %r11
7113 addq %rax,%r15
7114 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007115 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007116 addq %r10,%r15
7117 adcq %rdx,%r9
7118 movq %r13,%r10
7119 movq %r14,%r11
7120 movq %r15,%r12
7121 andq $3,%r12
7122 movq %r15,%r13
7123 andq $-4,%r13
7124 movq %r9,%r14
7125 shrdq $2,%r9,%r15
7126 shrq $2,%r9
7127 addq %r13,%r10
7128 adcq %r14,%r11
7129 adcq $0,%r12
7130 addq %r15,%r10
7131 adcq %r9,%r11
7132 adcq $0,%r12
7133
7134 leaq 32(%rdi),%rdi
7135 movq $10,%rcx
7136 xorq %r8,%r8
7137 cmpq $128,%rbx
7138 ja 3f
7139
/*
 * seal_avx2_tail_128
 *
 * Runs one ymm-wide ChaCha20 state (two 128-bit lanes, rows held in
 * ymm0/ymm4/ymm8/ymm12) through its rounds while absorbing 16-byte blocks
 * read from (%rdi) into the accumulator r10:r11:r12, then jumps to
 * seal_avx2_short_loop with the result in ymm0, ymm4, ymm8, ymm12.
 *
 * Inputs (as used below):
 *   rbp        base of the scratch frame: 0/8(%rbp) = 128-bit multiplier
 *              for the hash, 64(%rbp) and 96(%rbp) = ChaCha rows 1 and 2,
 *              160(%rbp) = current counter row.
 *              NOTE(review): presumably 0(%rbp) is the clamped Poly1305 r
 *              and 64/96(%rbp) are the key rows - set up outside this block.
 *   rdi        pointer to the next 16-byte block to hash (advanced here).
 *   r10:r11:r12  hash accumulator (limbs at 2^0, 2^64, 2^128).
 *   rcx        number of loop passes entered at "1:" (3 hashed blocks each).
 *   r8         number of further passes entered at "2:" (2 hashed blocks each).
 *              The two entry paths earlier in this routine use rcx=8,r8=2
 *              and rcx=10,r8=0, i.e. rcx + r8 = 10 double rounds in total.
 *
 * Scratch: rax, rdx, r9, r13, r14, r15, ymm3, flags.
 */
7140seal_avx2_tail_128:
/* Load a fresh state: constants, rows 1-2 from the frame, and the counter
   row advanced by .avx2_inc; the new counter is stored back so the final
   add-back below uses the same value. */
7141 vmovdqa .chacha20_consts(%rip),%ymm0
7142 vmovdqa 64(%rbp),%ymm4
7143 vmovdqa 96(%rbp),%ymm8
7144 vmovdqa .avx2_inc(%rip),%ymm12
7145 vpaddd 160(%rbp),%ymm12,%ymm12
7146 vmovdqa %ymm12,160(%rbp)
7147
/* "1:" entry - hash one extra 16-byte block before the double round. */
71481:
/* acc += block at 0(%rdi); adcq $1 also adds the 2^128 padding bit. */
7149 addq 0(%rdi),%r10
7150 adcq 8+0(%rdi),%r11
7151 adcq $1,%r12
/* acc *= multiplier: schoolbook product of (r10,r11,r12) with the two
   64-bit words at 0(%rbp) and 8(%rbp); result in r13:r14:r15:r9. */
7152 movq 0+0(%rbp),%rax
7153 movq %rax,%r15
7154 mulq %r10
7155 movq %rax,%r13
7156 movq %rdx,%r14
7157 movq 0+0(%rbp),%rax
7158 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007159 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007160 addq %rax,%r14
7161 adcq %rdx,%r15
7162 movq 8+0(%rbp),%rax
7163 movq %rax,%r9
7164 mulq %r10
7165 addq %rax,%r14
7166 adcq $0,%rdx
7167 movq %rdx,%r10
7168 movq 8+0(%rbp),%rax
7169 mulq %r11
7170 addq %rax,%r15
7171 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007172 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007173 addq %r10,%r15
7174 adcq %rdx,%r9
/* Reduce: keep the low 130 bits in r10:r11:r12 and fold the bits above
   2^130 (call them c) back in as 5*c, computed as 4*c (r15 & -4, r9)
   plus c (r9:r15 shifted right by 2). */
7175 movq %r13,%r10
7176 movq %r14,%r11
7177 movq %r15,%r12
7178 andq $3,%r12
7179 movq %r15,%r13
7180 andq $-4,%r13
7181 movq %r9,%r14
7182 shrdq $2,%r9,%r15
7183 shrq $2,%r9
7184 addq %r13,%r10
7185 adcq %r14,%r11
7186 adcq $0,%r12
7187 addq %r15,%r10
7188 adcq %r9,%r11
7189 adcq $0,%r12
7190
7191 leaq 16(%rdi),%rdi
/* "2:" entry - one ChaCha double round interleaved with two hashed blocks. */
71922:
/* Column round, with a=ymm0, b=ymm4, c=ymm8, d=ymm12 and ymm3 as scratch:
   a+=b; d^=a; d<<<=16;  c+=d; b^=c; b<<<=12;
   a+=b; d^=a; d<<<=8;   c+=d; b^=c; b<<<=7. */
7193 vpaddd %ymm4,%ymm0,%ymm0
7194 vpxor %ymm0,%ymm12,%ymm12
7195 vpshufb .rol16(%rip),%ymm12,%ymm12
7196 vpaddd %ymm12,%ymm8,%ymm8
7197 vpxor %ymm8,%ymm4,%ymm4
7198 vpsrld $20,%ymm4,%ymm3
7199 vpslld $12,%ymm4,%ymm4
7200 vpxor %ymm3,%ymm4,%ymm4
7201 vpaddd %ymm4,%ymm0,%ymm0
7202 vpxor %ymm0,%ymm12,%ymm12
7203 vpshufb .rol8(%rip),%ymm12,%ymm12
7204 vpaddd %ymm12,%ymm8,%ymm8
7205 vpxor %ymm8,%ymm4,%ymm4
7206 vpslld $7,%ymm4,%ymm3
7207 vpsrld $25,%ymm4,%ymm4
7208 vpxor %ymm3,%ymm4,%ymm4
/* Rotate rows b, c, d by 1, 2, 3 dwords within each lane so the next
   quarter rounds operate on the diagonals. */
7209 vpalignr $12,%ymm12,%ymm12,%ymm12
7210 vpalignr $8,%ymm8,%ymm8,%ymm8
7211 vpalignr $4,%ymm4,%ymm4,%ymm4
/* Hash the block at 0(%rdi): add (with 2^128 bit), multiply, reduce -
   same sequence as the block hashed at "1:". */
7212 addq 0(%rdi),%r10
7213 adcq 8+0(%rdi),%r11
7214 adcq $1,%r12
7215 movq 0+0(%rbp),%rax
7216 movq %rax,%r15
7217 mulq %r10
7218 movq %rax,%r13
7219 movq %rdx,%r14
7220 movq 0+0(%rbp),%rax
7221 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007222 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007223 addq %rax,%r14
7224 adcq %rdx,%r15
7225 movq 8+0(%rbp),%rax
7226 movq %rax,%r9
7227 mulq %r10
7228 addq %rax,%r14
7229 adcq $0,%rdx
7230 movq %rdx,%r10
7231 movq 8+0(%rbp),%rax
7232 mulq %r11
7233 addq %rax,%r15
7234 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007235 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007236 addq %r10,%r15
7237 adcq %rdx,%r9
7238 movq %r13,%r10
7239 movq %r14,%r11
7240 movq %r15,%r12
7241 andq $3,%r12
7242 movq %r15,%r13
7243 andq $-4,%r13
7244 movq %r9,%r14
7245 shrdq $2,%r9,%r15
7246 shrq $2,%r9
7247 addq %r13,%r10
7248 adcq %r14,%r11
7249 adcq $0,%r12
7250 addq %r15,%r10
7251 adcq %r9,%r11
7252 adcq $0,%r12
/* Diagonal round: the same quarter-round sequence on the rotated rows. */
7253 vpaddd %ymm4,%ymm0,%ymm0
7254 vpxor %ymm0,%ymm12,%ymm12
7255 vpshufb .rol16(%rip),%ymm12,%ymm12
7256 vpaddd %ymm12,%ymm8,%ymm8
7257 vpxor %ymm8,%ymm4,%ymm4
7258 vpsrld $20,%ymm4,%ymm3
7259 vpslld $12,%ymm4,%ymm4
7260 vpxor %ymm3,%ymm4,%ymm4
7261 vpaddd %ymm4,%ymm0,%ymm0
7262 vpxor %ymm0,%ymm12,%ymm12
7263 vpshufb .rol8(%rip),%ymm12,%ymm12
7264 vpaddd %ymm12,%ymm8,%ymm8
7265 vpxor %ymm8,%ymm4,%ymm4
7266 vpslld $7,%ymm4,%ymm3
7267 vpsrld $25,%ymm4,%ymm4
7268 vpxor %ymm3,%ymm4,%ymm4
/* Undo the row rotation (d by 1, c by 2, b by 3 dwords) to return to
   column order. */
7269 vpalignr $4,%ymm12,%ymm12,%ymm12
7270 vpalignr $8,%ymm8,%ymm8,%ymm8
7271 vpalignr $12,%ymm4,%ymm4,%ymm4
/* Hash the block at 16(%rdi): add (with 2^128 bit), multiply, reduce. */
7272 addq 16(%rdi),%r10
7273 adcq 8+16(%rdi),%r11
7274 adcq $1,%r12
7275 movq 0+0(%rbp),%rax
7276 movq %rax,%r15
7277 mulq %r10
7278 movq %rax,%r13
7279 movq %rdx,%r14
7280 movq 0+0(%rbp),%rax
7281 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007282 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007283 addq %rax,%r14
7284 adcq %rdx,%r15
7285 movq 8+0(%rbp),%rax
7286 movq %rax,%r9
7287 mulq %r10
7288 addq %rax,%r14
7289 adcq $0,%rdx
7290 movq %rdx,%r10
7291 movq 8+0(%rbp),%rax
7292 mulq %r11
7293 addq %rax,%r15
7294 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007295 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007296 addq %r10,%r15
7297 adcq %rdx,%r9
7298 movq %r13,%r10
7299 movq %r14,%r11
7300 movq %r15,%r12
7301 andq $3,%r12
7302 movq %r15,%r13
7303 andq $-4,%r13
7304 movq %r9,%r14
7305 shrdq $2,%r9,%r15
7306 shrq $2,%r9
7307 addq %r13,%r10
7308 adcq %r14,%r11
7309 adcq $0,%r12
7310 addq %r15,%r10
7311 adcq %r9,%r11
7312 adcq $0,%r12
7313
/* Step past the two blocks hashed in this pass, then loop: while rcx is
   still positive after the decrement re-enter at "1:" (extra block);
   afterwards re-enter at "2:" until r8 goes negative. The comparisons are
   signed on purpose - rcx keeps being decremented below zero during the
   r8-controlled passes. */
7314 leaq 32(%rdi),%rdi
7315 decq %rcx
7316 jg 1b
7317 decq %r8
7318 jge 2b
/* Add the initial state back (constants, rows 1-2, the counter row stored
   at 160(%rbp) above). */
7319 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7320 vpaddd 64(%rbp),%ymm4,%ymm4
7321 vpaddd 96(%rbp),%ymm8,%ymm8
7322 vpaddd 160(%rbp),%ymm12,%ymm12
/* Regroup 128-bit lanes: afterwards ymm0 = rows 0-1 and ymm4 = rows 2-3 of
   the low-lane block, ymm8 = rows 0-1 and ymm12 = rows 2-3 of the
   high-lane block (ymm3 is a temporary for the future ymm8). */
7323 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7324 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7325 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7326 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7327 vmovdqa %ymm3,%ymm8
7328
/* Hand the four result registers to seal_avx2_short_loop (defined
   elsewhere in the file). */
7329 jmp seal_avx2_short_loop
# Size dispatch (local label 3): when %rbx is above 256 (unsigned compare)
# control moves on to the next local label 3; otherwise execution falls
# through into seal_avx2_tail_256.
73303:
7331 cmpq $256,%rbx
7332 ja 3f
7333
# seal_avx2_tail_256
# Runs two ChaCha20 states side by side (%ymm0/%ymm4/%ymm8/%ymm12 and
# %ymm1/%ymm5/%ymm9/%ymm13; each register holds one state row in each of its
# two 128-bit lanes) while absorbing 16-byte blocks read from (%rdi) into the
# Poly1305 accumulator %r10:%r11:%r12. Afterwards 128 bytes from (%rsi) are
# XORed with keystream and stored at (%rdi); the rest of the keystream stays
# in %ymm0, %ymm4, %ymm8, %ymm12 and control jumps to seal_avx2_hash.
# The loop counters %rcx and %r8 are set by code outside this block.
7334seal_avx2_tail_256:
# Both states start from the constants and the rows kept at 64(%rbp) and 96(%rbp).
7335 vmovdqa .chacha20_consts(%rip),%ymm0
7336 vmovdqa 64(%rbp),%ymm4
7337 vmovdqa 96(%rbp),%ymm8
7338 vmovdqa %ymm0,%ymm1
7339 vmovdqa %ymm4,%ymm5
7340 vmovdqa %ymm8,%ymm9
# Counter rows: %ymm13 = 160(%rbp) + .avx2_inc, %ymm12 = %ymm13 + .avx2_inc.
# Both are written back (160/192(%rbp)) so they can be re-added after the rounds.
7341 vmovdqa .avx2_inc(%rip),%ymm12
7342 vpaddd 160(%rbp),%ymm12,%ymm13
7343 vpaddd %ymm13,%ymm12,%ymm12
7344 vmovdqa %ymm12,160(%rbp)
7345 vmovdqa %ymm13,192(%rbp)
7346
# Entry 1: hash one extra 16-byte block before the rounds, then step %rdi by 16.
73471:
# Absorb: accumulator += 16 bytes at 0(%rdi), plus 1 in the third limb (bit 128).
7348 addq 0(%rdi),%r10
7349 adcq 8+0(%rdi),%r11
7350 adcq $1,%r12
# Multiply the accumulator by the 128-bit value at 0(%rbp)/8(%rbp).
# Product limbs, low to high: %r13, %r14, %r15, %r9.
7351 movq 0+0(%rbp),%rax
7352 movq %rax,%r15
7353 mulq %r10
7354 movq %rax,%r13
7355 movq %rdx,%r14
7356 movq 0+0(%rbp),%rax
7357 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007358 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007359 addq %rax,%r14
7360 adcq %rdx,%r15
7361 movq 8+0(%rbp),%rax
7362 movq %rax,%r9
7363 mulq %r10
7364 addq %rax,%r14
7365 adcq $0,%rdx
7366 movq %rdx,%r10
7367 movq 8+0(%rbp),%rax
7368 mulq %r11
7369 addq %rax,%r15
7370 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007371 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007372 addq %r10,%r15
7373 adcq %rdx,%r9
# Reduce modulo 2^130-5: keep the low 130 bits, then add the part above
# bit 130 five times, as 4*c (the andq $-4 mask) plus c (the shift by 2).
7374 movq %r13,%r10
7375 movq %r14,%r11
7376 movq %r15,%r12
7377 andq $3,%r12
7378 movq %r15,%r13
7379 andq $-4,%r13
7380 movq %r9,%r14
7381 shrdq $2,%r9,%r15
7382 shrq $2,%r9
7383 addq %r13,%r10
7384 adcq %r14,%r11
7385 adcq $0,%r12
7386 addq %r15,%r10
7387 adcq %r9,%r11
7388 adcq $0,%r12
7389
7390 leaq 16(%rdi),%rdi
# Entry 2: one ChaCha20 double round over both states, interleaved with two
# more hashed blocks (0(%rdi) and 16(%rdi)).
73912:
# Quarter-round on state 0; %ymm3 is scratch for the shift-based rotates
# (by 12 and by 7), the rotates by 16 and 8 use the .rol16/.rol8 byte shuffles.
7392 vpaddd %ymm4,%ymm0,%ymm0
7393 vpxor %ymm0,%ymm12,%ymm12
7394 vpshufb .rol16(%rip),%ymm12,%ymm12
7395 vpaddd %ymm12,%ymm8,%ymm8
7396 vpxor %ymm8,%ymm4,%ymm4
7397 vpsrld $20,%ymm4,%ymm3
7398 vpslld $12,%ymm4,%ymm4
7399 vpxor %ymm3,%ymm4,%ymm4
7400 vpaddd %ymm4,%ymm0,%ymm0
7401 vpxor %ymm0,%ymm12,%ymm12
7402 vpshufb .rol8(%rip),%ymm12,%ymm12
7403 vpaddd %ymm12,%ymm8,%ymm8
7404 vpxor %ymm8,%ymm4,%ymm4
7405 vpslld $7,%ymm4,%ymm3
7406 vpsrld $25,%ymm4,%ymm4
7407 vpxor %ymm3,%ymm4,%ymm4
# Rotate the three lower rows by 12, 8 and 4 bytes within each lane so the
# next quarter-round works on the diagonals.
7408 vpalignr $12,%ymm12,%ymm12,%ymm12
7409 vpalignr $8,%ymm8,%ymm8,%ymm8
7410 vpalignr $4,%ymm4,%ymm4,%ymm4
# Same quarter-round and row rotation for state 1.
7411 vpaddd %ymm5,%ymm1,%ymm1
7412 vpxor %ymm1,%ymm13,%ymm13
7413 vpshufb .rol16(%rip),%ymm13,%ymm13
7414 vpaddd %ymm13,%ymm9,%ymm9
7415 vpxor %ymm9,%ymm5,%ymm5
7416 vpsrld $20,%ymm5,%ymm3
7417 vpslld $12,%ymm5,%ymm5
7418 vpxor %ymm3,%ymm5,%ymm5
7419 vpaddd %ymm5,%ymm1,%ymm1
7420 vpxor %ymm1,%ymm13,%ymm13
7421 vpshufb .rol8(%rip),%ymm13,%ymm13
7422 vpaddd %ymm13,%ymm9,%ymm9
7423 vpxor %ymm9,%ymm5,%ymm5
7424 vpslld $7,%ymm5,%ymm3
7425 vpsrld $25,%ymm5,%ymm5
7426 vpxor %ymm3,%ymm5,%ymm5
7427 vpalignr $12,%ymm13,%ymm13,%ymm13
7428 vpalignr $8,%ymm9,%ymm9,%ymm9
7429 vpalignr $4,%ymm5,%ymm5,%ymm5
# Hash the block at 0(%rdi): absorb, multiply, reduce (same sequence as above).
7430 addq 0(%rdi),%r10
7431 adcq 8+0(%rdi),%r11
7432 adcq $1,%r12
7433 movq 0+0(%rbp),%rax
7434 movq %rax,%r15
7435 mulq %r10
7436 movq %rax,%r13
7437 movq %rdx,%r14
7438 movq 0+0(%rbp),%rax
7439 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007440 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007441 addq %rax,%r14
7442 adcq %rdx,%r15
7443 movq 8+0(%rbp),%rax
7444 movq %rax,%r9
7445 mulq %r10
7446 addq %rax,%r14
7447 adcq $0,%rdx
7448 movq %rdx,%r10
7449 movq 8+0(%rbp),%rax
7450 mulq %r11
7451 addq %rax,%r15
7452 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007453 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007454 addq %r10,%r15
7455 adcq %rdx,%r9
7456 movq %r13,%r10
7457 movq %r14,%r11
7458 movq %r15,%r12
7459 andq $3,%r12
7460 movq %r15,%r13
7461 andq $-4,%r13
7462 movq %r9,%r14
7463 shrdq $2,%r9,%r15
7464 shrq $2,%r9
7465 addq %r13,%r10
7466 adcq %r14,%r11
7467 adcq $0,%r12
7468 addq %r15,%r10
7469 adcq %r9,%r11
7470 adcq $0,%r12
# Second quarter-round on state 0 (diagonals), then rotate the rows back
# (4, 8 and 12 bytes undo the earlier 12, 8 and 4).
7471 vpaddd %ymm4,%ymm0,%ymm0
7472 vpxor %ymm0,%ymm12,%ymm12
7473 vpshufb .rol16(%rip),%ymm12,%ymm12
7474 vpaddd %ymm12,%ymm8,%ymm8
7475 vpxor %ymm8,%ymm4,%ymm4
7476 vpsrld $20,%ymm4,%ymm3
7477 vpslld $12,%ymm4,%ymm4
7478 vpxor %ymm3,%ymm4,%ymm4
7479 vpaddd %ymm4,%ymm0,%ymm0
7480 vpxor %ymm0,%ymm12,%ymm12
7481 vpshufb .rol8(%rip),%ymm12,%ymm12
7482 vpaddd %ymm12,%ymm8,%ymm8
7483 vpxor %ymm8,%ymm4,%ymm4
7484 vpslld $7,%ymm4,%ymm3
7485 vpsrld $25,%ymm4,%ymm4
7486 vpxor %ymm3,%ymm4,%ymm4
7487 vpalignr $4,%ymm12,%ymm12,%ymm12
7488 vpalignr $8,%ymm8,%ymm8,%ymm8
7489 vpalignr $12,%ymm4,%ymm4,%ymm4
# Second quarter-round and un-rotation for state 1.
7490 vpaddd %ymm5,%ymm1,%ymm1
7491 vpxor %ymm1,%ymm13,%ymm13
7492 vpshufb .rol16(%rip),%ymm13,%ymm13
7493 vpaddd %ymm13,%ymm9,%ymm9
7494 vpxor %ymm9,%ymm5,%ymm5
7495 vpsrld $20,%ymm5,%ymm3
7496 vpslld $12,%ymm5,%ymm5
7497 vpxor %ymm3,%ymm5,%ymm5
7498 vpaddd %ymm5,%ymm1,%ymm1
7499 vpxor %ymm1,%ymm13,%ymm13
7500 vpshufb .rol8(%rip),%ymm13,%ymm13
7501 vpaddd %ymm13,%ymm9,%ymm9
7502 vpxor %ymm9,%ymm5,%ymm5
7503 vpslld $7,%ymm5,%ymm3
7504 vpsrld $25,%ymm5,%ymm5
7505 vpxor %ymm3,%ymm5,%ymm5
7506 vpalignr $4,%ymm13,%ymm13,%ymm13
7507 vpalignr $8,%ymm9,%ymm9,%ymm9
7508 vpalignr $12,%ymm5,%ymm5,%ymm5
# Hash the block at 16(%rdi): absorb, multiply, reduce.
7509 addq 16(%rdi),%r10
7510 adcq 8+16(%rdi),%r11
7511 adcq $1,%r12
7512 movq 0+0(%rbp),%rax
7513 movq %rax,%r15
7514 mulq %r10
7515 movq %rax,%r13
7516 movq %rdx,%r14
7517 movq 0+0(%rbp),%rax
7518 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007519 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007520 addq %rax,%r14
7521 adcq %rdx,%r15
7522 movq 8+0(%rbp),%rax
7523 movq %rax,%r9
7524 mulq %r10
7525 addq %rax,%r14
7526 adcq $0,%rdx
7527 movq %rdx,%r10
7528 movq 8+0(%rbp),%rax
7529 mulq %r11
7530 addq %rax,%r15
7531 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007532 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007533 addq %r10,%r15
7534 adcq %rdx,%r9
7535 movq %r13,%r10
7536 movq %r14,%r11
7537 movq %r15,%r12
7538 andq $3,%r12
7539 movq %r15,%r13
7540 andq $-4,%r13
7541 movq %r9,%r14
7542 shrdq $2,%r9,%r15
7543 shrq $2,%r9
7544 addq %r13,%r10
7545 adcq %r14,%r11
7546 adcq $0,%r12
7547 addq %r15,%r10
7548 adcq %r9,%r11
7549 adcq $0,%r12
7550
# Step past the two blocks just hashed. leaq leaves the flags alone, so the
# branches below test the decq results: back to entry 1 while %rcx stays above
# zero, then back to entry 2 while %r8 stays non-negative (signed tests).
7551 leaq 32(%rdi),%rdi
7552 decq %rcx
7553 jg 1b
7554 decq %r8
7555 jge 2b
# Feed-forward: add the starting rows and saved counters back into both states.
7556 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
7557 vpaddd 64(%rbp),%ymm5,%ymm5
7558 vpaddd 96(%rbp),%ymm9,%ymm9
7559 vpaddd 192(%rbp),%ymm13,%ymm13
7560 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7561 vpaddd 64(%rbp),%ymm4,%ymm4
7562 vpaddd 96(%rbp),%ymm8,%ymm8
7563 vpaddd 160(%rbp),%ymm12,%ymm12
# Regroup state 1 into consecutive 32-byte keystream chunks: vperm2i128 $0x02
# pairs the low 128-bit lanes of its two sources, $0x13 pairs the high lanes.
7564 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
7565 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
7566 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
7567 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
# XOR 128 bytes from (%rsi) with that keystream and store them at (%rdi).
7568 vpxor 0+0(%rsi),%ymm3,%ymm3
7569 vpxor 32+0(%rsi),%ymm1,%ymm1
7570 vpxor 64+0(%rsi),%ymm5,%ymm5
7571 vpxor 96+0(%rsi),%ymm9,%ymm9
7572 vmovdqu %ymm3,0+0(%rdi)
7573 vmovdqu %ymm1,32+0(%rdi)
7574 vmovdqu %ymm5,64+0(%rdi)
7575 vmovdqu %ymm9,96+0(%rdi)
# Regroup state 0 the same way but keep it in registers, in the order
# %ymm0, %ymm4, %ymm8, %ymm12, for the code that follows the jump.
7576 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7577 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7578 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7579 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7580 vmovdqa %ymm3,%ymm8
7581
# Account for the 128 bytes just produced: %rcx = 128, %rsi += 128, %rbx -= 128.
# NOTE(review): %rcx is presumably the count of stored bytes that
# seal_avx2_hash still has to absorb - confirm at that label.
7582 movq $128,%rcx
7583 leaq 128(%rsi),%rsi
7584 subq $128,%rbx
7585 jmp seal_avx2_hash
# Size dispatch (local label 3): when %rbx is above 384 (unsigned compare)
# jump to seal_avx2_tail_512; otherwise fall through into seal_avx2_tail_384.
75863:
7587 cmpq $384,%rbx
7588 ja seal_avx2_tail_512
7589
# seal_avx2_tail_384
# Runs three ChaCha20 states side by side (%ymm0/4/8/12, %ymm1/5/9/13 and
# %ymm2/6/10/14; one state row per register, two 128-bit lanes each) while
# absorbing 16-byte blocks read from (%rdi) into the Poly1305 accumulator
# %r10:%r11:%r12. Afterwards 256 bytes from (%rsi) are XORed with keystream
# and stored at (%rdi); the remaining keystream stays in %ymm0, %ymm4, %ymm8,
# %ymm12 and control jumps to seal_avx2_hash.
# The loop counters %rcx and %r8 are set by code outside this block.
7590seal_avx2_tail_384:
# All three states start from the constants and the rows at 64(%rbp), 96(%rbp).
7591 vmovdqa .chacha20_consts(%rip),%ymm0
7592 vmovdqa 64(%rbp),%ymm4
7593 vmovdqa 96(%rbp),%ymm8
7594 vmovdqa %ymm0,%ymm1
7595 vmovdqa %ymm4,%ymm5
7596 vmovdqa %ymm8,%ymm9
7597 vmovdqa %ymm0,%ymm2
7598 vmovdqa %ymm4,%ymm6
7599 vmovdqa %ymm8,%ymm10
# Counter rows step by .avx2_inc from 160(%rbp): %ymm14, then %ymm13, then
# %ymm12. All three are saved for the feed-forward add after the rounds.
7600 vmovdqa .avx2_inc(%rip),%ymm12
7601 vpaddd 160(%rbp),%ymm12,%ymm14
7602 vpaddd %ymm14,%ymm12,%ymm13
7603 vpaddd %ymm13,%ymm12,%ymm12
7604 vmovdqa %ymm12,160(%rbp)
7605 vmovdqa %ymm13,192(%rbp)
7606 vmovdqa %ymm14,224(%rbp)
7607
# Entry 1: hash one extra 16-byte block before the rounds, then step %rdi by 16.
76081:
# Absorb: accumulator += 16 bytes at 0(%rdi), plus 1 in the third limb (bit 128).
7609 addq 0(%rdi),%r10
7610 adcq 8+0(%rdi),%r11
7611 adcq $1,%r12
# Multiply the accumulator by the 128-bit value at 0(%rbp)/8(%rbp).
# Product limbs, low to high: %r13, %r14, %r15, %r9.
7612 movq 0+0(%rbp),%rax
7613 movq %rax,%r15
7614 mulq %r10
7615 movq %rax,%r13
7616 movq %rdx,%r14
7617 movq 0+0(%rbp),%rax
7618 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007619 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007620 addq %rax,%r14
7621 adcq %rdx,%r15
7622 movq 8+0(%rbp),%rax
7623 movq %rax,%r9
7624 mulq %r10
7625 addq %rax,%r14
7626 adcq $0,%rdx
7627 movq %rdx,%r10
7628 movq 8+0(%rbp),%rax
7629 mulq %r11
7630 addq %rax,%r15
7631 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007632 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007633 addq %r10,%r15
7634 adcq %rdx,%r9
# Reduce modulo 2^130-5: keep the low 130 bits, then add the part above
# bit 130 five times, as 4*c (the andq $-4 mask) plus c (the shift by 2).
7635 movq %r13,%r10
7636 movq %r14,%r11
7637 movq %r15,%r12
7638 andq $3,%r12
7639 movq %r15,%r13
7640 andq $-4,%r13
7641 movq %r9,%r14
7642 shrdq $2,%r9,%r15
7643 shrq $2,%r9
7644 addq %r13,%r10
7645 adcq %r14,%r11
7646 adcq $0,%r12
7647 addq %r15,%r10
7648 adcq %r9,%r11
7649 adcq $0,%r12
7650
7651 leaq 16(%rdi),%rdi
# Entry 2: one ChaCha20 double round over the three states, interleaved with
# two more hashed blocks (0(%rdi) and 16(%rdi)).
76522:
# First quarter-round on state 0; %ymm3 is scratch for the shift-based rotates.
7653 vpaddd %ymm4,%ymm0,%ymm0
7654 vpxor %ymm0,%ymm12,%ymm12
7655 vpshufb .rol16(%rip),%ymm12,%ymm12
7656 vpaddd %ymm12,%ymm8,%ymm8
7657 vpxor %ymm8,%ymm4,%ymm4
7658 vpsrld $20,%ymm4,%ymm3
7659 vpslld $12,%ymm4,%ymm4
7660 vpxor %ymm3,%ymm4,%ymm4
7661 vpaddd %ymm4,%ymm0,%ymm0
7662 vpxor %ymm0,%ymm12,%ymm12
7663 vpshufb .rol8(%rip),%ymm12,%ymm12
7664 vpaddd %ymm12,%ymm8,%ymm8
7665 vpxor %ymm8,%ymm4,%ymm4
7666 vpslld $7,%ymm4,%ymm3
7667 vpsrld $25,%ymm4,%ymm4
7668 vpxor %ymm3,%ymm4,%ymm4
# Rotate the three lower rows by 12, 8 and 4 bytes within each lane so the
# next quarter-round works on the diagonals.
7669 vpalignr $12,%ymm12,%ymm12,%ymm12
7670 vpalignr $8,%ymm8,%ymm8,%ymm8
7671 vpalignr $4,%ymm4,%ymm4,%ymm4
# First quarter-round and row rotation for state 1.
7672 vpaddd %ymm5,%ymm1,%ymm1
7673 vpxor %ymm1,%ymm13,%ymm13
7674 vpshufb .rol16(%rip),%ymm13,%ymm13
7675 vpaddd %ymm13,%ymm9,%ymm9
7676 vpxor %ymm9,%ymm5,%ymm5
7677 vpsrld $20,%ymm5,%ymm3
7678 vpslld $12,%ymm5,%ymm5
7679 vpxor %ymm3,%ymm5,%ymm5
7680 vpaddd %ymm5,%ymm1,%ymm1
7681 vpxor %ymm1,%ymm13,%ymm13
7682 vpshufb .rol8(%rip),%ymm13,%ymm13
7683 vpaddd %ymm13,%ymm9,%ymm9
7684 vpxor %ymm9,%ymm5,%ymm5
7685 vpslld $7,%ymm5,%ymm3
7686 vpsrld $25,%ymm5,%ymm5
7687 vpxor %ymm3,%ymm5,%ymm5
7688 vpalignr $12,%ymm13,%ymm13,%ymm13
7689 vpalignr $8,%ymm9,%ymm9,%ymm9
7690 vpalignr $4,%ymm5,%ymm5,%ymm5
# Hash the block at 0(%rdi): absorb, multiply, reduce (same sequence as above).
7691 addq 0(%rdi),%r10
7692 adcq 8+0(%rdi),%r11
7693 adcq $1,%r12
7694 movq 0+0(%rbp),%rax
7695 movq %rax,%r15
7696 mulq %r10
7697 movq %rax,%r13
7698 movq %rdx,%r14
7699 movq 0+0(%rbp),%rax
7700 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007701 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007702 addq %rax,%r14
7703 adcq %rdx,%r15
7704 movq 8+0(%rbp),%rax
7705 movq %rax,%r9
7706 mulq %r10
7707 addq %rax,%r14
7708 adcq $0,%rdx
7709 movq %rdx,%r10
7710 movq 8+0(%rbp),%rax
7711 mulq %r11
7712 addq %rax,%r15
7713 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007714 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007715 addq %r10,%r15
7716 adcq %rdx,%r9
7717 movq %r13,%r10
7718 movq %r14,%r11
7719 movq %r15,%r12
7720 andq $3,%r12
7721 movq %r15,%r13
7722 andq $-4,%r13
7723 movq %r9,%r14
7724 shrdq $2,%r9,%r15
7725 shrq $2,%r9
7726 addq %r13,%r10
7727 adcq %r14,%r11
7728 adcq $0,%r12
7729 addq %r15,%r10
7730 adcq %r9,%r11
7731 adcq $0,%r12
# First quarter-round and row rotation for state 2.
7732 vpaddd %ymm6,%ymm2,%ymm2
7733 vpxor %ymm2,%ymm14,%ymm14
7734 vpshufb .rol16(%rip),%ymm14,%ymm14
7735 vpaddd %ymm14,%ymm10,%ymm10
7736 vpxor %ymm10,%ymm6,%ymm6
7737 vpsrld $20,%ymm6,%ymm3
7738 vpslld $12,%ymm6,%ymm6
7739 vpxor %ymm3,%ymm6,%ymm6
7740 vpaddd %ymm6,%ymm2,%ymm2
7741 vpxor %ymm2,%ymm14,%ymm14
7742 vpshufb .rol8(%rip),%ymm14,%ymm14
7743 vpaddd %ymm14,%ymm10,%ymm10
7744 vpxor %ymm10,%ymm6,%ymm6
7745 vpslld $7,%ymm6,%ymm3
7746 vpsrld $25,%ymm6,%ymm6
7747 vpxor %ymm3,%ymm6,%ymm6
7748 vpalignr $12,%ymm14,%ymm14,%ymm14
7749 vpalignr $8,%ymm10,%ymm10,%ymm10
7750 vpalignr $4,%ymm6,%ymm6,%ymm6
# Second quarter-round on state 0 (diagonals), then rotate the rows back
# (4, 8 and 12 bytes undo the earlier 12, 8 and 4).
7751 vpaddd %ymm4,%ymm0,%ymm0
7752 vpxor %ymm0,%ymm12,%ymm12
7753 vpshufb .rol16(%rip),%ymm12,%ymm12
7754 vpaddd %ymm12,%ymm8,%ymm8
7755 vpxor %ymm8,%ymm4,%ymm4
7756 vpsrld $20,%ymm4,%ymm3
7757 vpslld $12,%ymm4,%ymm4
7758 vpxor %ymm3,%ymm4,%ymm4
7759 vpaddd %ymm4,%ymm0,%ymm0
7760 vpxor %ymm0,%ymm12,%ymm12
7761 vpshufb .rol8(%rip),%ymm12,%ymm12
7762 vpaddd %ymm12,%ymm8,%ymm8
7763 vpxor %ymm8,%ymm4,%ymm4
7764 vpslld $7,%ymm4,%ymm3
7765 vpsrld $25,%ymm4,%ymm4
7766 vpxor %ymm3,%ymm4,%ymm4
7767 vpalignr $4,%ymm12,%ymm12,%ymm12
7768 vpalignr $8,%ymm8,%ymm8,%ymm8
7769 vpalignr $12,%ymm4,%ymm4,%ymm4
# Hash the block at 16(%rdi): absorb, multiply, reduce.
7770 addq 16(%rdi),%r10
7771 adcq 8+16(%rdi),%r11
7772 adcq $1,%r12
7773 movq 0+0(%rbp),%rax
7774 movq %rax,%r15
7775 mulq %r10
7776 movq %rax,%r13
7777 movq %rdx,%r14
7778 movq 0+0(%rbp),%rax
7779 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08007780 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007781 addq %rax,%r14
7782 adcq %rdx,%r15
7783 movq 8+0(%rbp),%rax
7784 movq %rax,%r9
7785 mulq %r10
7786 addq %rax,%r14
7787 adcq $0,%rdx
7788 movq %rdx,%r10
7789 movq 8+0(%rbp),%rax
7790 mulq %r11
7791 addq %rax,%r15
7792 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007793 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05007794 addq %r10,%r15
7795 adcq %rdx,%r9
7796 movq %r13,%r10
7797 movq %r14,%r11
7798 movq %r15,%r12
7799 andq $3,%r12
7800 movq %r15,%r13
7801 andq $-4,%r13
7802 movq %r9,%r14
7803 shrdq $2,%r9,%r15
7804 shrq $2,%r9
7805 addq %r13,%r10
7806 adcq %r14,%r11
7807 adcq $0,%r12
7808 addq %r15,%r10
7809 adcq %r9,%r11
7810 adcq $0,%r12
# Second quarter-round and un-rotation for state 1.
7811 vpaddd %ymm5,%ymm1,%ymm1
7812 vpxor %ymm1,%ymm13,%ymm13
7813 vpshufb .rol16(%rip),%ymm13,%ymm13
7814 vpaddd %ymm13,%ymm9,%ymm9
7815 vpxor %ymm9,%ymm5,%ymm5
7816 vpsrld $20,%ymm5,%ymm3
7817 vpslld $12,%ymm5,%ymm5
7818 vpxor %ymm3,%ymm5,%ymm5
7819 vpaddd %ymm5,%ymm1,%ymm1
7820 vpxor %ymm1,%ymm13,%ymm13
7821 vpshufb .rol8(%rip),%ymm13,%ymm13
7822 vpaddd %ymm13,%ymm9,%ymm9
7823 vpxor %ymm9,%ymm5,%ymm5
7824 vpslld $7,%ymm5,%ymm3
7825 vpsrld $25,%ymm5,%ymm5
7826 vpxor %ymm3,%ymm5,%ymm5
7827 vpalignr $4,%ymm13,%ymm13,%ymm13
7828 vpalignr $8,%ymm9,%ymm9,%ymm9
7829 vpalignr $12,%ymm5,%ymm5,%ymm5
# Second quarter-round and un-rotation for state 2.
7830 vpaddd %ymm6,%ymm2,%ymm2
7831 vpxor %ymm2,%ymm14,%ymm14
7832 vpshufb .rol16(%rip),%ymm14,%ymm14
7833 vpaddd %ymm14,%ymm10,%ymm10
7834 vpxor %ymm10,%ymm6,%ymm6
7835 vpsrld $20,%ymm6,%ymm3
7836 vpslld $12,%ymm6,%ymm6
7837 vpxor %ymm3,%ymm6,%ymm6
7838 vpaddd %ymm6,%ymm2,%ymm2
7839 vpxor %ymm2,%ymm14,%ymm14
7840 vpshufb .rol8(%rip),%ymm14,%ymm14
7841 vpaddd %ymm14,%ymm10,%ymm10
7842 vpxor %ymm10,%ymm6,%ymm6
7843 vpslld $7,%ymm6,%ymm3
7844 vpsrld $25,%ymm6,%ymm6
7845 vpxor %ymm3,%ymm6,%ymm6
7846 vpalignr $4,%ymm14,%ymm14,%ymm14
7847 vpalignr $8,%ymm10,%ymm10,%ymm10
7848 vpalignr $12,%ymm6,%ymm6,%ymm6
7849
# Step past the two blocks just hashed. leaq leaves the flags alone, so the
# branches below test the decq results: back to entry 1 while %rcx stays above
# zero, then back to entry 2 while %r8 stays non-negative (signed tests).
7850 leaq 32(%rdi),%rdi
7851 decq %rcx
7852 jg 1b
7853 decq %r8
7854 jge 2b
# Feed-forward: add the starting rows and saved counters back into all states.
7855 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
7856 vpaddd 64(%rbp),%ymm6,%ymm6
7857 vpaddd 96(%rbp),%ymm10,%ymm10
7858 vpaddd 224(%rbp),%ymm14,%ymm14
7859 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
7860 vpaddd 64(%rbp),%ymm5,%ymm5
7861 vpaddd 96(%rbp),%ymm9,%ymm9
7862 vpaddd 192(%rbp),%ymm13,%ymm13
7863 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
7864 vpaddd 64(%rbp),%ymm4,%ymm4
7865 vpaddd 96(%rbp),%ymm8,%ymm8
7866 vpaddd 160(%rbp),%ymm12,%ymm12
# Regroup state 2 into consecutive 32-byte keystream chunks (vperm2i128 $0x02
# pairs the low 128-bit lanes of its sources, $0x13 the high lanes), XOR with
# bytes 0..127 at (%rsi) and store at (%rdi).
7867 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
7868 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
7869 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
7870 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
7871 vpxor 0+0(%rsi),%ymm3,%ymm3
7872 vpxor 32+0(%rsi),%ymm2,%ymm2
7873 vpxor 64+0(%rsi),%ymm6,%ymm6
7874 vpxor 96+0(%rsi),%ymm10,%ymm10
7875 vmovdqu %ymm3,0+0(%rdi)
7876 vmovdqu %ymm2,32+0(%rdi)
7877 vmovdqu %ymm6,64+0(%rdi)
7878 vmovdqu %ymm10,96+0(%rdi)
# Same for state 1 and bytes 128..255.
7879 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
7880 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
7881 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
7882 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
7883 vpxor 0+128(%rsi),%ymm3,%ymm3
7884 vpxor 32+128(%rsi),%ymm1,%ymm1
7885 vpxor 64+128(%rsi),%ymm5,%ymm5
7886 vpxor 96+128(%rsi),%ymm9,%ymm9
7887 vmovdqu %ymm3,0+128(%rdi)
7888 vmovdqu %ymm1,32+128(%rdi)
7889 vmovdqu %ymm5,64+128(%rdi)
7890 vmovdqu %ymm9,96+128(%rdi)
# Regroup state 0 the same way but keep it in registers, in the order
# %ymm0, %ymm4, %ymm8, %ymm12, for the code that follows the jump.
7891 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
7892 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
7893 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
7894 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
7895 vmovdqa %ymm3,%ymm8
7896
# Account for the 256 bytes just produced: %rcx = 256, %rsi += 256, %rbx -= 256.
# NOTE(review): %rcx is presumably the count of stored bytes that
# seal_avx2_hash still has to absorb - confirm at that label.
7897 movq $256,%rcx
7898 leaq 256(%rsi),%rsi
7899 subq $256,%rbx
7900 jmp seal_avx2_hash
7901
# seal_avx2_tail_512
# Runs four ChaCha20 states side by side (%ymm0/4/8/12, %ymm1/5/9/13,
# %ymm2/6/10/14 and %ymm3/7/11/15; one state row per register, two 128-bit
# lanes each) while absorbing 16-byte blocks read from (%rdi) into the
# Poly1305 accumulator %r10:%r11:%r12. All sixteen %ymm registers are in use,
# so %ymm8 is spilled to 128(%rbp) whenever a scratch register is needed.
# This variant multiplies with mulx (implicit %rdx operand, flags untouched).
# Afterwards 384 bytes from (%rsi) are XORed with keystream and stored at
# (%rdi); the remaining keystream stays in %ymm0, %ymm4, %ymm8, %ymm12 and
# control jumps to seal_avx2_hash.
# The loop counters %rcx and %r8 are set by code outside this block.
7902seal_avx2_tail_512:
# All four states start from the constants and the rows at 64(%rbp), 96(%rbp).
7903 vmovdqa .chacha20_consts(%rip),%ymm0
7904 vmovdqa 64(%rbp),%ymm4
7905 vmovdqa 96(%rbp),%ymm8
7906 vmovdqa %ymm0,%ymm1
7907 vmovdqa %ymm4,%ymm5
7908 vmovdqa %ymm8,%ymm9
7909 vmovdqa %ymm0,%ymm2
7910 vmovdqa %ymm4,%ymm6
7911 vmovdqa %ymm8,%ymm10
7912 vmovdqa %ymm0,%ymm3
7913 vmovdqa %ymm4,%ymm7
7914 vmovdqa %ymm8,%ymm11
# Counter rows step by .avx2_inc from 160(%rbp): %ymm15, %ymm14, %ymm13,
# %ymm12. All four are saved for the feed-forward add after the rounds.
7915 vmovdqa .avx2_inc(%rip),%ymm12
7916 vpaddd 160(%rbp),%ymm12,%ymm15
7917 vpaddd %ymm15,%ymm12,%ymm14
7918 vpaddd %ymm14,%ymm12,%ymm13
7919 vpaddd %ymm13,%ymm12,%ymm12
7920 vmovdqa %ymm15,256(%rbp)
7921 vmovdqa %ymm14,224(%rbp)
7922 vmovdqa %ymm13,192(%rbp)
7923 vmovdqa %ymm12,160(%rbp)
7924
# Entry 1: hash one extra 16-byte block before the rounds, then step %rdi by 16.
79251:
# Absorb: accumulator += 16 bytes at 0(%rdi), plus 1 in the third limb (bit 128).
7926 addq 0(%rdi),%r10
7927 adcq 8+0(%rdi),%r11
7928 adcq $1,%r12
# Multiply the accumulator by the 128-bit value at 0(%rbp)/8(%rbp) using mulx.
# Product limbs, low to high: %r13, %r14, %r15, %r9.
7929 movq 0+0(%rbp),%rdx
7930 movq %rdx,%r15
7931 mulxq %r10,%r13,%r14
7932 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08007933 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05007934 addq %rax,%r14
7935 adcq %rdx,%r15
7936 movq 8+0(%rbp),%rdx
7937 mulxq %r10,%r10,%rax
7938 addq %r10,%r14
7939 mulxq %r11,%r11,%r9
7940 adcq %r11,%r15
7941 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08007942 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05007943 addq %rax,%r15
7944 adcq %rdx,%r9
# Reduce modulo 2^130-5: keep the low 130 bits, then add the part above
# bit 130 five times, as 4*c (the andq $-4 mask) plus c (the shift by 2).
7945 movq %r13,%r10
7946 movq %r14,%r11
7947 movq %r15,%r12
7948 andq $3,%r12
7949 movq %r15,%r13
7950 andq $-4,%r13
7951 movq %r9,%r14
7952 shrdq $2,%r9,%r15
7953 shrq $2,%r9
7954 addq %r13,%r10
7955 adcq %r14,%r11
7956 adcq $0,%r12
7957 addq %r15,%r10
7958 adcq %r9,%r11
7959 adcq $0,%r12
7960
7961 leaq 16(%rdi),%rdi
# Entry 2: one ChaCha20 double round over the four states, with the hashing
# of the blocks at 0(%rdi) and 16(%rdi) woven between the vector instructions.
79622:
# Spill %ymm8 and reuse it to hold the .rol16 shuffle table.
7963 vmovdqa %ymm8,128(%rbp)
7964 vmovdqa .rol16(%rip),%ymm8
# Quarter-round step for all four states: a += b, d ^= a, rotate d by 16.
7965 vpaddd %ymm7,%ymm3,%ymm3
7966 vpaddd %ymm6,%ymm2,%ymm2
7967 vpaddd %ymm5,%ymm1,%ymm1
7968 vpaddd %ymm4,%ymm0,%ymm0
7969 vpxor %ymm3,%ymm15,%ymm15
7970 vpxor %ymm2,%ymm14,%ymm14
7971 vpxor %ymm1,%ymm13,%ymm13
7972 vpxor %ymm0,%ymm12,%ymm12
7973 vpshufb %ymm8,%ymm15,%ymm15
7974 vpshufb %ymm8,%ymm14,%ymm14
7975 vpshufb %ymm8,%ymm13,%ymm13
7976 vpshufb %ymm8,%ymm12,%ymm12
# Restore row c of state 0, then c += d and b ^= c.
7977 vmovdqa 128(%rbp),%ymm8
7978 vpaddd %ymm15,%ymm11,%ymm11
7979 vpaddd %ymm14,%ymm10,%ymm10
7980 vpaddd %ymm13,%ymm9,%ymm9
7981 vpaddd %ymm12,%ymm8,%ymm8
7982 vpxor %ymm11,%ymm7,%ymm7
# Hash: absorb the 16-byte block at 0(%rdi).
7983 addq 0(%rdi),%r10
7984 adcq 8+0(%rdi),%r11
7985 adcq $1,%r12
7986 vpxor %ymm10,%ymm6,%ymm6
7987 vpxor %ymm9,%ymm5,%ymm5
7988 vpxor %ymm8,%ymm4,%ymm4
# Spill %ymm8 again; it is the scratch for rotating each b row left by 12.
7989 vmovdqa %ymm8,128(%rbp)
7990 vpsrld $20,%ymm7,%ymm8
7991 vpslld $32-20,%ymm7,%ymm7
7992 vpxor %ymm8,%ymm7,%ymm7
7993 vpsrld $20,%ymm6,%ymm8
7994 vpslld $32-20,%ymm6,%ymm6
7995 vpxor %ymm8,%ymm6,%ymm6
7996 vpsrld $20,%ymm5,%ymm8
7997 vpslld $32-20,%ymm5,%ymm5
7998 vpxor %ymm8,%ymm5,%ymm5
7999 vpsrld $20,%ymm4,%ymm8
8000 vpslld $32-20,%ymm4,%ymm4
8001 vpxor %ymm8,%ymm4,%ymm4
# Now %ymm8 holds the .rol8 table for the rotate of d by 8.
8002 vmovdqa .rol8(%rip),%ymm8
8003 vpaddd %ymm7,%ymm3,%ymm3
8004 vpaddd %ymm6,%ymm2,%ymm2
8005 vpaddd %ymm5,%ymm1,%ymm1
# Hash: first half of the multiply (by the low word at 0(%rbp)).
8006 movq 0+0(%rbp),%rdx
8007 movq %rdx,%r15
8008 mulxq %r10,%r13,%r14
8009 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008010 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008011 addq %rax,%r14
8012 adcq %rdx,%r15
8013 vpaddd %ymm4,%ymm0,%ymm0
8014 vpxor %ymm3,%ymm15,%ymm15
8015 vpxor %ymm2,%ymm14,%ymm14
8016 vpxor %ymm1,%ymm13,%ymm13
8017 vpxor %ymm0,%ymm12,%ymm12
8018 vpshufb %ymm8,%ymm15,%ymm15
8019 vpshufb %ymm8,%ymm14,%ymm14
8020 vpshufb %ymm8,%ymm13,%ymm13
8021 vpshufb %ymm8,%ymm12,%ymm12
8022 vmovdqa 128(%rbp),%ymm8
8023 vpaddd %ymm15,%ymm11,%ymm11
8024 vpaddd %ymm14,%ymm10,%ymm10
8025 vpaddd %ymm13,%ymm9,%ymm9
8026 vpaddd %ymm12,%ymm8,%ymm8
8027 vpxor %ymm11,%ymm7,%ymm7
8028 vpxor %ymm10,%ymm6,%ymm6
8029 vpxor %ymm9,%ymm5,%ymm5
8030 vpxor %ymm8,%ymm4,%ymm4
# Spill %ymm8; scratch for rotating each b row left by 7.
8031 vmovdqa %ymm8,128(%rbp)
8032 vpsrld $25,%ymm7,%ymm8
# Hash: second half of the multiply (by the high word at 8(%rbp)).
8033 movq 8+0(%rbp),%rdx
8034 mulxq %r10,%r10,%rax
8035 addq %r10,%r14
8036 mulxq %r11,%r11,%r9
8037 adcq %r11,%r15
8038 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08008039 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05008040 vpslld $32-25,%ymm7,%ymm7
8041 vpxor %ymm8,%ymm7,%ymm7
8042 vpsrld $25,%ymm6,%ymm8
8043 vpslld $32-25,%ymm6,%ymm6
8044 vpxor %ymm8,%ymm6,%ymm6
8045 vpsrld $25,%ymm5,%ymm8
8046 vpslld $32-25,%ymm5,%ymm5
8047 vpxor %ymm8,%ymm5,%ymm5
8048 vpsrld $25,%ymm4,%ymm8
8049 vpslld $32-25,%ymm4,%ymm4
8050 vpxor %ymm8,%ymm4,%ymm4
# Restore row c of state 0, then rotate rows b, c, d of every state by 4, 8
# and 12 bytes within each lane so the next quarter-round covers the diagonals.
8051 vmovdqa 128(%rbp),%ymm8
8052 vpalignr $4,%ymm7,%ymm7,%ymm7
8053 vpalignr $8,%ymm11,%ymm11,%ymm11
8054 vpalignr $12,%ymm15,%ymm15,%ymm15
8055 vpalignr $4,%ymm6,%ymm6,%ymm6
8056 vpalignr $8,%ymm10,%ymm10,%ymm10
8057 vpalignr $12,%ymm14,%ymm14,%ymm14
8058 vpalignr $4,%ymm5,%ymm5,%ymm5
8059 vpalignr $8,%ymm9,%ymm9,%ymm9
# Hash: fold the last partial products into the upper limbs. The vector
# instructions in between leave the carry flag untouched.
8060 addq %rax,%r15
8061 adcq %rdx,%r9
8062 vpalignr $12,%ymm13,%ymm13,%ymm13
8063 vpalignr $4,%ymm4,%ymm4,%ymm4
8064 vpalignr $8,%ymm8,%ymm8,%ymm8
8065 vpalignr $12,%ymm12,%ymm12,%ymm12
# Second (diagonal) quarter-round; same spill pattern as the first.
8066 vmovdqa %ymm8,128(%rbp)
8067 vmovdqa .rol16(%rip),%ymm8
8068 vpaddd %ymm7,%ymm3,%ymm3
8069 vpaddd %ymm6,%ymm2,%ymm2
8070 vpaddd %ymm5,%ymm1,%ymm1
8071 vpaddd %ymm4,%ymm0,%ymm0
8072 vpxor %ymm3,%ymm15,%ymm15
8073 vpxor %ymm2,%ymm14,%ymm14
8074 vpxor %ymm1,%ymm13,%ymm13
8075 vpxor %ymm0,%ymm12,%ymm12
8076 vpshufb %ymm8,%ymm15,%ymm15
8077 vpshufb %ymm8,%ymm14,%ymm14
8078 vpshufb %ymm8,%ymm13,%ymm13
8079 vpshufb %ymm8,%ymm12,%ymm12
8080 vmovdqa 128(%rbp),%ymm8
8081 vpaddd %ymm15,%ymm11,%ymm11
# Hash: reduce the product modulo 2^130-5 back into %r10:%r11:%r12.
8082 movq %r13,%r10
8083 movq %r14,%r11
8084 movq %r15,%r12
8085 andq $3,%r12
8086 movq %r15,%r13
8087 andq $-4,%r13
8088 movq %r9,%r14
8089 shrdq $2,%r9,%r15
8090 shrq $2,%r9
8091 addq %r13,%r10
8092 adcq %r14,%r11
8093 adcq $0,%r12
8094 addq %r15,%r10
8095 adcq %r9,%r11
8096 adcq $0,%r12
8097 vpaddd %ymm14,%ymm10,%ymm10
8098 vpaddd %ymm13,%ymm9,%ymm9
8099 vpaddd %ymm12,%ymm8,%ymm8
8100 vpxor %ymm11,%ymm7,%ymm7
8101 vpxor %ymm10,%ymm6,%ymm6
8102 vpxor %ymm9,%ymm5,%ymm5
8103 vpxor %ymm8,%ymm4,%ymm4
8104 vmovdqa %ymm8,128(%rbp)
8105 vpsrld $20,%ymm7,%ymm8
8106 vpslld $32-20,%ymm7,%ymm7
8107 vpxor %ymm8,%ymm7,%ymm7
8108 vpsrld $20,%ymm6,%ymm8
8109 vpslld $32-20,%ymm6,%ymm6
8110 vpxor %ymm8,%ymm6,%ymm6
8111 vpsrld $20,%ymm5,%ymm8
8112 vpslld $32-20,%ymm5,%ymm5
8113 vpxor %ymm8,%ymm5,%ymm5
8114 vpsrld $20,%ymm4,%ymm8
8115 vpslld $32-20,%ymm4,%ymm4
8116 vpxor %ymm8,%ymm4,%ymm4
# Hash: absorb the 16-byte block at 16(%rdi).
8117 addq 16(%rdi),%r10
8118 adcq 8+16(%rdi),%r11
8119 adcq $1,%r12
8120 vmovdqa .rol8(%rip),%ymm8
8121 vpaddd %ymm7,%ymm3,%ymm3
8122 vpaddd %ymm6,%ymm2,%ymm2
8123 vpaddd %ymm5,%ymm1,%ymm1
8124 vpaddd %ymm4,%ymm0,%ymm0
8125 vpxor %ymm3,%ymm15,%ymm15
8126 vpxor %ymm2,%ymm14,%ymm14
8127 vpxor %ymm1,%ymm13,%ymm13
8128 vpxor %ymm0,%ymm12,%ymm12
8129 vpshufb %ymm8,%ymm15,%ymm15
8130 vpshufb %ymm8,%ymm14,%ymm14
8131 vpshufb %ymm8,%ymm13,%ymm13
8132 vpshufb %ymm8,%ymm12,%ymm12
8133 vmovdqa 128(%rbp),%ymm8
8134 vpaddd %ymm15,%ymm11,%ymm11
8135 vpaddd %ymm14,%ymm10,%ymm10
8136 vpaddd %ymm13,%ymm9,%ymm9
8137 vpaddd %ymm12,%ymm8,%ymm8
8138 vpxor %ymm11,%ymm7,%ymm7
8139 vpxor %ymm10,%ymm6,%ymm6
# Hash: first half of the multiply for the second block.
8140 movq 0+0(%rbp),%rdx
8141 movq %rdx,%r15
8142 mulxq %r10,%r13,%r14
8143 mulxq %r11,%rax,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008144 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008145 addq %rax,%r14
8146 adcq %rdx,%r15
8147 vpxor %ymm9,%ymm5,%ymm5
8148 vpxor %ymm8,%ymm4,%ymm4
8149 vmovdqa %ymm8,128(%rbp)
8150 vpsrld $25,%ymm7,%ymm8
8151 vpslld $32-25,%ymm7,%ymm7
8152 vpxor %ymm8,%ymm7,%ymm7
8153 vpsrld $25,%ymm6,%ymm8
8154 vpslld $32-25,%ymm6,%ymm6
8155 vpxor %ymm8,%ymm6,%ymm6
8156 vpsrld $25,%ymm5,%ymm8
8157 vpslld $32-25,%ymm5,%ymm5
8158 vpxor %ymm8,%ymm5,%ymm5
8159 vpsrld $25,%ymm4,%ymm8
8160 vpslld $32-25,%ymm4,%ymm4
8161 vpxor %ymm8,%ymm4,%ymm4
# Restore row c of state 0 and rotate the rows back (12, 8 and 4 bytes undo
# the earlier 4, 8 and 12).
8162 vmovdqa 128(%rbp),%ymm8
8163 vpalignr $12,%ymm7,%ymm7,%ymm7
8164 vpalignr $8,%ymm11,%ymm11,%ymm11
8165 vpalignr $4,%ymm15,%ymm15,%ymm15
8166 vpalignr $12,%ymm6,%ymm6,%ymm6
# Hash: second half of the multiply for the second block.
8167 movq 8+0(%rbp),%rdx
8168 mulxq %r10,%r10,%rax
8169 addq %r10,%r14
8170 mulxq %r11,%r11,%r9
8171 adcq %r11,%r15
8172 adcq $0,%r9
Robert Sloan4d1ac502017-02-06 08:36:14 -08008173 imulq %r12,%rdx
David Benjaminf31229b2017-01-25 14:08:15 -05008174 vpalignr $8,%ymm10,%ymm10,%ymm10
8175 vpalignr $4,%ymm14,%ymm14,%ymm14
8176 vpalignr $12,%ymm5,%ymm5,%ymm5
8177 vpalignr $8,%ymm9,%ymm9,%ymm9
8178 vpalignr $4,%ymm13,%ymm13,%ymm13
8179 vpalignr $12,%ymm4,%ymm4,%ymm4
8180 vpalignr $8,%ymm8,%ymm8,%ymm8
8181 vpalignr $4,%ymm12,%ymm12,%ymm12
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
# Hash: fold the last partial products into the upper limbs.
8194 addq %rax,%r15
8195 adcq %rdx,%r9
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
# Hash: reduce the product modulo 2^130-5 back into %r10:%r11:%r12.
8216 movq %r13,%r10
8217 movq %r14,%r11
8218 movq %r15,%r12
8219 andq $3,%r12
8220 movq %r15,%r13
8221 andq $-4,%r13
8222 movq %r9,%r14
8223 shrdq $2,%r9,%r15
8224 shrq $2,%r9
8225 addq %r13,%r10
8226 adcq %r14,%r11
8227 adcq $0,%r12
8228 addq %r15,%r10
8229 adcq %r9,%r11
8230 adcq $0,%r12
8231
# Step past the two blocks just hashed. leaq leaves the flags alone, so the
# branches below test the decq results: back to entry 1 while %rcx stays above
# zero, then back to entry 2 while %r8 stays non-negative (signed tests).
8232 leaq 32(%rdi),%rdi
8233 decq %rcx
8234 jg 1b
8235 decq %r8
8236 jge 2b
# Feed-forward: add the starting rows and saved counters back into all states.
8237 vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
8238 vpaddd 64(%rbp),%ymm7,%ymm7
8239 vpaddd 96(%rbp),%ymm11,%ymm11
8240 vpaddd 256(%rbp),%ymm15,%ymm15
8241 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
8242 vpaddd 64(%rbp),%ymm6,%ymm6
8243 vpaddd 96(%rbp),%ymm10,%ymm10
8244 vpaddd 224(%rbp),%ymm14,%ymm14
8245 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
8246 vpaddd 64(%rbp),%ymm5,%ymm5
8247 vpaddd 96(%rbp),%ymm9,%ymm9
8248 vpaddd 192(%rbp),%ymm13,%ymm13
8249 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
8250 vpaddd 64(%rbp),%ymm4,%ymm4
8251 vpaddd 96(%rbp),%ymm8,%ymm8
8252 vpaddd 160(%rbp),%ymm12,%ymm12
8253
# Park %ymm0 at 128(%rbp) so it can serve as the temporary while state 3 is
# regrouped (vperm2i128 $0x02 pairs the low 128-bit lanes of its sources,
# $0x13 the high lanes), XORed with bytes 0..127 at (%rsi) and stored.
8254 vmovdqa %ymm0,128(%rbp)
8255 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
8256 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
8257 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
8258 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
8259 vpxor 0+0(%rsi),%ymm0,%ymm0
8260 vpxor 32+0(%rsi),%ymm3,%ymm3
8261 vpxor 64+0(%rsi),%ymm7,%ymm7
8262 vpxor 96+0(%rsi),%ymm11,%ymm11
8263 vmovdqu %ymm0,0+0(%rdi)
8264 vmovdqu %ymm3,32+0(%rdi)
8265 vmovdqu %ymm7,64+0(%rdi)
8266 vmovdqu %ymm11,96+0(%rdi)
8267
# Bring %ymm0 back; %ymm3 is free now and is the temporary from here on.
# State 2 covers bytes 128..255.
8268 vmovdqa 128(%rbp),%ymm0
8269 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
8270 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
8271 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
8272 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
8273 vpxor 0+128(%rsi),%ymm3,%ymm3
8274 vpxor 32+128(%rsi),%ymm2,%ymm2
8275 vpxor 64+128(%rsi),%ymm6,%ymm6
8276 vpxor 96+128(%rsi),%ymm10,%ymm10
8277 vmovdqu %ymm3,0+128(%rdi)
8278 vmovdqu %ymm2,32+128(%rdi)
8279 vmovdqu %ymm6,64+128(%rdi)
8280 vmovdqu %ymm10,96+128(%rdi)
# State 1 covers bytes 256..383.
8281 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
8282 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
8283 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
8284 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
8285 vpxor 0+256(%rsi),%ymm3,%ymm3
8286 vpxor 32+256(%rsi),%ymm1,%ymm1
8287 vpxor 64+256(%rsi),%ymm5,%ymm5
8288 vpxor 96+256(%rsi),%ymm9,%ymm9
8289 vmovdqu %ymm3,0+256(%rdi)
8290 vmovdqu %ymm1,32+256(%rdi)
8291 vmovdqu %ymm5,64+256(%rdi)
8292 vmovdqu %ymm9,96+256(%rdi)
# Regroup state 0 the same way but keep it in registers, in the order
# %ymm0, %ymm4, %ymm8, %ymm12, for the code that follows the jump.
8293 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
8294 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
8295 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
8296 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
8297 vmovdqa %ymm3,%ymm8
8298
# Account for the 384 bytes just produced: %rcx = 384, %rsi += 384, %rbx -= 384.
# NOTE(review): %rcx is presumably the count of stored bytes that
# seal_avx2_hash still has to absorb - confirm at that label.
8299 movq $384,%rcx
8300 leaq 384(%rsi),%rsi
8301 subq $384,%rbx
8302 jmp seal_avx2_hash
8303
# seal_avx2_320
# Expands the ChaCha20 state already held in %ymm0/%ymm4/%ymm8/%ymm12 into
# three states (two blocks each, one per 128-bit lane), runs ten double rounds
# with no hashing interleaved, stores the clamped first 32 keystream bytes at
# 0(%rbp) and leaves ten further 32-byte keystream chunks in registers before
# jumping to seal_avx2_short.
# NOTE(review): how %ymm0/%ymm4/%ymm8/%ymm12 are initialised is decided by the
# caller, outside this block.
8304seal_avx2_320:
# Replicate rows a, b, c for states 1 and 2.
8305 vmovdqa %ymm0,%ymm1
8306 vmovdqa %ymm0,%ymm2
8307 vmovdqa %ymm4,%ymm5
8308 vmovdqa %ymm4,%ymm6
8309 vmovdqa %ymm8,%ymm9
8310 vmovdqa %ymm8,%ymm10
# Counter rows for states 1 and 2 step up by .avx2_inc each.
8311 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
8312 vpaddd .avx2_inc(%rip),%ymm13,%ymm14
# Keep untouched copies of rows b and c (%ymm7, %ymm11) and of the three
# counter rows (160/192/224(%rbp)) for the feed-forward add after the rounds.
8313 vmovdqa %ymm4,%ymm7
8314 vmovdqa %ymm8,%ymm11
8315 vmovdqa %ymm12,160(%rbp)
8316 vmovdqa %ymm13,192(%rbp)
8317 vmovdqa %ymm14,224(%rbp)
# Round counter: ten passes through the double round below.
8318 movq $10,%r10
83191:
# First quarter-round on state 0; %ymm3 is scratch for the shift-based
# rotates, the rotates by 16 and 8 use the .rol16/.rol8 byte shuffles.
8320 vpaddd %ymm4,%ymm0,%ymm0
8321 vpxor %ymm0,%ymm12,%ymm12
8322 vpshufb .rol16(%rip),%ymm12,%ymm12
8323 vpaddd %ymm12,%ymm8,%ymm8
8324 vpxor %ymm8,%ymm4,%ymm4
8325 vpsrld $20,%ymm4,%ymm3
8326 vpslld $12,%ymm4,%ymm4
8327 vpxor %ymm3,%ymm4,%ymm4
8328 vpaddd %ymm4,%ymm0,%ymm0
8329 vpxor %ymm0,%ymm12,%ymm12
8330 vpshufb .rol8(%rip),%ymm12,%ymm12
8331 vpaddd %ymm12,%ymm8,%ymm8
8332 vpxor %ymm8,%ymm4,%ymm4
8333 vpslld $7,%ymm4,%ymm3
8334 vpsrld $25,%ymm4,%ymm4
8335 vpxor %ymm3,%ymm4,%ymm4
# Rotate the three lower rows by 12, 8 and 4 bytes within each lane so the
# next quarter-round works on the diagonals.
8336 vpalignr $12,%ymm12,%ymm12,%ymm12
8337 vpalignr $8,%ymm8,%ymm8,%ymm8
8338 vpalignr $4,%ymm4,%ymm4,%ymm4
# First quarter-round and row rotation for state 1.
8339 vpaddd %ymm5,%ymm1,%ymm1
8340 vpxor %ymm1,%ymm13,%ymm13
8341 vpshufb .rol16(%rip),%ymm13,%ymm13
8342 vpaddd %ymm13,%ymm9,%ymm9
8343 vpxor %ymm9,%ymm5,%ymm5
8344 vpsrld $20,%ymm5,%ymm3
8345 vpslld $12,%ymm5,%ymm5
8346 vpxor %ymm3,%ymm5,%ymm5
8347 vpaddd %ymm5,%ymm1,%ymm1
8348 vpxor %ymm1,%ymm13,%ymm13
8349 vpshufb .rol8(%rip),%ymm13,%ymm13
8350 vpaddd %ymm13,%ymm9,%ymm9
8351 vpxor %ymm9,%ymm5,%ymm5
8352 vpslld $7,%ymm5,%ymm3
8353 vpsrld $25,%ymm5,%ymm5
8354 vpxor %ymm3,%ymm5,%ymm5
8355 vpalignr $12,%ymm13,%ymm13,%ymm13
8356 vpalignr $8,%ymm9,%ymm9,%ymm9
8357 vpalignr $4,%ymm5,%ymm5,%ymm5
# First quarter-round and row rotation for state 2.
8358 vpaddd %ymm6,%ymm2,%ymm2
8359 vpxor %ymm2,%ymm14,%ymm14
8360 vpshufb .rol16(%rip),%ymm14,%ymm14
8361 vpaddd %ymm14,%ymm10,%ymm10
8362 vpxor %ymm10,%ymm6,%ymm6
8363 vpsrld $20,%ymm6,%ymm3
8364 vpslld $12,%ymm6,%ymm6
8365 vpxor %ymm3,%ymm6,%ymm6
8366 vpaddd %ymm6,%ymm2,%ymm2
8367 vpxor %ymm2,%ymm14,%ymm14
8368 vpshufb .rol8(%rip),%ymm14,%ymm14
8369 vpaddd %ymm14,%ymm10,%ymm10
8370 vpxor %ymm10,%ymm6,%ymm6
8371 vpslld $7,%ymm6,%ymm3
8372 vpsrld $25,%ymm6,%ymm6
8373 vpxor %ymm3,%ymm6,%ymm6
8374 vpalignr $12,%ymm14,%ymm14,%ymm14
8375 vpalignr $8,%ymm10,%ymm10,%ymm10
8376 vpalignr $4,%ymm6,%ymm6,%ymm6
# Second quarter-round on state 0 (diagonals), then rotate the rows back
# (4, 8 and 12 bytes undo the earlier 12, 8 and 4).
8377 vpaddd %ymm4,%ymm0,%ymm0
8378 vpxor %ymm0,%ymm12,%ymm12
8379 vpshufb .rol16(%rip),%ymm12,%ymm12
8380 vpaddd %ymm12,%ymm8,%ymm8
8381 vpxor %ymm8,%ymm4,%ymm4
8382 vpsrld $20,%ymm4,%ymm3
8383 vpslld $12,%ymm4,%ymm4
8384 vpxor %ymm3,%ymm4,%ymm4
8385 vpaddd %ymm4,%ymm0,%ymm0
8386 vpxor %ymm0,%ymm12,%ymm12
8387 vpshufb .rol8(%rip),%ymm12,%ymm12
8388 vpaddd %ymm12,%ymm8,%ymm8
8389 vpxor %ymm8,%ymm4,%ymm4
8390 vpslld $7,%ymm4,%ymm3
8391 vpsrld $25,%ymm4,%ymm4
8392 vpxor %ymm3,%ymm4,%ymm4
8393 vpalignr $4,%ymm12,%ymm12,%ymm12
8394 vpalignr $8,%ymm8,%ymm8,%ymm8
8395 vpalignr $12,%ymm4,%ymm4,%ymm4
# Second quarter-round and un-rotation for state 1.
8396 vpaddd %ymm5,%ymm1,%ymm1
8397 vpxor %ymm1,%ymm13,%ymm13
8398 vpshufb .rol16(%rip),%ymm13,%ymm13
8399 vpaddd %ymm13,%ymm9,%ymm9
8400 vpxor %ymm9,%ymm5,%ymm5
8401 vpsrld $20,%ymm5,%ymm3
8402 vpslld $12,%ymm5,%ymm5
8403 vpxor %ymm3,%ymm5,%ymm5
8404 vpaddd %ymm5,%ymm1,%ymm1
8405 vpxor %ymm1,%ymm13,%ymm13
8406 vpshufb .rol8(%rip),%ymm13,%ymm13
8407 vpaddd %ymm13,%ymm9,%ymm9
8408 vpxor %ymm9,%ymm5,%ymm5
8409 vpslld $7,%ymm5,%ymm3
8410 vpsrld $25,%ymm5,%ymm5
8411 vpxor %ymm3,%ymm5,%ymm5
8412 vpalignr $4,%ymm13,%ymm13,%ymm13
8413 vpalignr $8,%ymm9,%ymm9,%ymm9
8414 vpalignr $12,%ymm5,%ymm5,%ymm5
# Second quarter-round and un-rotation for state 2.
8415 vpaddd %ymm6,%ymm2,%ymm2
8416 vpxor %ymm2,%ymm14,%ymm14
8417 vpshufb .rol16(%rip),%ymm14,%ymm14
8418 vpaddd %ymm14,%ymm10,%ymm10
8419 vpxor %ymm10,%ymm6,%ymm6
8420 vpsrld $20,%ymm6,%ymm3
8421 vpslld $12,%ymm6,%ymm6
8422 vpxor %ymm3,%ymm6,%ymm6
8423 vpaddd %ymm6,%ymm2,%ymm2
8424 vpxor %ymm2,%ymm14,%ymm14
8425 vpshufb .rol8(%rip),%ymm14,%ymm14
8426 vpaddd %ymm14,%ymm10,%ymm10
8427 vpxor %ymm10,%ymm6,%ymm6
8428 vpslld $7,%ymm6,%ymm3
8429 vpsrld $25,%ymm6,%ymm6
8430 vpxor %ymm3,%ymm6,%ymm6
8431 vpalignr $4,%ymm14,%ymm14,%ymm14
8432 vpalignr $8,%ymm10,%ymm10,%ymm10
8433 vpalignr $12,%ymm6,%ymm6,%ymm6
8434
8435 decq %r10
8436 jne 1b
# Feed-forward: add the constants, the saved rows b and c (%ymm7, %ymm11)
# and the saved counter rows back into the three states.
8437 vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
8438 vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
8439 vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
8440 vpaddd %ymm7,%ymm4,%ymm4
8441 vpaddd %ymm7,%ymm5,%ymm5
8442 vpaddd %ymm7,%ymm6,%ymm6
8443 vpaddd %ymm11,%ymm8,%ymm8
8444 vpaddd %ymm11,%ymm9,%ymm9
8445 vpaddd %ymm11,%ymm10,%ymm10
8446 vpaddd 160(%rbp),%ymm12,%ymm12
8447 vpaddd 192(%rbp),%ymm13,%ymm13
8448 vpaddd 224(%rbp),%ymm14,%ymm14
# Low lanes of rows a and b of state 0 are the first 32 keystream bytes.
8449 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
8450
# Mask them with .clamp (the upper 16 mask bytes are all ones, so only the low
# 16 bytes change) and store at 0(%rbp), where the sibling routines read their
# Poly1305 multiplier.
8451 vpand .clamp(%rip),%ymm3,%ymm3
8452 vmovdqa %ymm3,0(%rbp)
8453
# Regroup the remaining keystream into consecutive 32-byte chunks, in the
# order %ymm0, %ymm4, %ymm8, %ymm12, %ymm1, %ymm5, %ymm9, %ymm13, %ymm2, %ymm6
# ($0x02 pairs low 128-bit lanes, $0x13 pairs high lanes). The low lanes of
# rows c and d of state 0 are not carried forward.
8454 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
8455 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
8456 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
8457 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
8458 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
8459 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
8460 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
8461 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
8462 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
8463 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
8464 jmp seal_avx2_short
8465
// seal_avx2_192: run two ChaCha20 states side by side and split the result
// into a hashing key and a queue of keystream registers.
//
// On entry the four state rows are in ymm0 / ymm4 / ymm8 / ymm12.
// State 0 lives in ymm0,ymm4,ymm8,ymm12; state 1 in ymm1,ymm5,ymm9,ymm13.
// Copies of the input rows are kept in ymm2, ymm6, ymm10 (shared rows) and
// ymm11 / ymm15 (the ymm12 / ymm13 rows) for the final feed-forward add.
// ymm3 is scratch. Execution falls through into seal_avx2_short.
8466seal_avx2_192:
// Duplicate the first three rows for state 1 and for the saved copies.
8467 vmovdqa %ymm0,%ymm1
8468 vmovdqa %ymm0,%ymm2
8469 vmovdqa %ymm4,%ymm5
8470 vmovdqa %ymm4,%ymm6
8471 vmovdqa %ymm8,%ymm9
8472 vmovdqa %ymm8,%ymm10
// State 1 uses the ymm12 row advanced by .avx2_inc (dword 0 of each 128-bit
// lane += 2); keep both unmodified rows for the feed-forward.
8473 vpaddd .avx2_inc(%rip),%ymm12,%ymm13
8474 vmovdqa %ymm12,%ymm11
8475 vmovdqa %ymm13,%ymm15
// r10 = loop counter: 10 iterations, each one a double round on both states.
8476 movq $10,%r10
84771:
// State 0, first quarter-round: add/xor/rotate by 16, 12, 8 and 7 bits.
// The 16- and 8-bit rotates use byte shuffles; the 12- and 7-bit rotates are
// built from a shift pair combined with xor through ymm3.
8478 vpaddd %ymm4,%ymm0,%ymm0
8479 vpxor %ymm0,%ymm12,%ymm12
8480 vpshufb .rol16(%rip),%ymm12,%ymm12
8481 vpaddd %ymm12,%ymm8,%ymm8
8482 vpxor %ymm8,%ymm4,%ymm4
8483 vpsrld $20,%ymm4,%ymm3
8484 vpslld $12,%ymm4,%ymm4
8485 vpxor %ymm3,%ymm4,%ymm4
8486 vpaddd %ymm4,%ymm0,%ymm0
8487 vpxor %ymm0,%ymm12,%ymm12
8488 vpshufb .rol8(%rip),%ymm12,%ymm12
8489 vpaddd %ymm12,%ymm8,%ymm8
8490 vpxor %ymm8,%ymm4,%ymm4
8491 vpslld $7,%ymm4,%ymm3
8492 vpsrld $25,%ymm4,%ymm4
8493 vpxor %ymm3,%ymm4,%ymm4
// Rotate the dwords of three rows inside each 128-bit lane (by 12, 8 and 4
// bytes) so the second quarter-round mixes a different word grouping.
8494 vpalignr $12,%ymm12,%ymm12,%ymm12
8495 vpalignr $8,%ymm8,%ymm8,%ymm8
8496 vpalignr $4,%ymm4,%ymm4,%ymm4
// State 1, first quarter-round (same sequence on ymm1/ymm5/ymm9/ymm13).
8497 vpaddd %ymm5,%ymm1,%ymm1
8498 vpxor %ymm1,%ymm13,%ymm13
8499 vpshufb .rol16(%rip),%ymm13,%ymm13
8500 vpaddd %ymm13,%ymm9,%ymm9
8501 vpxor %ymm9,%ymm5,%ymm5
8502 vpsrld $20,%ymm5,%ymm3
8503 vpslld $12,%ymm5,%ymm5
8504 vpxor %ymm3,%ymm5,%ymm5
8505 vpaddd %ymm5,%ymm1,%ymm1
8506 vpxor %ymm1,%ymm13,%ymm13
8507 vpshufb .rol8(%rip),%ymm13,%ymm13
8508 vpaddd %ymm13,%ymm9,%ymm9
8509 vpxor %ymm9,%ymm5,%ymm5
8510 vpslld $7,%ymm5,%ymm3
8511 vpsrld $25,%ymm5,%ymm5
8512 vpxor %ymm3,%ymm5,%ymm5
8513 vpalignr $12,%ymm13,%ymm13,%ymm13
8514 vpalignr $8,%ymm9,%ymm9,%ymm9
8515 vpalignr $4,%ymm5,%ymm5,%ymm5
// State 0, second quarter-round on the rotated rows.
8516 vpaddd %ymm4,%ymm0,%ymm0
8517 vpxor %ymm0,%ymm12,%ymm12
8518 vpshufb .rol16(%rip),%ymm12,%ymm12
8519 vpaddd %ymm12,%ymm8,%ymm8
8520 vpxor %ymm8,%ymm4,%ymm4
8521 vpsrld $20,%ymm4,%ymm3
8522 vpslld $12,%ymm4,%ymm4
8523 vpxor %ymm3,%ymm4,%ymm4
8524 vpaddd %ymm4,%ymm0,%ymm0
8525 vpxor %ymm0,%ymm12,%ymm12
8526 vpshufb .rol8(%rip),%ymm12,%ymm12
8527 vpaddd %ymm12,%ymm8,%ymm8
8528 vpxor %ymm8,%ymm4,%ymm4
8529 vpslld $7,%ymm4,%ymm3
8530 vpsrld $25,%ymm4,%ymm4
8531 vpxor %ymm3,%ymm4,%ymm4
// Undo the earlier lane rotation (complementary byte counts 4, 8 and 12).
8532 vpalignr $4,%ymm12,%ymm12,%ymm12
8533 vpalignr $8,%ymm8,%ymm8,%ymm8
8534 vpalignr $12,%ymm4,%ymm4,%ymm4
// State 1, second quarter-round, followed by the same un-rotation.
8535 vpaddd %ymm5,%ymm1,%ymm1
8536 vpxor %ymm1,%ymm13,%ymm13
8537 vpshufb .rol16(%rip),%ymm13,%ymm13
8538 vpaddd %ymm13,%ymm9,%ymm9
8539 vpxor %ymm9,%ymm5,%ymm5
8540 vpsrld $20,%ymm5,%ymm3
8541 vpslld $12,%ymm5,%ymm5
8542 vpxor %ymm3,%ymm5,%ymm5
8543 vpaddd %ymm5,%ymm1,%ymm1
8544 vpxor %ymm1,%ymm13,%ymm13
8545 vpshufb .rol8(%rip),%ymm13,%ymm13
8546 vpaddd %ymm13,%ymm9,%ymm9
8547 vpxor %ymm9,%ymm5,%ymm5
8548 vpslld $7,%ymm5,%ymm3
8549 vpsrld $25,%ymm5,%ymm5
8550 vpxor %ymm3,%ymm5,%ymm5
8551 vpalignr $4,%ymm13,%ymm13,%ymm13
8552 vpalignr $8,%ymm9,%ymm9,%ymm9
8553 vpalignr $12,%ymm5,%ymm5,%ymm5
8554
8555 decq %r10
8556 jne 1b
// Feed-forward: add the saved input rows back into both states.
8557 vpaddd %ymm2,%ymm0,%ymm0
8558 vpaddd %ymm2,%ymm1,%ymm1
8559 vpaddd %ymm6,%ymm4,%ymm4
8560 vpaddd %ymm6,%ymm5,%ymm5
8561 vpaddd %ymm10,%ymm8,%ymm8
8562 vpaddd %ymm10,%ymm9,%ymm9
8563 vpaddd %ymm11,%ymm12,%ymm12
8564 vpaddd %ymm15,%ymm13,%ymm13
// ymm3 = low 128 bits of ymm0 (low half) and low 128 bits of ymm4 (high half).
8565 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
8566
// Mask with .clamp (only the low 16 bytes are actually restricted; the upper
// 16 bytes of the mask are all ones) and store the 32 bytes at 0(%rbp).
// The code after this routine reads 0(%rbp) and 8(%rbp) as the hash multiplier.
8567 vpand .clamp(%rip),%ymm3,%ymm3
8568 vmovdqa %ymm3,0(%rbp)
8569
// Regroup the remaining 128-bit lanes into the order ymm0, ymm4, ymm8, ymm12,
// ymm1, ymm5. $0x13 pairs the high lanes of the two sources, $0x02 pairs the
// low lanes. The low lanes of the state-0 ymm8/ymm12 rows are not carried on.
8570 vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
8571 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
8572 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
8573 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
8574 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
8575 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
// seal_avx2_short: common entry for the short-input paths once the keystream
// registers and the multiplier at 0(%rbp) are in place.
8576seal_avx2_short:
// Register-to-itself move: changes no state.
// NOTE(review): looks like a leftover of the code generator - confirm before removing.
8577 movq %r8,%r8
// poly_hash_ad_internal is defined outside this view; presumably it absorbs
// the additional data into the r10:r11:r12 accumulator - verify at its definition.
8578 call poly_hash_ad_internal
// rcx = 0, so the hashing loop at seal_avx2_hash (which runs only while
// rcx >= 16) is skipped when reached by this fall-through.
8579 xorq %rcx,%rcx
// seal_avx2_hash: while rcx >= 16, absorb one 16-byte block at (%rdi) into
// the accumulator r10:r11:r12, then advance rdi by 16 and drop rcx by 16.
// Leaves for seal_avx2_short_loop as soon as rcx < 16.
// Scratch: rax, rdx, r9, r13, r14, r15.
8580seal_avx2_hash:
8581 cmpq $16,%rcx
8582 jb seal_avx2_short_loop
// acc += block, with an extra 1 added into the third limb (i.e. + 2^128).
8583 addq 0(%rdi),%r10
8584 adcq 8+0(%rdi),%r11
8585 adcq $1,%r12
// Multiply acc by the two-qword value at 0(%rbp) / 8(%rbp).
// Partial products are gathered into r13 (lowest), r14, r15, r9 (highest).
8586 movq 0+0(%rbp),%rax
8587 movq %rax,%r15
8588 mulq %r10
8589 movq %rax,%r13
8590 movq %rdx,%r14
8591 movq 0+0(%rbp),%rax
8592 mulq %r11
// r15 = low multiplier qword * third accumulator limb (r12).
Robert Sloan4d1ac502017-02-06 08:36:14 -08008593 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008594 addq %rax,%r14
8595 adcq %rdx,%r15
8596 movq 8+0(%rbp),%rax
8597 movq %rax,%r9
8598 mulq %r10
8599 addq %rax,%r14
8600 adcq $0,%rdx
8601 movq %rdx,%r10
8602 movq 8+0(%rbp),%rax
8603 mulq %r11
8604 addq %rax,%r15
8605 adcq $0,%rdx
// r9 = high multiplier qword * third accumulator limb (r12).
Robert Sloan4d1ac502017-02-06 08:36:14 -08008606 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008607 addq %r10,%r15
8608 adcq %rdx,%r9
// Reduce modulo 2^130 - 5: keep the low 130 bits in r10:r11:r12 ...
8609 movq %r13,%r10
8610 movq %r14,%r11
8611 movq %r15,%r12
8612 andq $3,%r12
// ... and fold the bits above 2^130 (call them c) back in as 5*c:
// r13:r14 holds 4*c (the upper bits left in place), r15:r9 holds c (shifted by 2).
8613 movq %r15,%r13
8614 andq $-4,%r13
8615 movq %r9,%r14
8616 shrdq $2,%r9,%r15
8617 shrq $2,%r9
8618 addq %r13,%r10
8619 adcq %r14,%r11
8620 adcq $0,%r12
8621 addq %r15,%r10
8622 adcq %r9,%r11
8623 adcq $0,%r12
8624
// Next block.
8625 subq $16,%rcx
8626 addq $16,%rdi
8627 jmp seal_avx2_hash
// seal_avx2_short_loop: while rbx >= 32, XOR 32 bytes read from (%rsi) with
// the keystream in ymm0, write the result to (%rdi), absorb those 32 output
// bytes into the accumulator r10:r11:r12 as two 16-byte blocks, then shift
// the keystream register queue so ymm0 holds the next 32 keystream bytes.
// Leaves for seal_avx2_short_tail once rbx < 32.
// Scratch: rax, rdx, r9, r13, r14, r15.
8628seal_avx2_short_loop:
8629 cmpq $32,%rbx
8630 jb seal_avx2_short_tail
8631 subq $32,%rbx
8632
// Process 32 bytes: (%rdi) = (%rsi) xor ymm0; advance the source pointer.
8633 vpxor (%rsi),%ymm0,%ymm0
8634 vmovdqu %ymm0,(%rdi)
8635 leaq 32(%rsi),%rsi
8636
// First 16 bytes just written: acc += block + 2^128.
8637 addq 0(%rdi),%r10
8638 adcq 8+0(%rdi),%r11
8639 adcq $1,%r12
// acc *= the two-qword value at 0(%rbp) / 8(%rbp); product limbs are
// gathered into r13 (lowest), r14, r15, r9 (highest).
8640 movq 0+0(%rbp),%rax
8641 movq %rax,%r15
8642 mulq %r10
8643 movq %rax,%r13
8644 movq %rdx,%r14
8645 movq 0+0(%rbp),%rax
8646 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08008647 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008648 addq %rax,%r14
8649 adcq %rdx,%r15
8650 movq 8+0(%rbp),%rax
8651 movq %rax,%r9
8652 mulq %r10
8653 addq %rax,%r14
8654 adcq $0,%rdx
8655 movq %rdx,%r10
8656 movq 8+0(%rbp),%rax
8657 mulq %r11
8658 addq %rax,%r15
8659 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008660 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008661 addq %r10,%r15
8662 adcq %rdx,%r9
// Reduce modulo 2^130 - 5: low 130 bits stay in r10:r11:r12, the bits above
// are added back once as 4*c (r13:r14) and once as c (r15:r9).
8663 movq %r13,%r10
8664 movq %r14,%r11
8665 movq %r15,%r12
8666 andq $3,%r12
8667 movq %r15,%r13
8668 andq $-4,%r13
8669 movq %r9,%r14
8670 shrdq $2,%r9,%r15
8671 shrq $2,%r9
8672 addq %r13,%r10
8673 adcq %r14,%r11
8674 adcq $0,%r12
8675 addq %r15,%r10
8676 adcq %r9,%r11
8677 adcq $0,%r12
// Second 16 bytes (offset 16 from rdi): same add, multiply and reduce.
8678 addq 16(%rdi),%r10
8679 adcq 8+16(%rdi),%r11
8680 adcq $1,%r12
8681 movq 0+0(%rbp),%rax
8682 movq %rax,%r15
8683 mulq %r10
8684 movq %rax,%r13
8685 movq %rdx,%r14
8686 movq 0+0(%rbp),%rax
8687 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08008688 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008689 addq %rax,%r14
8690 adcq %rdx,%r15
8691 movq 8+0(%rbp),%rax
8692 movq %rax,%r9
8693 mulq %r10
8694 addq %rax,%r14
8695 adcq $0,%rdx
8696 movq %rdx,%r10
8697 movq 8+0(%rbp),%rax
8698 mulq %r11
8699 addq %rax,%r15
8700 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008701 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008702 addq %r10,%r15
8703 adcq %rdx,%r9
8704 movq %r13,%r10
8705 movq %r14,%r11
8706 movq %r15,%r12
8707 andq $3,%r12
8708 movq %r15,%r13
8709 andq $-4,%r13
8710 movq %r9,%r14
8711 shrdq $2,%r9,%r15
8712 shrq $2,%r9
8713 addq %r13,%r10
8714 adcq %r14,%r11
8715 adcq $0,%r12
8716 addq %r15,%r10
8717 adcq %r9,%r11
8718 adcq $0,%r12
8719
// Advance the destination pointer past the 32 bytes just hashed.
8720 leaq 32(%rdi),%rdi
8721
// Shift the keystream queue by one register:
// ymm0 <- ymm4 <- ymm8 <- ymm12 <- ymm1 <- ymm5 <- ymm9 <- ymm13 <- ymm2 <- ymm6.
8722 vmovdqa %ymm4,%ymm0
8723 vmovdqa %ymm8,%ymm4
8724 vmovdqa %ymm12,%ymm8
8725 vmovdqa %ymm1,%ymm12
8726 vmovdqa %ymm5,%ymm1
8727 vmovdqa %ymm9,%ymm5
8728 vmovdqa %ymm13,%ymm9
8729 vmovdqa %ymm2,%ymm13
8730 vmovdqa %ymm6,%ymm2
8731 jmp seal_avx2_short_loop
// seal_avx2_short_tail: reached with fewer than 32 bytes left in rbx.
// If at least 16 remain, process one 16-byte block with the low half of ymm0,
// absorb it into the accumulator r10:r11:r12, and move the high half of ymm0
// down into xmm0. Either way, finish by clearing the upper YMM halves and
// jumping to seal_sse_tail_16 (defined outside this view).
// Scratch: rax, rdx, r9, r13, r14, r15, xmm3.
8732seal_avx2_short_tail:
8733 cmpq $16,%rbx
8734 jb 1f
8735 subq $16,%rbx
// (%rdi) = (%rsi) xor xmm0; the result goes through xmm3 so ymm0 stays intact.
8736 vpxor (%rsi),%xmm0,%xmm3
8737 vmovdqu %xmm3,(%rdi)
8738 leaq 16(%rsi),%rsi
// acc += the 16 bytes just written, plus 2^128.
8739 addq 0(%rdi),%r10
8740 adcq 8+0(%rdi),%r11
8741 adcq $1,%r12
// acc *= the two-qword value at 0(%rbp) / 8(%rbp); product limbs are
// gathered into r13 (lowest), r14, r15, r9 (highest).
8742 movq 0+0(%rbp),%rax
8743 movq %rax,%r15
8744 mulq %r10
8745 movq %rax,%r13
8746 movq %rdx,%r14
8747 movq 0+0(%rbp),%rax
8748 mulq %r11
Robert Sloan4d1ac502017-02-06 08:36:14 -08008749 imulq %r12,%r15
David Benjaminf31229b2017-01-25 14:08:15 -05008750 addq %rax,%r14
8751 adcq %rdx,%r15
8752 movq 8+0(%rbp),%rax
8753 movq %rax,%r9
8754 mulq %r10
8755 addq %rax,%r14
8756 adcq $0,%rdx
8757 movq %rdx,%r10
8758 movq 8+0(%rbp),%rax
8759 mulq %r11
8760 addq %rax,%r15
8761 adcq $0,%rdx
Robert Sloan4d1ac502017-02-06 08:36:14 -08008762 imulq %r12,%r9
David Benjaminf31229b2017-01-25 14:08:15 -05008763 addq %r10,%r15
8764 adcq %rdx,%r9
// Reduce modulo 2^130 - 5: low 130 bits stay in r10:r11:r12, the bits above
// are added back once as 4*c (r13:r14) and once as c (r15:r9).
8765 movq %r13,%r10
8766 movq %r14,%r11
8767 movq %r15,%r12
8768 andq $3,%r12
8769 movq %r15,%r13
8770 andq $-4,%r13
8771 movq %r9,%r14
8772 shrdq $2,%r9,%r15
8773 shrq $2,%r9
8774 addq %r13,%r10
8775 adcq %r14,%r11
8776 adcq $0,%r12
8777 addq %r15,%r10
8778 adcq %r9,%r11
8779 adcq $0,%r12
8780
8781 leaq 16(%rdi),%rdi
// The low 16 keystream bytes are used up; bring the high 128 bits of ymm0
// down into xmm0 for the code that follows.
8782 vextracti128 $1,%ymm0,%xmm0
87831:
// Zero the upper halves of all YMM registers before continuing at the
// seal_sse_tail_16 label.
8784 vzeroupper
8785 jmp seal_sse_tail_16
Robert Sloana94fe052017-02-21 08:49:28 -08008786
David Benjaminf31229b2017-01-25 14:08:15 -05008787#endif