1#!/usr/bin/env perl
2
3# Copyright (c) 2015, CloudFlare Ltd.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
16
17##############################################################################
18# #
19# Author: Vlad Krasnov #
20# #
21##############################################################################
22
23$flavour = shift;
24$output = shift;
25if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
26
27$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
31( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
32die "can't locate x86_64-xlate.pl";
33
34open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
35*STDOUT=*OUT;
36
37$avx = 2;
38
39$code.=<<___;
40.text
41.extern OPENSSL_ia32cap_P
42
43chacha20_poly1305_constants:
44
45.align 64
46.chacha20_consts:
47.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
48.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
49.rol8:
50.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
51.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
52.rol16:
53.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
54.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
55.avx2_init:
56.long 0,0,0,0
57.sse_inc:
58.long 1,0,0,0
59.avx2_inc:
60.long 2,0,0,0,2,0,0,0
61.clamp:
62.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
63.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
64.align 16
65.and_masks:
66.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
67.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
68.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
69.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
70.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
71.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
72.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
73.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
74.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
75.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
76.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
77.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
78.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
79.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
80.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
81___
82
83my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
84my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
85my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
86my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
87my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
88my $r_store="0*16(%rbp)";
89my $s_store="1*16(%rbp)";
90my $len_store="2*16(%rbp)";
91my $state1_store="3*16(%rbp)";
92my $state2_store="4*16(%rbp)";
93my $tmp_store="5*16(%rbp)";
94my $ctr0_store="6*16(%rbp)";
95my $ctr1_store="7*16(%rbp)";
96my $ctr2_store="8*16(%rbp)";
97my $ctr3_store="9*16(%rbp)";
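# A note on the names above: oup/inp/inl/adp/keyp are the output, input,
# input-length, AD and key pointers; acc0..acc2 are the three 64-bit limbs of
# the Poly1305 accumulator; t0..t3 are scalar scratch; and A*/B*/C*/D* are the
# four rows of up to four ChaCha20 states processed in parallel. The *_store
# slots are 16-byte scratch slots relative to %rbp: the clamped Poly1305 r and
# s halves, the AD/plaintext lengths (hashed as the final Poly1305 block), the
# two key rows of the ChaCha20 state and the per-block counter rows.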
98
99sub chacha_qr {
100my ($a,$b,$c,$d,$t,$dir)=@_;
101$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
102$code.="paddd $b, $a
103 pxor $a, $d
104 pshufb .rol16(%rip), $d
105 paddd $d, $c
106 pxor $c, $b
107 movdqa $b, $t
108 pslld \$12, $t
109 psrld \$20, $b
110 pxor $t, $b
111 paddd $b, $a
112 pxor $a, $d
113 pshufb .rol8(%rip), $d
114 paddd $d, $c
115 pxor $c, $b
116 movdqa $b, $t
117 pslld \$7, $t
118 psrld \$25, $b
119 pxor $t, $b\n";
120$code.="palignr \$4, $b, $b
121 palignr \$8, $c, $c
122 palignr \$12, $d, $d\n" if ($dir =~ /left/);
123$code.="palignr \$12, $b, $b
124 palignr \$8, $c, $c
125 palignr \$4, $d, $d\n" if ($dir =~ /right/);
126$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
127}
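# chacha_qr runs a ChaCha20 quarter-round on whole 128-bit rows, so the four
# quarter-rounds of a column round proceed in parallel, one per 32-bit lane.
# The 16- and 8-bit rotations use pshufb byte shuffles (.rol16/.rol8), the 12-
# and 7-bit ones use a shift pair, and the "left"/"right" palignr sequences
# rotate the B/C/D rows between the column-round and diagonal-round layouts.
# "store"/"load" spill and reload the helper register around the round.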
128
129sub poly_add {
130my ($src)=@_;
131$code.="add $src, $acc0
132 adc 8+$src, $acc1
133 adc \$1, $acc2\n";
134}
135
136sub poly_stage1 {
137$code.="mov 0+$r_store, %rax
138 mov %rax, $t2
139 mul $acc0
140 mov %rax, $t0
141 mov %rdx, $t1
142 mov 0+$r_store, %rax
143 mul $acc1
144	imulq $acc2, $t2
145	add %rax, $t1
146 adc %rdx, $t2\n";
147}
148
149sub poly_stage2 {
150$code.="mov 8+$r_store, %rax
151 mov %rax, $t3
152 mul $acc0
153 add %rax, $t1
154 adc \$0, %rdx
155 mov %rdx, $acc0
156 mov 8+$r_store, %rax
157 mul $acc1
158 add %rax, $t2
159 adc \$0, %rdx\n";
160}
161
162sub poly_stage3 {
163$code.="imulq $acc2, $t3
164	add $acc0, $t2
165 adc %rdx, $t3\n";
166}
167
168sub poly_reduce_stage {
169$code.="mov $t0, $acc0
170 mov $t1, $acc1
171 mov $t2, $acc2
172 and \$3, $acc2
173 mov $t2, $t0
174 and \$-4, $t0
175 mov $t3, $t1
176 shrd \$2, $t3, $t2
177 shr \$2, $t3
178 add $t0, $acc0
179 adc $t1, $acc1
180 adc \$0, $acc2
181 add $t2, $acc0
182 adc $t3, $acc1
183 adc \$0, $acc2\n";
184}
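# poly_reduce_stage folds the four-limb product in t0:t1:t2:t3 back into the
# three-limb accumulator using 2^130 = 5 (mod 2^130 - 5): writing the product
# as low + 2^130*c, the result is low + 4*c + c, which is what the and/shrd/shr
# extraction followed by the two add/adc/adc chains computes.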
185
186sub poly_mul {
187 &poly_stage1();
188 &poly_stage2();
189 &poly_stage3();
190 &poly_reduce_stage();
191}
192
193sub prep_state {
194my ($n)=@_;
195$code.="movdqa .chacha20_consts(%rip), $A0
196 movdqa $state1_store, $B0
197 movdqa $state2_store, $C0\n";
198$code.="movdqa $A0, $A1
199 movdqa $B0, $B1
200 movdqa $C0, $C1\n" if ($n ge 2);
201$code.="movdqa $A0, $A2
202 movdqa $B0, $B2
203 movdqa $C0, $C2\n" if ($n ge 3);
204$code.="movdqa $A0, $A3
205 movdqa $B0, $B3
206 movdqa $C0, $C3\n" if ($n ge 4);
207$code.="movdqa $ctr0_store, $D0
208 paddd .sse_inc(%rip), $D0
209 movdqa $D0, $ctr0_store\n" if ($n eq 1);
210$code.="movdqa $ctr0_store, $D1
211 paddd .sse_inc(%rip), $D1
212 movdqa $D1, $D0
213 paddd .sse_inc(%rip), $D0
214 movdqa $D0, $ctr0_store
215 movdqa $D1, $ctr1_store\n" if ($n eq 2);
216$code.="movdqa $ctr0_store, $D2
217 paddd .sse_inc(%rip), $D2
218 movdqa $D2, $D1
219 paddd .sse_inc(%rip), $D1
220 movdqa $D1, $D0
221 paddd .sse_inc(%rip), $D0
222 movdqa $D0, $ctr0_store
223 movdqa $D1, $ctr1_store
224 movdqa $D2, $ctr2_store\n" if ($n eq 3);
225$code.="movdqa $ctr0_store, $D3
226 paddd .sse_inc(%rip), $D3
227 movdqa $D3, $D2
228 paddd .sse_inc(%rip), $D2
229 movdqa $D2, $D1
230 paddd .sse_inc(%rip), $D1
231 movdqa $D1, $D0
232 paddd .sse_inc(%rip), $D0
233 movdqa $D0, $ctr0_store
234 movdqa $D1, $ctr1_store
235 movdqa $D2, $ctr2_store
236 movdqa $D3, $ctr3_store\n" if ($n eq 4);
237}
238
239sub finalize_state {
240my ($n)=@_;
241$code.="paddd .chacha20_consts(%rip), $A3
242 paddd $state1_store, $B3
243 paddd $state2_store, $C3
244 paddd $ctr3_store, $D3\n" if ($n eq 4);
245$code.="paddd .chacha20_consts(%rip), $A2
246 paddd $state1_store, $B2
247 paddd $state2_store, $C2
248 paddd $ctr2_store, $D2\n" if ($n ge 3);
249$code.="paddd .chacha20_consts(%rip), $A1
250 paddd $state1_store, $B1
251 paddd $state2_store, $C1
252 paddd $ctr1_store, $D1\n" if ($n ge 2);
253$code.="paddd .chacha20_consts(%rip), $A0
254 paddd $state1_store, $B0
255 paddd $state2_store, $C0
256 paddd $ctr0_store, $D0\n";
257}
258
259sub xor_stream {
260my ($A, $B, $C, $D, $offset)=@_;
261$code.="movdqu 0*16 + $offset($inp), $A3
262 movdqu 1*16 + $offset($inp), $B3
263 movdqu 2*16 + $offset($inp), $C3
264 movdqu 3*16 + $offset($inp), $D3
265 pxor $A3, $A
266 pxor $B3, $B
267 pxor $C3, $C
268 pxor $D, $D3
269 movdqu $A, 0*16 + $offset($oup)
270 movdqu $B, 1*16 + $offset($oup)
271 movdqu $C, 2*16 + $offset($oup)
272 movdqu $D3, 3*16 + $offset($oup)\n";
273}
274
275sub xor_stream_using_temp {
276my ($A, $B, $C, $D, $offset, $temp)=@_;
277$code.="movdqa $temp, $tmp_store
278 movdqu 0*16 + $offset($inp), $temp
279 pxor $A, $temp
280 movdqu $temp, 0*16 + $offset($oup)
281 movdqu 1*16 + $offset($inp), $temp
282 pxor $B, $temp
283 movdqu $temp, 1*16 + $offset($oup)
284 movdqu 2*16 + $offset($inp), $temp
285 pxor $C, $temp
286 movdqu $temp, 2*16 + $offset($oup)
287 movdqu 3*16 + $offset($inp), $temp
288 pxor $D, $temp
289 movdqu $temp, 3*16 + $offset($oup)\n";
290}
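# The plain xor_stream above uses A3/B3/C3/D3 as scratch for the input loads,
# so it cannot emit the keystream block held in those very registers.
# xor_stream_using_temp handles that block instead: it spills a caller-chosen
# register to tmp_store and reuses it as the load/xor/store scratch register.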
291
292sub gen_chacha_round {
293my ($rot1, $rot2, $shift)=@_;
294my $round="";
295$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
296$round.="movdqa $rot2, $C0
297 paddd $B3, $A3
298 paddd $B2, $A2
299 paddd $B1, $A1
300 paddd $B0, $A0
301 pxor $A3, $D3
302 pxor $A2, $D2
303 pxor $A1, $D1
304 pxor $A0, $D0
305 pshufb $C0, $D3
306 pshufb $C0, $D2
307 pshufb $C0, $D1
308 pshufb $C0, $D0
309 movdqa $tmp_store, $C0
310 paddd $D3, $C3
311 paddd $D2, $C2
312 paddd $D1, $C1
313 paddd $D0, $C0
314 pxor $C3, $B3
315 pxor $C2, $B2
316 pxor $C1, $B1
317 pxor $C0, $B0
318 movdqa $C0, $tmp_store
319 movdqa $B3, $C0
320 psrld \$$rot1, $C0
321 pslld \$32-$rot1, $B3
322 pxor $C0, $B3
323 movdqa $B2, $C0
324 psrld \$$rot1, $C0
325 pslld \$32-$rot1, $B2
326 pxor $C0, $B2
327 movdqa $B1, $C0
328 psrld \$$rot1, $C0
329 pslld \$32-$rot1, $B1
330 pxor $C0, $B1
331 movdqa $B0, $C0
332 psrld \$$rot1, $C0
333 pslld \$32-$rot1, $B0
334 pxor $C0, $B0\n";
335($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
336($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
337$round.="movdqa $tmp_store, $C0
338 palignr \$$s1, $B3, $B3
339 palignr \$$s2, $C3, $C3
340 palignr \$$s3, $D3, $D3
341 palignr \$$s1, $B2, $B2
342 palignr \$$s2, $C2, $C2
343 palignr \$$s3, $D2, $D2
344 palignr \$$s1, $B1, $B1
345 palignr \$$s2, $C1, $C1
346 palignr \$$s3, $D1, $D1
347 palignr \$$s1, $B0, $B0
348 palignr \$$s2, $C0, $C0
349 palignr \$$s3, $D0, $D0\n"
350if (($shift =~ /left/) || ($shift =~ /right/));
351return $round;
352};
353
354$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
355 &gen_chacha_round(25, ".rol8(%rip)", "left") .
356 &gen_chacha_round(20, ".rol16(%rip)") .
357 &gen_chacha_round(25, ".rol8(%rip)", "right");
358
359my @loop_body = split /\n/, $chacha_body;
360
361sub emit_body {
362my ($n)=@_;
363 for (my $i=0; $i < $n; $i++) {
364 $code=$code.shift(@loop_body)."\n";
365 };
366}
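# The generated double-round above is kept as a flat instruction list in
# @loop_body; emit_body() then emits it a few instructions at a time so the
# Poly1305 stages can be interleaved with the ChaCha20 rounds in the main
# loops, which helps keep the scalar and vector pipelines busy simultaneously.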
367
368{
369################################################################################
370# void poly_hash_ad_internal();
371$code.="
372.type poly_hash_ad_internal,\@function,2
373.align 64
374poly_hash_ad_internal:
375.cfi_startproc
376 xor $acc0, $acc0
377 xor $acc1, $acc1
378 xor $acc2, $acc2
379 cmp \$13, $itr2
380 jne hash_ad_loop
381poly_fast_tls_ad:
382 # Special treatment for the TLS case of 13 bytes
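	# The legacy TLS AAD is seq_num(8) || type(1) || version(2) || length(2),
	# 13 bytes in all, zero-padded here to one 16-byte Poly1305 block: bytes
	# 0-7 fill the first limb, bytes 8-12 the second (the shr drops the three
	# bytes that were already loaded), and the 2^128 pad bit is the 1 placed
	# in the third limb.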
383 mov ($adp), $acc0
384 mov 5($adp), $acc1
385 shr \$24, $acc1
386 mov \$1, $acc2\n";
387 &poly_mul(); $code.="
388 ret
389hash_ad_loop:
390	# Hash in 16-byte chunks
391 cmp \$16, $itr2
392 jb hash_ad_tail\n";
393 &poly_add("0($adp)");
394 &poly_mul(); $code.="
395	lea 1*16($adp), $adp
396	sub \$16, $itr2
397 jmp hash_ad_loop
398hash_ad_tail:
399 cmp \$0, $itr2
400 je 1f
401 # Hash last < 16 byte tail
402 xor $t0, $t0
403 xor $t1, $t1
404 xor $t2, $t2
405 add $itr2, $adp
406hash_ad_tail_loop:
407 shld \$8, $t0, $t1
408 shl \$8, $t0
409 movzxb -1($adp), $t2
410 xor $t2, $t0
411 dec $adp
412 dec $itr2
413 jne hash_ad_tail_loop
414
415 add $t0, $acc0
416 adc $t1, $acc1
417 adc \$1, $acc2\n";
418 &poly_mul(); $code.="
419 # Finished AD
4201:
421 ret
422.cfi_endproc
423.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
424}
425
426{
427################################################################################
428# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
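# Arguments arrive in the SysV AMD64 order %rdi, %rsi, %rdx, %rcx, %r8, %r9.
# The input length is copied out of %rdx (into %rbx) because %rdx is clobbered
# by the Poly1305 multiplies, and the computed 16-byte tag is written back
# through keyp before returning.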
429$code.="
430.globl chacha20_poly1305_open
431.type chacha20_poly1305_open,\@function,2
432.align 64
433chacha20_poly1305_open:
434.cfi_startproc
435 push %rbp
436.cfi_adjust_cfa_offset 8
437 push %rbx
438.cfi_adjust_cfa_offset 8
439 push %r12
440.cfi_adjust_cfa_offset 8
441 push %r13
442.cfi_adjust_cfa_offset 8
443 push %r14
444.cfi_adjust_cfa_offset 8
445 push %r15
446.cfi_adjust_cfa_offset 8
447 # We write the calculated authenticator back to keyp at the end, so save
448 # the pointer on the stack too.
449 push $keyp
450.cfi_adjust_cfa_offset 8
451 sub \$288 + 32, %rsp
452.cfi_adjust_cfa_offset 288 + 32
453.cfi_offset rbp, -16
454.cfi_offset rbx, -24
455.cfi_offset r12, -32
456.cfi_offset r13, -40
457.cfi_offset r14, -48
458.cfi_offset r15, -56
459	lea 32(%rsp), %rbp
460 and \$-32, %rbp
461 mov %rdx, 8+$len_store
462 mov %r8, 0+$len_store
463 mov %rdx, $inl\n"; $code.="
464 mov OPENSSL_ia32cap_P+8(%rip), %eax
465 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
466 xor \$`(1<<5) + (1<<8)`, %eax
467 jz chacha20_poly1305_open_avx2\n" if ($avx>1);
468$code.="
4691:
470 cmp \$128, $inl
471 jbe open_sse_128
472 # For long buffers, prepare the poly key first
473 movdqa .chacha20_consts(%rip), $A0
474 movdqu 0*16($keyp), $B0
475 movdqu 1*16($keyp), $C0
476 movdqu 2*16($keyp), $D0
477 movdqa $D0, $T1
478 # Store on stack, to free keyp
479 movdqa $B0, $state1_store
480 movdqa $C0, $state2_store
481 movdqa $D0, $ctr0_store
482 mov \$10, $acc0
4831: \n";
484 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
485 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
486 dec $acc0
487 jne 1b
488 # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
489 paddd .chacha20_consts(%rip), $A0
490 paddd $state1_store, $B0
491 # Clamp and store the key
492 pand .clamp(%rip), $A0
493 movdqa $A0, $r_store
494 movdqa $B0, $s_store
495 # Hash
496 mov %r8, $itr2
497 call poly_hash_ad_internal
498open_sse_main_loop:
499 cmp \$16*16, $inl
500 jb 2f
501 # Load state, increment counter blocks\n";
502 &prep_state(4); $code.="
503 # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
504 # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
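	# That is 6*2 + 4*1 = 16 blocks of 16 bytes, matching the 256 bytes of
	# ciphertext decrypted by each pass of this loop.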
505 mov \$4, $itr1
506 mov $inp, $itr2
5071: \n";
508 &emit_body(20);
509 &poly_add("0($itr2)"); $code.="
510 lea 2*8($itr2), $itr2\n";
511 &emit_body(20);
512 &poly_stage1();
513 &emit_body(20);
514 &poly_stage2();
515 &emit_body(20);
516 &poly_stage3();
517 &emit_body(20);
518 &poly_reduce_stage();
519 foreach $l (@loop_body) {$code.=$l."\n";}
520 @loop_body = split /\n/, $chacha_body; $code.="
521 dec $itr1
522 jge 1b\n";
523 &poly_add("0($itr2)");
524 &poly_mul(); $code.="
525 lea 2*8($itr2), $itr2
526 cmp \$-6, $itr1
527 jg 1b\n";
528 &finalize_state(4);
529 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
530 &xor_stream($A2, $B2, $C2, $D2, "4*16");
531 &xor_stream($A1, $B1, $C1, $D1, "8*16");
532 &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
533 lea 16*16($inp), $inp
534 lea 16*16($oup), $oup
535 sub \$16*16, $inl
536 jmp open_sse_main_loop
5372:
538 # Handle the various tail sizes efficiently
539 test $inl, $inl
540 jz open_sse_finalize
541 cmp \$4*16, $inl
542 ja 3f\n";
543###############################################################################
544 # At most 64 bytes are left
545 &prep_state(1); $code.="
546 xor $itr2, $itr2
547 mov $inl, $itr1
548 cmp \$16, $itr1
549 jb 2f
5501: \n";
551 &poly_add("0($inp, $itr2)");
552 &poly_mul(); $code.="
553 sub \$16, $itr1
5542:
555 add \$16, $itr2\n";
556 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
557 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
558 cmp \$16, $itr1
559 jae 1b
560 cmp \$10*16, $itr2
561 jne 2b\n";
562 &finalize_state(1); $code.="
563 jmp open_sse_tail_64_dec_loop
5643:
565 cmp \$8*16, $inl
566 ja 3f\n";
567###############################################################################
568 # 65 - 128 bytes are left
569 &prep_state(2); $code.="
570 mov $inl, $itr1
571 and \$-16, $itr1
572 xor $itr2, $itr2
5731: \n";
574 &poly_add("0($inp, $itr2)");
575 &poly_mul(); $code.="
5762:
577 add \$16, $itr2\n";
578 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
579 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
580 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
581 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
582 cmp $itr1, $itr2
583 jb 1b
584 cmp \$10*16, $itr2
585 jne 2b\n";
586 &finalize_state(2);
587 &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
588 sub \$4*16, $inl
589 lea 4*16($inp), $inp
590 lea 4*16($oup), $oup
591 jmp open_sse_tail_64_dec_loop
5923:
593 cmp \$12*16, $inl
594 ja 3f\n";
595###############################################################################
596 # 129 - 192 bytes are left
597 &prep_state(3); $code.="
598 mov $inl, $itr1
599 mov \$10*16, $itr2
600 cmp \$10*16, $itr1
601 cmovg $itr2, $itr1
602 and \$-16, $itr1
603 xor $itr2, $itr2
6041: \n";
605 &poly_add("0($inp, $itr2)");
606 &poly_mul(); $code.="
6072:
608 add \$16, $itr2\n";
609 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
610 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
611 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
612 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
613 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
614 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
615 cmp $itr1, $itr2
616 jb 1b
617 cmp \$10*16, $itr2
618 jne 2b
619 cmp \$11*16, $inl
620 jb 1f\n";
621 &poly_add("10*16($inp)");
622 &poly_mul(); $code.="
623 cmp \$12*16, $inl
624 jb 1f\n";
625 &poly_add("11*16($inp)");
626 &poly_mul(); $code.="
6271: \n";
628 &finalize_state(3);
629 &xor_stream($A2, $B2, $C2, $D2, "0*16");
630 &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
631 sub \$8*16, $inl
632 lea 8*16($inp), $inp
633 lea 8*16($oup), $oup
634 jmp open_sse_tail_64_dec_loop
6353:
636###############################################################################\n";
637 # 193 - 255 bytes are left
638 &prep_state(4); $code.="
639 xor $itr2, $itr2
6401: \n";
641 &poly_add("0($inp, $itr2)");
642 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
643 &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
644 &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
645 &poly_stage1();
646 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
647 &poly_stage2();
648 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
649 &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
650 &poly_stage3();
651 &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
652 &poly_reduce_stage();
653 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
654 add \$16, $itr2
655 cmp \$10*16, $itr2
656 jb 1b
657 mov $inl, $itr1
658 and \$-16, $itr1
6591: \n";
660 &poly_add("0($inp, $itr2)");
661 &poly_mul(); $code.="
662 add \$16, $itr2
663 cmp $itr1, $itr2
664 jb 1b\n";
665 &finalize_state(4);
666 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
667 &xor_stream($A2, $B2, $C2, $D2, "4*16");
668 &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
669 movdqa $tmp_store, $D0
670 sub \$12*16, $inl
671 lea 12*16($inp), $inp
672 lea 12*16($oup), $oup
673###############################################################################
674 # Decrypt the remaining data, 16B at a time, using existing stream
675open_sse_tail_64_dec_loop:
676 cmp \$16, $inl
677 jb 1f
678 sub \$16, $inl
679 movdqu ($inp), $T0
680 pxor $T0, $A0
681 movdqu $A0, ($oup)
682 lea 16($inp), $inp
683 lea 16($oup), $oup
684 movdqa $B0, $A0
685 movdqa $C0, $B0
686 movdqa $D0, $C0
687 jmp open_sse_tail_64_dec_loop
6881:
689 movdqa $A0, $A1
690
691 # Decrypt up to 16 bytes at the end.
692open_sse_tail_16:
693 test $inl, $inl
694 jz open_sse_finalize
695
696 # Read the final bytes into $T0. They need to be read in reverse order so
697 # that they end up in the correct order in $T0.
698 pxor $T0, $T0
699 lea -1($inp, $inl), $inp
700 movq $inl, $itr2
7012:
702 pslldq \$1, $T0
703 pinsrb \$0, ($inp), $T0
704 sub \$1, $inp
705 sub \$1, $itr2
706 jnz 2b
707
7083:
709 movq $T0, $t0
710 pextrq \$1, $T0, $t1
711 # The final bytes of keystream are in $A1.
712 pxor $A1, $T0
713
714 # Copy the plaintext bytes out.
7152:
716 pextrb \$0, $T0, ($oup)
717 psrldq \$1, $T0
718 add \$1, $oup
719 sub \$1, $inl
720 jne 2b
721
722 add $t0, $acc0
723 adc $t1, $acc1
724 adc \$1, $acc2\n";
725 &poly_mul(); $code.="
726
727open_sse_finalize:\n";
728 &poly_add($len_store);
729 &poly_mul(); $code.="
730 # Final reduce
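	# The sub/sbb chain with constants -5, -1 and 3 computes acc + 5 - 2^130.
	# If the final sbb still borrows, acc was already below 2^130 - 5, and the
	# cmovc instructions keep the original limbs; otherwise the reduced value
	# is used.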
731 mov $acc0, $t0
732 mov $acc1, $t1
733 mov $acc2, $t2
734 sub \$-5, $acc0
735 sbb \$-1, $acc1
736 sbb \$3, $acc2
737 cmovc $t0, $acc0
738 cmovc $t1, $acc1
739 cmovc $t2, $acc2
740 # Add in s part of the key
741 add 0+$s_store, $acc0
742 adc 8+$s_store, $acc1
743
744 add \$288 + 32, %rsp
745.cfi_adjust_cfa_offset -(288 + 32)
746 pop $keyp
747.cfi_adjust_cfa_offset -8
748 movq $acc0, ($keyp)
749 movq $acc1, 8($keyp)
750
751 pop %r15
752.cfi_adjust_cfa_offset -8
753 pop %r14
754.cfi_adjust_cfa_offset -8
755 pop %r13
756.cfi_adjust_cfa_offset -8
757 pop %r12
758.cfi_adjust_cfa_offset -8
759 pop %rbx
760.cfi_adjust_cfa_offset -8
761 pop %rbp
762.cfi_adjust_cfa_offset -8
763 ret
764.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
765###############################################################################
766open_sse_128:
767 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
768 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
769 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
770 movdqu 2*16($keyp), $D0
771 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
772 movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
773 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
774 mov \$10, $acc0
7751: \n";
776 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
777 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
778 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
779 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
780 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
781 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
782 dec $acc0
783 jnz 1b
784 paddd .chacha20_consts(%rip), $A0
785 paddd .chacha20_consts(%rip), $A1
786 paddd .chacha20_consts(%rip), $A2
787 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
788 paddd $T2, $C1\npaddd $T2, $C2
789 paddd $T3, $D1
790 paddd .sse_inc(%rip), $T3
791 paddd $T3, $D2
792 # Clamp and store the key
793 pand .clamp(%rip), $A0
794 movdqa $A0, $r_store
795 movdqa $B0, $s_store
796 # Hash
797 mov %r8, $itr2
798 call poly_hash_ad_internal
7991:
800 cmp \$16, $inl
801 jb open_sse_tail_16
802 sub \$16, $inl\n";
803 # Load for hashing
804 &poly_add("0*8($inp)"); $code.="
805 # Load for decryption
806 movdqu 0*16($inp), $T0
807 pxor $T0, $A1
808 movdqu $A1, 0*16($oup)
809 lea 1*16($inp), $inp
810 lea 1*16($oup), $oup\n";
811 &poly_mul(); $code.="
812 # Shift the stream left
813 movdqa $B1, $A1
814 movdqa $C1, $B1
815 movdqa $D1, $C1
816 movdqa $A2, $D1
817 movdqa $B2, $A2
818 movdqa $C2, $B2
819 movdqa $D2, $C2
820 jmp 1b
821 jmp open_sse_tail_16
822.size chacha20_poly1305_open, .-chacha20_poly1305_open
823.cfi_endproc
824
825################################################################################
826################################################################################
827# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
828.globl chacha20_poly1305_seal
829.type chacha20_poly1305_seal,\@function,2
830.align 64
831chacha20_poly1305_seal:
832.cfi_startproc
833 push %rbp
834.cfi_adjust_cfa_offset 8
835 push %rbx
836.cfi_adjust_cfa_offset 8
837 push %r12
838.cfi_adjust_cfa_offset 8
839 push %r13
840.cfi_adjust_cfa_offset 8
841 push %r14
842.cfi_adjust_cfa_offset 8
843 push %r15
844.cfi_adjust_cfa_offset 8
845 # We write the calculated authenticator back to keyp at the end, so save
846 # the pointer on the stack too.
847 push $keyp
848.cfi_adjust_cfa_offset 8
849 sub \$288 + 32, %rsp
850.cfi_adjust_cfa_offset 288 + 32
851.cfi_offset rbp, -16
852.cfi_offset rbx, -24
853.cfi_offset r12, -32
854.cfi_offset r13, -40
855.cfi_offset r14, -48
856.cfi_offset r15, -56
857	lea 32(%rsp), %rbp
858 and \$-32, %rbp
859 mov %rdx, 8+$len_store
860 mov %r8, 0+$len_store
861 mov %rdx, $inl\n"; $code.="
862 mov OPENSSL_ia32cap_P+8(%rip), %eax
863 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
864 xor \$`(1<<5) + (1<<8)`, %eax
865 jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
866$code.="
867 cmp \$128, $inl
868 jbe seal_sse_128
869 # For longer buffers, prepare the poly key + some stream
870 movdqa .chacha20_consts(%rip), $A0
871 movdqu 0*16($keyp), $B0
872 movdqu 1*16($keyp), $C0
873 movdqu 2*16($keyp), $D0
874 movdqa $A0, $A1
875 movdqa $A0, $A2
876 movdqa $A0, $A3
877 movdqa $B0, $B1
878 movdqa $B0, $B2
879 movdqa $B0, $B3
880 movdqa $C0, $C1
881 movdqa $C0, $C2
882 movdqa $C0, $C3
883 movdqa $D0, $D3
884 paddd .sse_inc(%rip), $D0
885 movdqa $D0, $D2
886 paddd .sse_inc(%rip), $D0
887 movdqa $D0, $D1
888 paddd .sse_inc(%rip), $D0
889 # Store on stack
890 movdqa $B0, $state1_store
891 movdqa $C0, $state2_store
892 movdqa $D0, $ctr0_store
893 movdqa $D1, $ctr1_store
894 movdqa $D2, $ctr2_store
895 movdqa $D3, $ctr3_store
896 mov \$10, $acc0
8971: \n";
898 foreach $l (@loop_body) {$code.=$l."\n";}
899 @loop_body = split /\n/, $chacha_body; $code.="
900 dec $acc0
901 jnz 1b\n";
902 &finalize_state(4); $code.="
903 # Clamp and store the key
904 pand .clamp(%rip), $A3
905 movdqa $A3, $r_store
906 movdqa $B3, $s_store
907 # Hash
908 mov %r8, $itr2
909 call poly_hash_ad_internal\n";
910 &xor_stream($A2,$B2,$C2,$D2,"0*16");
911 &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
912 cmp \$12*16, $inl
913 ja 1f
914 mov \$8*16, $itr1
915 sub \$8*16, $inl
916 lea 8*16($inp), $inp
917 jmp seal_sse_128_seal_hash
9181: \n";
919 &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
920 mov \$12*16, $itr1
921 sub \$12*16, $inl
922 lea 12*16($inp), $inp
923 mov \$2, $itr1
924 mov \$8, $itr2
925 cmp \$4*16, $inl
926 jbe seal_sse_tail_64
927 cmp \$8*16, $inl
928 jbe seal_sse_tail_128
929 cmp \$12*16, $inl
930 jbe seal_sse_tail_192
931
9321: \n";
933 # The main loop
934 &prep_state(4); $code.="
9352: \n";
936 &emit_body(20);
937 &poly_add("0($oup)");
938 &emit_body(20);
939 &poly_stage1();
940 &emit_body(20);
941 &poly_stage2();
942 &emit_body(20);
943 &poly_stage3();
944 &emit_body(20);
945 &poly_reduce_stage();
946 foreach $l (@loop_body) {$code.=$l."\n";}
947 @loop_body = split /\n/, $chacha_body; $code.="
948 lea 16($oup), $oup
949 dec $itr2
950 jge 2b\n";
951 &poly_add("0*8($oup)");
952 &poly_mul(); $code.="
953 lea 16($oup), $oup
954 dec $itr1
955 jg 2b\n";
956
957 &finalize_state(4);$code.="
958 movdqa $D2, $tmp_store\n";
959 &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
960 movdqa $tmp_store, $D2\n";
961 &xor_stream($A2,$B2,$C2,$D2, 4*16);
962 &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
963 cmp \$16*16, $inl
964 ja 3f
965
966 mov \$12*16, $itr1
967 sub \$12*16, $inl
968 lea 12*16($inp), $inp
969 jmp seal_sse_128_seal_hash
9703: \n";
971 &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
972 lea 16*16($inp), $inp
973 sub \$16*16, $inl
974 mov \$6, $itr1
975 mov \$4, $itr2
976 cmp \$12*16, $inl
977 jg 1b
978 mov $inl, $itr1
979 test $inl, $inl
980 je seal_sse_128_seal_hash
981 mov \$6, $itr1
982 cmp \$4*16, $inl
983 jg 3f
984###############################################################################
985seal_sse_tail_64:\n";
986 &prep_state(1); $code.="
9871: \n";
988 &poly_add("0($oup)");
989 &poly_mul(); $code.="
990 lea 16($oup), $oup
9912: \n";
992 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
993 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
994 &poly_add("0($oup)");
995 &poly_mul(); $code.="
996 lea 16($oup), $oup
997 dec $itr1
998 jg 1b
999 dec $itr2
1000 jge 2b\n";
1001 &finalize_state(1); $code.="
1002 jmp seal_sse_128_seal
10033:
1004 cmp \$8*16, $inl
1005 jg 3f
1006###############################################################################
1007seal_sse_tail_128:\n";
1008 &prep_state(2); $code.="
10091: \n";
1010 &poly_add("0($oup)");
1011 &poly_mul(); $code.="
1012 lea 16($oup), $oup
10132: \n";
1014 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1015 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1016 &poly_add("0($oup)");
1017 &poly_mul();
1018 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1019 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
1020 lea 16($oup), $oup
1021 dec $itr1
1022 jg 1b
1023 dec $itr2
1024 jge 2b\n";
1025 &finalize_state(2);
1026 &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
1027 mov \$4*16, $itr1
1028 sub \$4*16, $inl
1029 lea 4*16($inp), $inp
1030 jmp seal_sse_128_seal_hash
10313:
1032###############################################################################
1033seal_sse_tail_192:\n";
1034 &prep_state(3); $code.="
10351: \n";
1036 &poly_add("0($oup)");
1037 &poly_mul(); $code.="
1038 lea 16($oup), $oup
10392: \n";
1040 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1041 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1042 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
1043 &poly_add("0($oup)");
1044 &poly_mul();
1045 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1046 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
1047 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1048 lea 16($oup), $oup
1049 dec $itr1
1050 jg 1b
1051 dec $itr2
1052 jge 2b\n";
1053 &finalize_state(3);
1054 &xor_stream($A2,$B2,$C2,$D2,0*16);
1055 &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
1056 mov \$8*16, $itr1
1057 sub \$8*16, $inl
1058 lea 8*16($inp), $inp
1059###############################################################################
1060seal_sse_128_seal_hash:
1061 cmp \$16, $itr1
1062 jb seal_sse_128_seal\n";
1063 &poly_add("0($oup)");
1064 &poly_mul(); $code.="
1065 sub \$16, $itr1
1066 lea 16($oup), $oup
1067 jmp seal_sse_128_seal_hash
1068
1069seal_sse_128_seal:
1070 cmp \$16, $inl
1071 jb seal_sse_tail_16
1072 sub \$16, $inl
1073 # Load for decryption
1074 movdqu 0*16($inp), $T0
1075 pxor $T0, $A0
1076 movdqu $A0, 0*16($oup)
1077 # Then hash
1078 add 0*8($oup), $acc0
1079 adc 1*8($oup), $acc1
1080 adc \$1, $acc2
1081 lea 1*16($inp), $inp
1082 lea 1*16($oup), $oup\n";
1083 &poly_mul(); $code.="
1084 # Shift the stream left
1085 movdqa $B0, $A0
1086 movdqa $C0, $B0
1087 movdqa $D0, $C0
1088 movdqa $A1, $D0
1089 movdqa $B1, $A1
1090 movdqa $C1, $B1
1091 movdqa $D1, $C1
1092 jmp seal_sse_128_seal
1093
1094seal_sse_tail_16:
1095 test $inl, $inl
1096 jz seal_sse_finalize
1097 # We can only load the PT one byte at a time to avoid buffer overread
1098 mov $inl, $itr2
1099 shl \$4, $itr2
1100 lea .and_masks(%rip), $t0
1101 mov $inl, $itr1
1102 lea -1($inp, $inl), $inp
1103 pxor $T3, $T3
11041:
1105 pslldq \$1, $T3
1106 pinsrb \$0, ($inp), $T3
1107 lea -1($inp), $inp
1108 dec $itr1
1109 jne 1b
1110
1111 # XOR the keystream with the plaintext.
1112 pxor $A0, $T3
1113
1114 # Write ciphertext out, byte-by-byte.
1115 movq $inl, $itr1
1116 movdqu $T3, $A0
11172:
1118 pextrb \$0, $A0, ($oup)
1119 psrldq \$1, $A0
1120 add \$1, $oup
1121 sub \$1, $itr1
1122 jnz 2b
1123
1124 pand -16($t0, $itr2), $T3
1125 movq $T3, $t0
1126 pextrq \$1, $T3, $t1
1127 add $t0, $acc0
1128 adc $t1, $acc1
1129 adc \$1, $acc2\n";
1130 &poly_mul(); $code.="
1131seal_sse_finalize:\n";
1132 &poly_add($len_store);
1133 &poly_mul(); $code.="
1134 # Final reduce
1135 mov $acc0, $t0
1136 mov $acc1, $t1
1137 mov $acc2, $t2
1138 sub \$-5, $acc0
1139 sbb \$-1, $acc1
1140 sbb \$3, $acc2
1141 cmovc $t0, $acc0
1142 cmovc $t1, $acc1
1143 cmovc $t2, $acc2
1144 # Add in s part of the key
1145 add 0+$s_store, $acc0
1146 adc 8+$s_store, $acc1
1147
1148 add \$288 + 32, %rsp
1149.cfi_adjust_cfa_offset -(288 + 32)
1150 pop $keyp
1151.cfi_adjust_cfa_offset -8
1152 mov $acc0, 0*8($keyp)
1153 mov $acc1, 1*8($keyp)
1154
1155 pop %r15
1156.cfi_adjust_cfa_offset -8
1157 pop %r14
1158.cfi_adjust_cfa_offset -8
1159 pop %r13
1160.cfi_adjust_cfa_offset -8
1161 pop %r12
1162.cfi_adjust_cfa_offset -8
1163 pop %rbx
1164.cfi_adjust_cfa_offset -8
1165 pop %rbp
1166.cfi_adjust_cfa_offset -8
1167 ret
1168.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
1169################################################################################
1170seal_sse_128:
1171 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
1172 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
1173 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
1174 movdqu 2*16($keyp), $D2
1175 movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
1176 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
1177 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
1178 mov \$10, $acc0
11791:\n";
1180 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1181 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1182 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
1183 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1184 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
1185 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1186 dec $acc0
1187 jnz 1b
1188 paddd .chacha20_consts(%rip), $A0
1189 paddd .chacha20_consts(%rip), $A1
1190 paddd .chacha20_consts(%rip), $A2
1191 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
1192 paddd $T2, $C0\npaddd $T2, $C1
1193 paddd $T3, $D0
1194 paddd .sse_inc(%rip), $T3
1195 paddd $T3, $D1
1196 # Clamp and store the key
1197 pand .clamp(%rip), $A2
1198 movdqa $A2, $r_store
1199 movdqa $B2, $s_store
1200 # Hash
1201 mov %r8, $itr2
1202 call poly_hash_ad_internal
1203 jmp seal_sse_128_seal
1204.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
1205}
1206
1207# There should have been a cfi_endproc at the end of that function, but the two
1208# following blocks of code are jumped to without setting up a stack frame, and the CFI
1209# context in which they are used happens to match the CFI context at the end of
1210# the previous function. So the CFI table is just extended to the end of them.
1211
1212if ($avx>1) {
1213
1214($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
1215my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
1216($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
1217$state1_store="2*32(%rbp)";
1218$state2_store="3*32(%rbp)";
1219$tmp_store="4*32(%rbp)";
1220$ctr0_store="5*32(%rbp)";
1221$ctr1_store="6*32(%rbp)";
1222$ctr2_store="7*32(%rbp)";
1223$ctr3_store="8*32(%rbp)";
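# The AVX2 code reuses the same %rbp-relative scratch area as the SSE code,
# but laid out in 32-byte slots so that each slot can hold a full YMM register.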
1224
1225sub chacha_qr_avx2 {
1226my ($a,$b,$c,$d,$t,$dir)=@_;
1227$code.=<<___ if ($dir =~ /store/);
1228 vmovdqa $t, $tmp_store
1229___
1230$code.=<<___;
1231 vpaddd $b, $a, $a
1232 vpxor $a, $d, $d
1233 vpshufb .rol16(%rip), $d, $d
1234 vpaddd $d, $c, $c
1235 vpxor $c, $b, $b
1236 vpsrld \$20, $b, $t
1237 vpslld \$12, $b, $b
1238 vpxor $t, $b, $b
1239 vpaddd $b, $a, $a
1240 vpxor $a, $d, $d
1241 vpshufb .rol8(%rip), $d, $d
1242 vpaddd $d, $c, $c
1243 vpxor $c, $b, $b
1244 vpslld \$7, $b, $t
1245 vpsrld \$25, $b, $b
1246 vpxor $t, $b, $b
1247___
1248$code.=<<___ if ($dir =~ /left/);
1249 vpalignr \$12, $d, $d, $d
1250 vpalignr \$8, $c, $c, $c
1251 vpalignr \$4, $b, $b, $b
1252___
1253$code.=<<___ if ($dir =~ /right/);
1254 vpalignr \$4, $d, $d, $d
1255 vpalignr \$8, $c, $c, $c
1256 vpalignr \$12, $b, $b, $b
1257___
1258$code.=<<___ if ($dir =~ /load/);
1259 vmovdqa $tmp_store, $t
1260___
1261}
1262
1263sub prep_state_avx2 {
1264my ($n)=@_;
1265$code.=<<___;
1266 vmovdqa .chacha20_consts(%rip), $A0
1267 vmovdqa $state1_store, $B0
1268 vmovdqa $state2_store, $C0
1269___
1270$code.=<<___ if ($n ge 2);
1271 vmovdqa $A0, $A1
1272 vmovdqa $B0, $B1
1273 vmovdqa $C0, $C1
1274___
1275$code.=<<___ if ($n ge 3);
1276 vmovdqa $A0, $A2
1277 vmovdqa $B0, $B2
1278 vmovdqa $C0, $C2
1279___
1280$code.=<<___ if ($n ge 4);
1281 vmovdqa $A0, $A3
1282 vmovdqa $B0, $B3
1283 vmovdqa $C0, $C3
1284___
1285$code.=<<___ if ($n eq 1);
1286 vmovdqa .avx2_inc(%rip), $D0
1287 vpaddd $ctr0_store, $D0, $D0
1288 vmovdqa $D0, $ctr0_store
1289___
1290$code.=<<___ if ($n eq 2);
1291 vmovdqa .avx2_inc(%rip), $D0
1292 vpaddd $ctr0_store, $D0, $D1
1293 vpaddd $D1, $D0, $D0
1294 vmovdqa $D0, $ctr0_store
1295 vmovdqa $D1, $ctr1_store
1296___
1297$code.=<<___ if ($n eq 3);
1298 vmovdqa .avx2_inc(%rip), $D0
1299 vpaddd $ctr0_store, $D0, $D2
1300 vpaddd $D2, $D0, $D1
1301 vpaddd $D1, $D0, $D0
1302 vmovdqa $D0, $ctr0_store
1303 vmovdqa $D1, $ctr1_store
1304 vmovdqa $D2, $ctr2_store
1305___
1306$code.=<<___ if ($n eq 4);
1307 vmovdqa .avx2_inc(%rip), $D0
1308 vpaddd $ctr0_store, $D0, $D3
1309 vpaddd $D3, $D0, $D2
1310 vpaddd $D2, $D0, $D1
1311 vpaddd $D1, $D0, $D0
1312 vmovdqa $D3, $ctr3_store
1313 vmovdqa $D2, $ctr2_store
1314 vmovdqa $D1, $ctr1_store
1315 vmovdqa $D0, $ctr0_store
1316___
1317}
1318
1319sub finalize_state_avx2 {
1320my ($n)=@_;
1321$code.=<<___ if ($n eq 4);
1322 vpaddd .chacha20_consts(%rip), $A3, $A3
1323 vpaddd $state1_store, $B3, $B3
1324 vpaddd $state2_store, $C3, $C3
1325 vpaddd $ctr3_store, $D3, $D3
1326___
1327$code.=<<___ if ($n ge 3);
1328 vpaddd .chacha20_consts(%rip), $A2, $A2
1329 vpaddd $state1_store, $B2, $B2
1330 vpaddd $state2_store, $C2, $C2
1331 vpaddd $ctr2_store, $D2, $D2
1332___
1333$code.=<<___ if ($n ge 2);
1334 vpaddd .chacha20_consts(%rip), $A1, $A1
1335 vpaddd $state1_store, $B1, $B1
1336 vpaddd $state2_store, $C1, $C1
1337 vpaddd $ctr1_store, $D1, $D1
1338___
1339$code.=<<___;
1340 vpaddd .chacha20_consts(%rip), $A0, $A0
1341 vpaddd $state1_store, $B0, $B0
1342 vpaddd $state2_store, $C0, $C0
1343 vpaddd $ctr0_store, $D0, $D0
1344___
1345}
1346
1347sub xor_stream_avx2 {
1348my ($A, $B, $C, $D, $offset, $hlp)=@_;
1349$code.=<<___;
1350 vperm2i128 \$0x02, $A, $B, $hlp
1351 vperm2i128 \$0x13, $A, $B, $B
1352 vperm2i128 \$0x02, $C, $D, $A
1353 vperm2i128 \$0x13, $C, $D, $C
1354 vpxor 0*32+$offset($inp), $hlp, $hlp
1355 vpxor 1*32+$offset($inp), $A, $A
1356 vpxor 2*32+$offset($inp), $B, $B
1357 vpxor 3*32+$offset($inp), $C, $C
1358 vmovdqu $hlp, 0*32+$offset($oup)
1359 vmovdqu $A, 1*32+$offset($oup)
1360 vmovdqu $B, 2*32+$offset($oup)
1361 vmovdqu $C, 3*32+$offset($oup)
1362___
1363}
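# In the AVX2 code each YMM register holds the same ChaCha20 row for two
# consecutive blocks, one per 128-bit lane (the 32-byte load at .avx2_init
# runs into the adjacent .sse_inc data, so the second lane starts one counter
# ahead, and .avx2_inc then advances both lanes by two). xor_stream_avx2 uses
# vperm2i128 to regroup the lane halves into contiguous 64-byte keystream
# blocks before XORing them against the input.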
1364
1365sub finish_stream_avx2 {
1366my ($A, $B, $C, $D, $hlp)=@_;
1367$code.=<<___;
1368 vperm2i128 \$0x13, $A, $B, $hlp
1369 vperm2i128 \$0x02, $A, $B, $A
1370 vperm2i128 \$0x02, $C, $D, $B
1371 vperm2i128 \$0x13, $C, $D, $D
1372 vmovdqa $hlp, $C
1373___
1374}
1375
1376sub poly_stage1_mulx {
1377$code.=<<___;
1378 mov 0+$r_store, %rdx
1379 mov %rdx, $t2
1380 mulx $acc0, $t0, $t1
1381 mulx $acc1, %rax, %rdx
1382	imulq $acc2, $t2
1383	add %rax, $t1
1384 adc %rdx, $t2
1385___
1386}
1387
1388sub poly_stage2_mulx {
1389$code.=<<___;
1390 mov 8+$r_store, %rdx
1391 mulx $acc0, $acc0, %rax
1392 add $acc0, $t1
1393 mulx $acc1, $acc1, $t3
1394 adc $acc1, $t2
1395 adc \$0, $t3
1396	imulq $acc2, %rdx
1397___
1398}
1399
1400sub poly_stage3_mulx {
1401$code.=<<___;
1402 add %rax, $t2
1403 adc %rdx, $t3
1404___
1405}
1406
1407sub poly_mul_mulx {
1408 &poly_stage1_mulx();
1409 &poly_stage2_mulx();
1410 &poly_stage3_mulx();
1411 &poly_reduce_stage();
1412}
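# The mulx variants are only reached on the AVX2 path, which is gated on BMI2
# as well (both bits are checked together before branching here), so the
# Poly1305 multiply can use mulx: it takes one operand implicitly from %rdx
# and leaves the flags untouched, letting the add/adc chains be scheduled
# around it freely.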
1413
1414sub gen_chacha_round_avx2 {
1415my ($rot1, $rot2, $shift)=@_;
1416my $round="";
1417$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
1418$round=$round ."vmovdqa $rot2, $C0
1419 vpaddd $B3, $A3, $A3
1420 vpaddd $B2, $A2, $A2
1421 vpaddd $B1, $A1, $A1
1422 vpaddd $B0, $A0, $A0
1423 vpxor $A3, $D3, $D3
1424 vpxor $A2, $D2, $D2
1425 vpxor $A1, $D1, $D1
1426 vpxor $A0, $D0, $D0
1427 vpshufb $C0, $D3, $D3
1428 vpshufb $C0, $D2, $D2
1429 vpshufb $C0, $D1, $D1
1430 vpshufb $C0, $D0, $D0
1431 vmovdqa $tmp_store, $C0
1432 vpaddd $D3, $C3, $C3
1433 vpaddd $D2, $C2, $C2
1434 vpaddd $D1, $C1, $C1
1435 vpaddd $D0, $C0, $C0
1436 vpxor $C3, $B3, $B3
1437 vpxor $C2, $B2, $B2
1438 vpxor $C1, $B1, $B1
1439 vpxor $C0, $B0, $B0
1440 vmovdqa $C0, $tmp_store
1441 vpsrld \$$rot1, $B3, $C0
1442 vpslld \$32-$rot1, $B3, $B3
1443 vpxor $C0, $B3, $B3
1444 vpsrld \$$rot1, $B2, $C0
1445 vpslld \$32-$rot1, $B2, $B2
1446 vpxor $C0, $B2, $B2
1447 vpsrld \$$rot1, $B1, $C0
1448 vpslld \$32-$rot1, $B1, $B1
1449 vpxor $C0, $B1, $B1
1450 vpsrld \$$rot1, $B0, $C0
1451 vpslld \$32-$rot1, $B0, $B0
1452 vpxor $C0, $B0, $B0\n";
1453($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
1454($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
1455$round=$round ."vmovdqa $tmp_store, $C0
1456 vpalignr \$$s1, $B3, $B3, $B3
1457 vpalignr \$$s2, $C3, $C3, $C3
1458 vpalignr \$$s3, $D3, $D3, $D3
1459 vpalignr \$$s1, $B2, $B2, $B2
1460 vpalignr \$$s2, $C2, $C2, $C2
1461 vpalignr \$$s3, $D2, $D2, $D2
1462 vpalignr \$$s1, $B1, $B1, $B1
1463 vpalignr \$$s2, $C1, $C1, $C1
1464 vpalignr \$$s3, $D1, $D1, $D1
1465 vpalignr \$$s1, $B0, $B0, $B0
1466 vpalignr \$$s2, $C0, $C0, $C0
1467 vpalignr \$$s3, $D0, $D0, $D0\n"
1468if (($shift =~ /left/) || ($shift =~ /right/));
1469return $round;
1470};
1471
1472$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
1473 &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
1474 &gen_chacha_round_avx2(20, ".rol16(%rip)") .
1475 &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
1476
1477@loop_body = split /\n/, $chacha_body;
1478
1479$code.="
1480###############################################################################
1481.type chacha20_poly1305_open_avx2,\@function,2
1482.align 64
1483chacha20_poly1305_open_avx2:
1484 vzeroupper
1485 vmovdqa .chacha20_consts(%rip), $A0
1486 vbroadcasti128 0*16($keyp), $B0
1487 vbroadcasti128 1*16($keyp), $C0
1488 vbroadcasti128 2*16($keyp), $D0
1489 vpaddd .avx2_init(%rip), $D0, $D0
1490 cmp \$6*32, $inl
1491 jbe open_avx2_192
1492 cmp \$10*32, $inl
1493 jbe open_avx2_320
1494
1495 vmovdqa $B0, $state1_store
1496 vmovdqa $C0, $state2_store
1497 vmovdqa $D0, $ctr0_store
1498 mov \$10, $acc0
14991: \n";
1500 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1501 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1502 dec $acc0
1503 jne 1b
1504 vpaddd .chacha20_consts(%rip), $A0, $A0
1505 vpaddd $state1_store, $B0, $B0
1506 vpaddd $state2_store, $C0, $C0
1507 vpaddd $ctr0_store, $D0, $D0
1508
1509 vperm2i128 \$0x02, $A0, $B0, $T0
1510 # Clamp and store key
1511 vpand .clamp(%rip), $T0, $T0
1512 vmovdqa $T0, $r_store
1513 # Stream for the first 64 bytes
1514 vperm2i128 \$0x13, $A0, $B0, $A0
1515 vperm2i128 \$0x13, $C0, $D0, $B0
1516 # Hash AD + first 64 bytes
1517 mov %r8, $itr2
1518 call poly_hash_ad_internal
1519 xor $itr1, $itr1
1520 # Hash first 64 bytes
15211: \n";
1522 &poly_add("0($inp, $itr1)");
1523 &poly_mul(); $code.="
1524 add \$16, $itr1
1525 cmp \$2*32, $itr1
1526 jne 1b
1527 # Decrypt first 64 bytes
1528 vpxor 0*32($inp), $A0, $A0
1529 vpxor 1*32($inp), $B0, $B0
1530 vmovdqu $A0, 0*32($oup)
1531 vmovdqu $B0, 1*32($oup)
1532 lea 2*32($inp), $inp
1533 lea 2*32($oup), $oup
1534 sub \$2*32, $inl
15351:
1536 # Hash and decrypt 512 bytes each iteration
1537 cmp \$16*32, $inl
1538 jb 3f\n";
1539 &prep_state_avx2(4); $code.="
1540 xor $itr1, $itr1
15412: \n";
1542 &poly_add("0*8($inp, $itr1)");
1543 &emit_body(10);
1544 &poly_stage1_mulx();
1545 &emit_body(9);
1546 &poly_stage2_mulx();
1547 &emit_body(12);
1548 &poly_stage3_mulx();
1549 &emit_body(10);
1550 &poly_reduce_stage();
1551 &emit_body(9);
1552 &poly_add("2*8($inp, $itr1)");
1553 &emit_body(8);
1554 &poly_stage1_mulx();
1555 &emit_body(18);
1556 &poly_stage2_mulx();
1557 &emit_body(18);
1558 &poly_stage3_mulx();
1559 &emit_body(9);
1560 &poly_reduce_stage();
1561 &emit_body(8);
1562 &poly_add("4*8($inp, $itr1)"); $code.="
1563 lea 6*8($itr1), $itr1\n";
1564 &emit_body(18);
1565 &poly_stage1_mulx();
1566 &emit_body(8);
1567 &poly_stage2_mulx();
1568 &emit_body(8);
1569 &poly_stage3_mulx();
1570 &emit_body(18);
1571 &poly_reduce_stage();
1572 foreach $l (@loop_body) {$code.=$l."\n";}
1573 @loop_body = split /\n/, $chacha_body; $code.="
1574 cmp \$10*6*8, $itr1
1575 jne 2b\n";
1576 &finalize_state_avx2(4); $code.="
1577 vmovdqa $A0, $tmp_store\n";
1578 &poly_add("10*6*8($inp)");
1579 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
1580 vmovdqa $tmp_store, $A0\n";
1581 &poly_mul();
1582 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
1583 &poly_add("10*6*8+2*8($inp)");
1584 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
1585 &poly_mul();
1586 &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
1587 lea 16*32($inp), $inp
1588 lea 16*32($oup), $oup
1589 sub \$16*32, $inl
1590 jmp 1b
15913:
1592 test $inl, $inl
1593 vzeroupper
1594 je open_sse_finalize
15953:
1596 cmp \$4*32, $inl
1597 ja 3f\n";
1598###############################################################################
1599 # 1-128 bytes left
1600 &prep_state_avx2(1); $code.="
1601 xor $itr2, $itr2
1602 mov $inl, $itr1
1603 and \$-16, $itr1
1604 test $itr1, $itr1
1605 je 2f
16061: \n";
1607 &poly_add("0*8($inp, $itr2)");
1608 &poly_mul(); $code.="
16092:
1610 add \$16, $itr2\n";
1611 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1612 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1613 cmp $itr1, $itr2
1614 jb 1b
1615 cmp \$160, $itr2
1616 jne 2b\n";
1617 &finalize_state_avx2(1);
1618 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
1619 jmp open_avx2_tail_loop
16203:
1621 cmp \$8*32, $inl
1622 ja 3f\n";
1623###############################################################################
1624 # 129-256 bytes left
1625 &prep_state_avx2(2); $code.="
1626 mov $inl, $tmp_store
1627 mov $inl, $itr1
1628 sub \$4*32, $itr1
1629 shr \$4, $itr1
1630 mov \$10, $itr2
1631 cmp \$10, $itr1
1632 cmovg $itr2, $itr1
1633 mov $inp, $inl
1634 xor $itr2, $itr2
16351: \n";
1636 &poly_add("0*8($inl)");
1637 &poly_mul_mulx(); $code.="
1638 lea 16($inl), $inl
16392: \n";
1640 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1641 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
1642 inc $itr2\n";
1643 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1644 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1645 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1646 cmp $itr1, $itr2
1647 jb 1b
1648 cmp \$10, $itr2
1649 jne 2b
1650 mov $inl, $itr2
1651 sub $inp, $inl
1652 mov $inl, $itr1
1653 mov $tmp_store, $inl
16541:
1655 add \$16, $itr1
1656 cmp $inl, $itr1
1657 jg 1f\n";
1658 &poly_add("0*8($itr2)");
1659 &poly_mul_mulx(); $code.="
1660 lea 16($itr2), $itr2
1661 jmp 1b
16621: \n";
1663 &finalize_state_avx2(2);
1664 &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
1665 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
1666 lea 4*32($inp), $inp
1667 lea 4*32($oup), $oup
1668 sub \$4*32, $inl
1669 jmp open_avx2_tail_loop
16703:
1671 cmp \$12*32, $inl
1672 ja 3f\n";
1673###############################################################################
1674 # 257-383 bytes left
1675 &prep_state_avx2(3); $code.="
1676 mov $inl, $tmp_store
1677 mov $inl, $itr1
1678 sub \$8*32, $itr1
1679 shr \$4, $itr1
1680 add \$6, $itr1
1681 mov \$10, $itr2
1682 cmp \$10, $itr1
1683 cmovg $itr2, $itr1
1684 mov $inp, $inl
1685 xor $itr2, $itr2
16861: \n";
1687 &poly_add("0*8($inl)");
1688 &poly_mul_mulx(); $code.="
1689 lea 16($inl), $inl
16902: \n";
1691 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
1692 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1693 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1694 &poly_add("0*8($inl)");
1695 &poly_mul(); $code.="
1696 lea 16($inl), $inl
1697 inc $itr2\n";
1698 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
1699 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1700 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1701 cmp $itr1, $itr2
1702 jb 1b
1703 cmp \$10, $itr2
1704 jne 2b
1705 mov $inl, $itr2
1706 sub $inp, $inl
1707 mov $inl, $itr1
1708 mov $tmp_store, $inl
17091:
1710 add \$16, $itr1
1711 cmp $inl, $itr1
1712 jg 1f\n";
1713 &poly_add("0*8($itr2)");
1714 &poly_mul_mulx(); $code.="
1715 lea 16($itr2), $itr2
1716 jmp 1b
17171: \n";
1718 &finalize_state_avx2(3);
1719 &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
1720 &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
1721 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
1722 lea 8*32($inp), $inp
1723 lea 8*32($oup), $oup
1724 sub \$8*32, $inl
1725 jmp open_avx2_tail_loop
17263: \n";
1727###############################################################################
1728 # 384-512 bytes left
1729 &prep_state_avx2(4); $code.="
1730 xor $itr1, $itr1
1731 mov $inp, $itr2
17321: \n";
1733 &poly_add("0*8($itr2)");
1734 &poly_mul(); $code.="
1735 lea 2*8($itr2), $itr2
17362: \n";
1737 &emit_body(37);
1738 &poly_add("0*8($itr2)");
1739 &poly_mul_mulx();
1740 &emit_body(48);
1741 &poly_add("2*8($itr2)");
1742 &poly_mul_mulx(); $code.="
1743 lea 4*8($itr2), $itr2\n";
1744 foreach $l (@loop_body) {$code.=$l."\n";}
1745 @loop_body = split /\n/, $chacha_body; $code.="
1746 inc $itr1
1747 cmp \$4, $itr1
1748 jl 1b
1749 cmp \$10, $itr1
1750 jne 2b
1751 mov $inl, $itr1
1752 sub \$12*32, $itr1
1753 and \$-16, $itr1
17541:
1755 test $itr1, $itr1
1756 je 1f\n";
1757 &poly_add("0*8($itr2)");
1758 &poly_mul_mulx(); $code.="
1759 lea 2*8($itr2), $itr2
1760 sub \$2*8, $itr1
1761 jmp 1b
17621: \n";
1763 &finalize_state_avx2(4); $code.="
1764 vmovdqa $A0, $tmp_store\n";
1765 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
1766 vmovdqa $tmp_store, $A0\n";
1767 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
1768 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
1769 &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
1770 lea 12*32($inp), $inp
1771 lea 12*32($oup), $oup
1772 sub \$12*32, $inl
1773open_avx2_tail_loop:
1774 cmp \$32, $inl
1775 jb open_avx2_tail
1776 sub \$32, $inl
1777 vpxor ($inp), $A0, $A0
1778 vmovdqu $A0, ($oup)
1779 lea 1*32($inp), $inp
1780 lea 1*32($oup), $oup
1781 vmovdqa $B0, $A0
1782 vmovdqa $C0, $B0
1783 vmovdqa $D0, $C0
1784 jmp open_avx2_tail_loop
1785open_avx2_tail:
1786 cmp \$16, $inl
1787 vmovdqa $A0x, $A1x
1788 jb 1f
1789 sub \$16, $inl
1790 #load for decryption
1791 vpxor ($inp), $A0x, $A1x
1792 vmovdqu $A1x, ($oup)
1793 lea 1*16($inp), $inp
1794 lea 1*16($oup), $oup
1795 vperm2i128 \$0x11, $A0, $A0, $A0
1796 vmovdqa $A0x, $A1x
17971:
1798 vzeroupper
1799 jmp open_sse_tail_16
1800###############################################################################
1801open_avx2_192:
1802 vmovdqa $A0, $A1
1803 vmovdqa $A0, $A2
1804 vmovdqa $B0, $B1
1805 vmovdqa $B0, $B2
1806 vmovdqa $C0, $C1
1807 vmovdqa $C0, $C2
1808 vpaddd .avx2_inc(%rip), $D0, $D1
1809 vmovdqa $D0, $T2
1810 vmovdqa $D1, $T3
1811 mov \$10, $acc0
18121: \n";
1813 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1814 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1815 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1816 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
1817 dec $acc0
1818 jne 1b
1819 vpaddd $A2, $A0, $A0
1820 vpaddd $A2, $A1, $A1
1821 vpaddd $B2, $B0, $B0
1822 vpaddd $B2, $B1, $B1
1823 vpaddd $C2, $C0, $C0
1824 vpaddd $C2, $C1, $C1
1825 vpaddd $T2, $D0, $D0
1826 vpaddd $T3, $D1, $D1
1827 vperm2i128 \$0x02, $A0, $B0, $T0
1828 # Clamp and store the key
1829 vpand .clamp(%rip), $T0, $T0
1830 vmovdqa $T0, $r_store
1831 # Stream for up to 192 bytes
1832 vperm2i128 \$0x13, $A0, $B0, $A0
1833 vperm2i128 \$0x13, $C0, $D0, $B0
1834 vperm2i128 \$0x02, $A1, $B1, $C0
1835 vperm2i128 \$0x02, $C1, $D1, $D0
1836 vperm2i128 \$0x13, $A1, $B1, $A1
1837 vperm2i128 \$0x13, $C1, $D1, $B1
1838open_avx2_short:
1839 mov %r8, $itr2
1840 call poly_hash_ad_internal
1841open_avx2_hash_and_xor_loop:
1842 cmp \$32, $inl
1843 jb open_avx2_short_tail_32
1844 sub \$32, $inl\n";
1845 # Load + hash
1846 &poly_add("0*8($inp)");
1847 &poly_mul();
1848 &poly_add("2*8($inp)");
1849 &poly_mul(); $code.="
1850 # Load + decrypt
1851 vpxor ($inp), $A0, $A0
1852 vmovdqu $A0, ($oup)
1853 lea 1*32($inp), $inp
1854 lea 1*32($oup), $oup
1855 # Shift stream
1856 vmovdqa $B0, $A0
1857 vmovdqa $C0, $B0
1858 vmovdqa $D0, $C0
1859 vmovdqa $A1, $D0
1860 vmovdqa $B1, $A1
1861 vmovdqa $C1, $B1
1862 vmovdqa $D1, $C1
1863 vmovdqa $A2, $D1
1864 vmovdqa $B2, $A2
1865 jmp open_avx2_hash_and_xor_loop
1866open_avx2_short_tail_32:
1867 cmp \$16, $inl
1868 vmovdqa $A0x, $A1x
1869 jb 1f
1870 sub \$16, $inl\n";
1871 &poly_add("0*8($inp)");
1872 &poly_mul(); $code.="
1873 vpxor ($inp), $A0x, $A3x
1874 vmovdqu $A3x, ($oup)
1875 lea 1*16($inp), $inp
1876 lea 1*16($oup), $oup
1877 vextracti128 \$1, $A0, $A1x
18781:
1879 vzeroupper
1880 jmp open_sse_tail_16
1881###############################################################################
1882open_avx2_320:
1883 vmovdqa $A0, $A1
1884 vmovdqa $A0, $A2
1885 vmovdqa $B0, $B1
1886 vmovdqa $B0, $B2
1887 vmovdqa $C0, $C1
1888 vmovdqa $C0, $C2
1889 vpaddd .avx2_inc(%rip), $D0, $D1
1890 vpaddd .avx2_inc(%rip), $D1, $D2
1891 vmovdqa $B0, $T1
1892 vmovdqa $C0, $T2
1893 vmovdqa $D0, $ctr0_store
1894 vmovdqa $D1, $ctr1_store
1895 vmovdqa $D2, $ctr2_store
1896 mov \$10, $acc0
18971: \n";
1898 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1899 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1900 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
1901 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1902 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1903 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1904 dec $acc0
1905 jne 1b
1906 vpaddd .chacha20_consts(%rip), $A0, $A0
1907 vpaddd .chacha20_consts(%rip), $A1, $A1
1908 vpaddd .chacha20_consts(%rip), $A2, $A2
1909 vpaddd $T1, $B0, $B0
1910 vpaddd $T1, $B1, $B1
1911 vpaddd $T1, $B2, $B2
1912 vpaddd $T2, $C0, $C0
1913 vpaddd $T2, $C1, $C1
1914 vpaddd $T2, $C2, $C2
1915 vpaddd $ctr0_store, $D0, $D0
1916 vpaddd $ctr1_store, $D1, $D1
1917 vpaddd $ctr2_store, $D2, $D2
1918 vperm2i128 \$0x02, $A0, $B0, $T0
1919 # Clamp and store the key
1920 vpand .clamp(%rip), $T0, $T0
1921 vmovdqa $T0, $r_store
1922 # Stream for up to 320 bytes
1923 vperm2i128 \$0x13, $A0, $B0, $A0
1924 vperm2i128 \$0x13, $C0, $D0, $B0
1925 vperm2i128 \$0x02, $A1, $B1, $C0
1926 vperm2i128 \$0x02, $C1, $D1, $D0
1927 vperm2i128 \$0x13, $A1, $B1, $A1
1928 vperm2i128 \$0x13, $C1, $D1, $B1
1929 vperm2i128 \$0x02, $A2, $B2, $C1
1930 vperm2i128 \$0x02, $C2, $D2, $D1
1931 vperm2i128 \$0x13, $A2, $B2, $A2
1932 vperm2i128 \$0x13, $C2, $D2, $B2
1933 jmp open_avx2_short
1934.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
1935###############################################################################
1936###############################################################################
1937.type chacha20_poly1305_seal_avx2,\@function,2
1938.align 64
1939chacha20_poly1305_seal_avx2:
1940 vzeroupper
1941 vmovdqa .chacha20_consts(%rip), $A0
1942 vbroadcasti128 0*16($keyp), $B0
1943 vbroadcasti128 1*16($keyp), $C0
1944 vbroadcasti128 2*16($keyp), $D0
1945 vpaddd .avx2_init(%rip), $D0, $D0
1946 cmp \$6*32, $inl
1947 jbe seal_avx2_192
1948 cmp \$10*32, $inl
1949 jbe seal_avx2_320
1950 vmovdqa $A0, $A1
1951 vmovdqa $A0, $A2
1952 vmovdqa $A0, $A3
1953 vmovdqa $B0, $B1
1954 vmovdqa $B0, $B2
1955 vmovdqa $B0, $B3
1956 vmovdqa $B0, $state1_store
1957 vmovdqa $C0, $C1
1958 vmovdqa $C0, $C2
1959 vmovdqa $C0, $C3
1960 vmovdqa $C0, $state2_store
1961 vmovdqa $D0, $D3
1962 vpaddd .avx2_inc(%rip), $D3, $D2
1963 vpaddd .avx2_inc(%rip), $D2, $D1
1964 vpaddd .avx2_inc(%rip), $D1, $D0
1965 vmovdqa $D0, $ctr0_store
1966 vmovdqa $D1, $ctr1_store
1967 vmovdqa $D2, $ctr2_store
1968 vmovdqa $D3, $ctr3_store
    mov \$10, $acc0
1: \n";
    foreach $l (@loop_body) {$code.=$l."\n";}
    @loop_body = split /\n/, $chacha_body; $code.="
    dec $acc0
    jnz 1b\n";
    &finalize_state_avx2(4); $code.="
    vperm2i128 \$0x13, $C3, $D3, $C3
    vperm2i128 \$0x02, $A3, $B3, $D3
    vperm2i128 \$0x13, $A3, $B3, $A3
    vpand .clamp(%rip), $D3, $D3
    vmovdqa $D3, $r_store
    mov %r8, $itr2
    call poly_hash_ad_internal
    # Safely store 320 bytes (otherwise would handle with optimized call)
    vpxor 0*32($inp), $A3, $A3
    vpxor 1*32($inp), $C3, $C3
    vmovdqu $A3, 0*32($oup)
    vmovdqu $C3, 1*32($oup)\n";
    &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
    &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
    lea 10*32($inp), $inp
    sub \$10*32, $inl
    mov \$10*32, $itr1
    cmp \$4*32, $inl
    jbe seal_avx2_hash
    vpxor 0*32($inp), $A0, $A0
    vpxor 1*32($inp), $B0, $B0
    vpxor 2*32($inp), $C0, $C0
    vpxor 3*32($inp), $D0, $D0
    vmovdqu $A0, 10*32($oup)
    vmovdqu $B0, 11*32($oup)
    vmovdqu $C0, 12*32($oup)
    vmovdqu $D0, 13*32($oup)
    lea 4*32($inp), $inp
    sub \$4*32, $inl
    mov \$8, $itr1
    mov \$2, $itr2
    cmp \$4*32, $inl
    jbe seal_avx2_tail_128
    cmp \$8*32, $inl
    jbe seal_avx2_tail_256
    cmp \$12*32, $inl
    jbe seal_avx2_tail_384
    cmp \$16*32, $inl
    jbe seal_avx2_tail_512\n";
    # We have 448 bytes of ciphertext left to hash, but the main loop hashes
    # 512 bytes at a time, so perform some rounds before entering the main loop
    &prep_state_avx2(4);
    foreach $l (@loop_body) {$code.=$l."\n";}
    @loop_body = split /\n/, $chacha_body;
    &emit_body(41);
    @loop_body = split /\n/, $chacha_body; $code.="
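    # First, partially unrolled pass: only 448 bytes of ciphertext are
    # pending (see the comment above), so back the output pointer up by one
    # Poly1305 block, run nine inner iterations and enter at label 4, which
    # skips the first poly_add of a regular iteration.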
    sub \$16, $oup
    mov \$9, $itr1
    jmp 4f
1: \n";
    &prep_state_avx2(4); $code.="
    mov \$10, $itr1
2: \n";
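    # Main seal loop body: each inner iteration hashes three 16-byte
    # Poly1305 blocks of ciphertext, interleaving the mulx-based Poly1305
    # stages with batches of ChaCha20 instructions (emit_body) so the two
    # computations overlap.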
    &poly_add("0*8($oup)");
    &emit_body(10);
    &poly_stage1_mulx();
    &emit_body(9);
    &poly_stage2_mulx();
    &emit_body(12);
    &poly_stage3_mulx();
    &emit_body(10);
    &poly_reduce_stage(); $code.="
4: \n";
    &emit_body(9);
    &poly_add("2*8($oup)");
    &emit_body(8);
    &poly_stage1_mulx();
    &emit_body(18);
    &poly_stage2_mulx();
    &emit_body(18);
    &poly_stage3_mulx();
    &emit_body(9);
    &poly_reduce_stage();
    &emit_body(8);
    &poly_add("4*8($oup)"); $code.="
    lea 6*8($oup), $oup\n";
    &emit_body(18);
    &poly_stage1_mulx();
    &emit_body(8);
    &poly_stage2_mulx();
    &emit_body(8);
    &poly_stage3_mulx();
    &emit_body(18);
    &poly_reduce_stage();
    foreach $l (@loop_body) {$code.=$l."\n";}
    @loop_body = split /\n/, $chacha_body; $code.="
    dec $itr1
    jne 2b\n";
    &finalize_state_avx2(4); $code.="
    lea 4*8($oup), $oup
    vmovdqa $A0, $tmp_store\n";
    &poly_add("-4*8($oup)");
    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
    vmovdqa $tmp_store, $A0\n";
    &poly_mul();
    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
    &poly_add("-2*8($oup)");
    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
    &poly_mul();
    &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
    lea 16*32($inp), $inp
    sub \$16*32, $inl
    cmp \$16*32, $inl
    jg 1b\n";
    &poly_add("0*8($oup)");
    &poly_mul();
    &poly_add("2*8($oup)");
    &poly_mul(); $code.="
    lea 4*8($oup), $oup
    mov \$10, $itr1
    xor $itr2, $itr2
    cmp \$4*32, $inl
    ja 3f
###############################################################################
seal_avx2_tail_128:\n";
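    # Tail: at most 4*32 bytes of plaintext remain. Generate one more
    # two-block ChaCha20 state while hashing the ciphertext still pending
    # from the previous pass; the _256/_384/_512 tails below are analogous,
    # with two, three and four states respectively.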
    &prep_state_avx2(1); $code.="
1: \n";
    &poly_add("0($oup)");
    &poly_mul(); $code.="
    lea 2*8($oup), $oup
2: \n";
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
    &poly_add("0*8($oup)");
    &poly_mul();
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
    &poly_add("2*8($oup)");
    &poly_mul(); $code.="
    lea 4*8($oup), $oup
    dec $itr1
    jg 1b
    dec $itr2
    jge 2b\n";
    &finalize_state_avx2(1);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    jmp seal_avx2_short_loop
3:
    cmp \$8*32, $inl
    ja 3f
###############################################################################
seal_avx2_tail_256:\n";
    &prep_state_avx2(2); $code.="
1: \n";
    &poly_add("0($oup)");
    &poly_mul(); $code.="
    lea 2*8($oup), $oup
2: \n";
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
    &poly_add("0*8($oup)");
    &poly_mul();
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
    &poly_add("2*8($oup)");
    &poly_mul(); $code.="
    lea 4*8($oup), $oup
    dec $itr1
    jg 1b
    dec $itr2
    jge 2b\n";
    &finalize_state_avx2(2);
    &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    mov \$4*32, $itr1
    lea 4*32($inp), $inp
    sub \$4*32, $inl
    jmp seal_avx2_hash
3:
    cmp \$12*32, $inl
    ja seal_avx2_tail_512
###############################################################################
seal_avx2_tail_384:\n";
    &prep_state_avx2(3); $code.="
1: \n";
    &poly_add("0($oup)");
    &poly_mul(); $code.="
    lea 2*8($oup), $oup
2: \n";
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
    &poly_add("0*8($oup)");
    &poly_mul();
    &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
    &poly_add("2*8($oup)");
    &poly_mul();
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
    &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
    lea 4*8($oup), $oup
    dec $itr1
    jg 1b
    dec $itr2
    jge 2b\n";
    &finalize_state_avx2(3);
    &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
    &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    mov \$8*32, $itr1
    lea 8*32($inp), $inp
    sub \$8*32, $inl
    jmp seal_avx2_hash
###############################################################################
seal_avx2_tail_512:\n";
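    # Largest tail (up to 16*32 bytes): same structure as the main loop,
    # four parallel states with the mulx Poly1305 stages interleaved via
    # emit_body.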
    &prep_state_avx2(4); $code.="
1: \n";
    &poly_add("0($oup)");
    &poly_mul_mulx(); $code.="
    lea 2*8($oup), $oup
2: \n";
    &emit_body(20);
    &poly_add("0*8($oup)");
    &emit_body(20);
    &poly_stage1_mulx();
    &emit_body(20);
    &poly_stage2_mulx();
    &emit_body(20);
    &poly_stage3_mulx();
    &emit_body(20);
    &poly_reduce_stage();
    &emit_body(20);
    &poly_add("2*8($oup)");
    &emit_body(20);
    &poly_stage1_mulx();
    &emit_body(20);
    &poly_stage2_mulx();
    &emit_body(20);
    &poly_stage3_mulx();
    &emit_body(20);
    &poly_reduce_stage();
    foreach $l (@loop_body) {$code.=$l."\n";}
    @loop_body = split /\n/, $chacha_body; $code.="
    lea 4*8($oup), $oup
    dec $itr1
    jg 1b
    dec $itr2
    jge 2b\n";
    &finalize_state_avx2(4); $code.="
    vmovdqa $A0, $tmp_store\n";
    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
    vmovdqa $tmp_store, $A0\n";
    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    mov \$12*32, $itr1
    lea 12*32($inp), $inp
    sub \$12*32, $inl
    jmp seal_avx2_hash
################################################################################
seal_avx2_320:
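    # 193 to 320 byte inputs: three parallel two-block states; the first 32
    # bytes of keystream are clamped and become the Poly1305 key.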
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .avx2_inc(%rip), $D0, $D1
    vpaddd .avx2_inc(%rip), $D1, $D2
    vmovdqa $B0, $T1
    vmovdqa $C0, $T2
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
    mov \$10, $acc0
1: \n";
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
    &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
    &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
    dec $acc0
    jne 1b
    vpaddd .chacha20_consts(%rip), $A0, $A0
    vpaddd .chacha20_consts(%rip), $A1, $A1
    vpaddd .chacha20_consts(%rip), $A2, $A2
    vpaddd $T1, $B0, $B0
    vpaddd $T1, $B1, $B1
    vpaddd $T1, $B2, $B2
    vpaddd $T2, $C0, $C0
    vpaddd $T2, $C1, $C1
    vpaddd $T2, $C2, $C2
    vpaddd $ctr0_store, $D0, $D0
    vpaddd $ctr1_store, $D1, $D1
    vpaddd $ctr2_store, $D2, $D2
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .clamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for up to 320 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
    vperm2i128 \$0x02, $A2, $B2, $C1
    vperm2i128 \$0x02, $C2, $D2, $D1
    vperm2i128 \$0x13, $A2, $B2, $A2
    vperm2i128 \$0x13, $C2, $D2, $B2
    jmp seal_avx2_short
################################################################################
seal_avx2_192:
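    # Inputs of at most 192 bytes: two parallel two-block states are enough
    # for the Poly1305 key plus the whole keystream.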
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .avx2_inc(%rip), $D0, $D1
    vmovdqa $D0, $T2
    vmovdqa $D1, $T3
    mov \$10, $acc0
1: \n";
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
    &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
    &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
    dec $acc0
    jne 1b
    vpaddd $A2, $A0, $A0
    vpaddd $A2, $A1, $A1
    vpaddd $B2, $B0, $B0
    vpaddd $B2, $B1, $B1
    vpaddd $C2, $C0, $C0
    vpaddd $C2, $C1, $C1
    vpaddd $T2, $D0, $D0
    vpaddd $T3, $D1, $D1
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .clamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for up to 192 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
seal_avx2_short:
    mov %r8, $itr2
    call poly_hash_ad_internal
    xor $itr1, $itr1
seal_avx2_hash:
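    # Absorb any ciphertext that was written but not yet hashed, one
    # 16-byte Poly1305 block per iteration.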
    cmp \$16, $itr1
    jb seal_avx2_short_loop\n";
    &poly_add("0($oup)");
    &poly_mul(); $code.="
    sub \$16, $itr1
    add \$16, $oup
    jmp seal_avx2_hash
seal_avx2_short_loop:
    cmp \$32, $inl
    jb seal_avx2_short_tail
    sub \$32, $inl
    # Encrypt
    vpxor ($inp), $A0, $A0
    vmovdqu $A0, ($oup)
    lea 1*32($inp), $inp
    # Load + hash\n";
    &poly_add("0*8($oup)");
    &poly_mul();
    &poly_add("2*8($oup)");
    &poly_mul(); $code.="
    lea 1*32($oup), $oup
    # Shift stream
    vmovdqa $B0, $A0
    vmovdqa $C0, $B0
    vmovdqa $D0, $C0
    vmovdqa $A1, $D0
    vmovdqa $B1, $A1
    vmovdqa $C1, $B1
    vmovdqa $D1, $C1
    vmovdqa $A2, $D1
    vmovdqa $B2, $A2
    jmp seal_avx2_short_loop
seal_avx2_short_tail:
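    # Fewer than 32 bytes left: if at least 16 remain, encrypt one 16-byte
    # block from the low keystream lane, then fall through to the SSE tail
    # for the final 0 to 15 bytes.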
    cmp \$16, $inl
    jb 1f
    sub \$16, $inl
    vpxor ($inp), $A0x, $A3x
    vmovdqu $A3x, ($oup)
    lea 1*16($inp), $inp\n";
    &poly_add("0*8($oup)");
    &poly_mul(); $code.="
    lea 1*16($oup), $oup
    vextracti128 \$1, $A0, $A0x
1:
    vzeroupper
    jmp seal_sse_tail_16
.cfi_endproc
";
}

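# The implementation above is only emitted for non-Windows targets; for
# win64 builds a dummy symbol is produced instead so the module still
# assembles and links.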
if (!$win64) {
    $code =~ s/\`([^\`]*)\`/eval $1/gem;
    print $code;
} else {
    print <<___;
.globl dummy_chacha20_poly1305_asm
.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
dummy_chacha20_poly1305_asm:
    ret
___
}

close STDOUT;