blob: 6be270ec53762279bfe8923fe461ef24d4658d5b [file] [log] [blame]
Robert Sloana94fe052017-02-21 08:49:28 -08001#! /usr/bin/env perl
2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
David Benjamin4969cc92016-04-22 15:02:23 -04009#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# November 2014
18#
19# ChaCha20 for x86_64.
20#
Robert Sloana94fe052017-02-21 08:49:28 -080021# December 2016
22#
23# Add AVX512F code path.
24#
David Benjamin4969cc92016-04-22 15:02:23 -040025# Performance in cycles per byte out of large buffer.
26#
Robert Sloanfe7cd212017-08-07 09:03:39 -070027# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v)
David Benjamin4969cc92016-04-22 15:02:23 -040028#
29# P4 9.48/+99% -/22.7(ii) -
30# Core2 7.83/+55% 7.90/8.08 4.35
31# Westmere 7.19/+50% 5.60/6.70 3.00
32# Sandy Bridge 8.31/+42% 5.45/6.76 2.72
33# Ivy Bridge 6.71/+46% 5.40/6.49 2.41
34# Haswell 5.92/+43% 5.20/6.45 2.42 1.23
Robert Sloanfe7cd212017-08-07 09:03:39 -070035# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57]
David Benjamin4969cc92016-04-22 15:02:23 -040036# Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
Robert Sloanfe7cd212017-08-07 09:03:39 -070037# Knights L 11.7/- - 9.60(iii) 0.80
Robert Sloana94fe052017-02-21 08:49:28 -080038# Goldmont 10.6/+17% 5.10/- 3.28
David Benjamin4969cc92016-04-22 15:02:23 -040039# Sledgehammer 7.28/+52% -/14.2(ii) -
40# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
Robert Sloanfe7cd212017-08-07 09:03:39 -070041# Ryzen 5.96/+50% 5.19/- 2.40 2.09
David Benjamin4969cc92016-04-22 15:02:23 -040042# VIA Nano 10.5/+46% 6.72/8.60 6.05
43#
44# (i) compared to older gcc 3.x one can observe >2x improvement on
45# most platforms;
46# (ii) as it can be seen, SSE2 performance is too low on legacy
47# processors; NxSSE2 results are naturally better, but not
48# impressively better than IALU ones, which is why you won't
49# find SSE2 code below;
50# (iii) this is not optimal result for Atom because of MSROM
51# limitations, SSE2 can do better, but gain is considered too
52# low to justify the [maintenance] effort;
53# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
54#
55# Modified from upstream OpenSSL to remove the XOP code.
56
57$flavour = shift;
58$output = shift;
59if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
60
61$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
62
63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
65( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
66die "can't locate x86_64-xlate.pl";
67
68$avx = 2;
69
70open OUT,"| \"$^X\" $xlate $flavour $output";
71*STDOUT=*OUT;
72
73# input parameter block
74($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
75
76$code.=<<___;
77.text
78
79.extern OPENSSL_ia32cap_P
80
81.align 64
82.Lzero:
83.long 0,0,0,0
84.Lone:
85.long 1,0,0,0
86.Linc:
87.long 0,1,2,3
88.Lfour:
89.long 4,4,4,4
90.Lincy:
91.long 0,2,4,6,1,3,5,7
92.Leight:
93.long 8,8,8,8,8,8,8,8
94.Lrot16:
95.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
96.Lrot24:
97.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
98.Lsigma:
99.asciz "expand 32-byte k"
Robert Sloana94fe052017-02-21 08:49:28 -0800100.align 64
101.Lzeroz:
102.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
103.Lfourz:
104.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
105.Lincz:
106.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
107.Lsixteen:
108.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
David Benjamin4969cc92016-04-22 15:02:23 -0400109.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
110___
111
112sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
113{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
114 my $arg = pop;
115 $arg = "\$$arg" if ($arg*1 eq $arg);
116 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
117}
118
119@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
120 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
121@t=("%esi","%edi");
122
123sub ROUND { # critical path is 24 cycles per round
124my ($a0,$b0,$c0,$d0)=@_;
125my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
126my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
127my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
128my ($xc,$xc_)=map("\"$_\"",@t);
129my @x=map("\"$_\"",@x);
130
131 # Consider order in which variables are addressed by their
132 # index:
133 #
134 # a b c d
135 #
136 # 0 4 8 12 < even round
137 # 1 5 9 13
138 # 2 6 10 14
139 # 3 7 11 15
140 # 0 5 10 15 < odd round
141 # 1 6 11 12
142 # 2 7 8 13
143 # 3 4 9 14
144 #
145 # 'a', 'b' and 'd's are permanently allocated in registers,
146 # @x[0..7,12..15], while 'c's are maintained in memory. If
147 # you observe 'c' column, you'll notice that pair of 'c's is
148 # invariant between rounds. This means that we have to reload
149 # them once per round, in the middle. This is why you'll see
150 # bunch of 'c' stores and loads in the middle, but none in
151 # the beginning or end.
152
153 # Normally instructions would be interleaved to favour in-order
154 # execution. Generally out-of-order cores manage it gracefully,
155 # but not this time for some reason. As in-order execution
156 # cores are dying breed, old Atom is the only one around,
157 # instructions are left uninterleaved. Besides, Atom is better
158 # off executing 1xSSSE3 code anyway...
159
160 (
161 "&add (@x[$a0],@x[$b0])", # Q1
162 "&xor (@x[$d0],@x[$a0])",
163 "&rol (@x[$d0],16)",
164 "&add (@x[$a1],@x[$b1])", # Q2
165 "&xor (@x[$d1],@x[$a1])",
166 "&rol (@x[$d1],16)",
167
168 "&add ($xc,@x[$d0])",
169 "&xor (@x[$b0],$xc)",
170 "&rol (@x[$b0],12)",
171 "&add ($xc_,@x[$d1])",
172 "&xor (@x[$b1],$xc_)",
173 "&rol (@x[$b1],12)",
174
175 "&add (@x[$a0],@x[$b0])",
176 "&xor (@x[$d0],@x[$a0])",
177 "&rol (@x[$d0],8)",
178 "&add (@x[$a1],@x[$b1])",
179 "&xor (@x[$d1],@x[$a1])",
180 "&rol (@x[$d1],8)",
181
182 "&add ($xc,@x[$d0])",
183 "&xor (@x[$b0],$xc)",
184 "&rol (@x[$b0],7)",
185 "&add ($xc_,@x[$d1])",
186 "&xor (@x[$b1],$xc_)",
187 "&rol (@x[$b1],7)",
188
189 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
190 "&mov (\"4*$c1(%rsp)\",$xc_)",
191 "&mov ($xc,\"4*$c2(%rsp)\")",
192 "&mov ($xc_,\"4*$c3(%rsp)\")",
193
194 "&add (@x[$a2],@x[$b2])", # Q3
195 "&xor (@x[$d2],@x[$a2])",
196 "&rol (@x[$d2],16)",
197 "&add (@x[$a3],@x[$b3])", # Q4
198 "&xor (@x[$d3],@x[$a3])",
199 "&rol (@x[$d3],16)",
200
201 "&add ($xc,@x[$d2])",
202 "&xor (@x[$b2],$xc)",
203 "&rol (@x[$b2],12)",
204 "&add ($xc_,@x[$d3])",
205 "&xor (@x[$b3],$xc_)",
206 "&rol (@x[$b3],12)",
207
208 "&add (@x[$a2],@x[$b2])",
209 "&xor (@x[$d2],@x[$a2])",
210 "&rol (@x[$d2],8)",
211 "&add (@x[$a3],@x[$b3])",
212 "&xor (@x[$d3],@x[$a3])",
213 "&rol (@x[$d3],8)",
214
215 "&add ($xc,@x[$d2])",
216 "&xor (@x[$b2],$xc)",
217 "&rol (@x[$b2],7)",
218 "&add ($xc_,@x[$d3])",
219 "&xor (@x[$b3],$xc_)",
220 "&rol (@x[$b3],7)"
221 );
222}
223
224########################################################################
225# Generic code path that handles all lengths on pre-SSSE3 processors.
226$code.=<<___;
227.globl ChaCha20_ctr32
228.type ChaCha20_ctr32,\@function,5
229.align 64
230ChaCha20_ctr32:
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800231.cfi_startproc
David Benjamin4969cc92016-04-22 15:02:23 -0400232 cmp \$0,$len
233 je .Lno_data
234 mov OPENSSL_ia32cap_P+4(%rip),%r10
Robert Sloana94fe052017-02-21 08:49:28 -0800235___
236$code.=<<___ if ($avx>2);
237 bt \$48,%r10 # check for AVX512F
238 jc .LChaCha20_avx512
239___
240$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -0400241 test \$`1<<(41-32)`,%r10d
242 jnz .LChaCha20_ssse3
243
244 push %rbx
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800245.cfi_push rbx
David Benjamin4969cc92016-04-22 15:02:23 -0400246 push %rbp
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800247.cfi_push rbp
David Benjamin4969cc92016-04-22 15:02:23 -0400248 push %r12
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800249.cfi_push r12
David Benjamin4969cc92016-04-22 15:02:23 -0400250 push %r13
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800251.cfi_push r13
David Benjamin4969cc92016-04-22 15:02:23 -0400252 push %r14
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800253.cfi_push r14
David Benjamin4969cc92016-04-22 15:02:23 -0400254 push %r15
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800255.cfi_push r15
David Benjamin4969cc92016-04-22 15:02:23 -0400256 sub \$64+24,%rsp
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800257.cfi_adjust_cfa_offset `64+24`
Robert Sloana94fe052017-02-21 08:49:28 -0800258.Lctr32_body:
David Benjamin4969cc92016-04-22 15:02:23 -0400259
260 #movdqa .Lsigma(%rip),%xmm0
261 movdqu ($key),%xmm1
262 movdqu 16($key),%xmm2
263 movdqu ($counter),%xmm3
264 movdqa .Lone(%rip),%xmm4
265
266 #movdqa %xmm0,4*0(%rsp) # key[0]
267 movdqa %xmm1,4*4(%rsp) # key[1]
268 movdqa %xmm2,4*8(%rsp) # key[2]
269 movdqa %xmm3,4*12(%rsp) # key[3]
270 mov $len,%rbp # reassign $len
271 jmp .Loop_outer
272
273.align 32
274.Loop_outer:
275 mov \$0x61707865,@x[0] # 'expa'
276 mov \$0x3320646e,@x[1] # 'nd 3'
277 mov \$0x79622d32,@x[2] # '2-by'
278 mov \$0x6b206574,@x[3] # 'te k'
279 mov 4*4(%rsp),@x[4]
280 mov 4*5(%rsp),@x[5]
281 mov 4*6(%rsp),@x[6]
282 mov 4*7(%rsp),@x[7]
283 movd %xmm3,@x[12]
284 mov 4*13(%rsp),@x[13]
285 mov 4*14(%rsp),@x[14]
286 mov 4*15(%rsp),@x[15]
287
288 mov %rbp,64+0(%rsp) # save len
289 mov \$10,%ebp
290 mov $inp,64+8(%rsp) # save inp
291 movq %xmm2,%rsi # "@x[8]"
292 mov $out,64+16(%rsp) # save out
293 mov %rsi,%rdi
294 shr \$32,%rdi # "@x[9]"
295 jmp .Loop
296
297.align 32
298.Loop:
299___
300 foreach (&ROUND (0, 4, 8,12)) { eval; }
301 foreach (&ROUND (0, 5,10,15)) { eval; }
302 &dec ("%ebp");
303 &jnz (".Loop");
304
305$code.=<<___;
306 mov @t[1],4*9(%rsp) # modulo-scheduled
307 mov @t[0],4*8(%rsp)
308 mov 64(%rsp),%rbp # load len
309 movdqa %xmm2,%xmm1
310 mov 64+8(%rsp),$inp # load inp
311 paddd %xmm4,%xmm3 # increment counter
312 mov 64+16(%rsp),$out # load out
313
314 add \$0x61707865,@x[0] # 'expa'
315 add \$0x3320646e,@x[1] # 'nd 3'
316 add \$0x79622d32,@x[2] # '2-by'
317 add \$0x6b206574,@x[3] # 'te k'
318 add 4*4(%rsp),@x[4]
319 add 4*5(%rsp),@x[5]
320 add 4*6(%rsp),@x[6]
321 add 4*7(%rsp),@x[7]
322 add 4*12(%rsp),@x[12]
323 add 4*13(%rsp),@x[13]
324 add 4*14(%rsp),@x[14]
325 add 4*15(%rsp),@x[15]
326 paddd 4*8(%rsp),%xmm1
327
328 cmp \$64,%rbp
329 jb .Ltail
330
331 xor 4*0($inp),@x[0] # xor with input
332 xor 4*1($inp),@x[1]
333 xor 4*2($inp),@x[2]
334 xor 4*3($inp),@x[3]
335 xor 4*4($inp),@x[4]
336 xor 4*5($inp),@x[5]
337 xor 4*6($inp),@x[6]
338 xor 4*7($inp),@x[7]
339 movdqu 4*8($inp),%xmm0
340 xor 4*12($inp),@x[12]
341 xor 4*13($inp),@x[13]
342 xor 4*14($inp),@x[14]
343 xor 4*15($inp),@x[15]
344 lea 4*16($inp),$inp # inp+=64
345 pxor %xmm1,%xmm0
346
347 movdqa %xmm2,4*8(%rsp)
348 movd %xmm3,4*12(%rsp)
349
350 mov @x[0],4*0($out) # write output
351 mov @x[1],4*1($out)
352 mov @x[2],4*2($out)
353 mov @x[3],4*3($out)
354 mov @x[4],4*4($out)
355 mov @x[5],4*5($out)
356 mov @x[6],4*6($out)
357 mov @x[7],4*7($out)
358 movdqu %xmm0,4*8($out)
359 mov @x[12],4*12($out)
360 mov @x[13],4*13($out)
361 mov @x[14],4*14($out)
362 mov @x[15],4*15($out)
363 lea 4*16($out),$out # out+=64
364
365 sub \$64,%rbp
366 jnz .Loop_outer
367
368 jmp .Ldone
369
370.align 16
371.Ltail:
372 mov @x[0],4*0(%rsp)
373 mov @x[1],4*1(%rsp)
374 xor %rbx,%rbx
375 mov @x[2],4*2(%rsp)
376 mov @x[3],4*3(%rsp)
377 mov @x[4],4*4(%rsp)
378 mov @x[5],4*5(%rsp)
379 mov @x[6],4*6(%rsp)
380 mov @x[7],4*7(%rsp)
381 movdqa %xmm1,4*8(%rsp)
382 mov @x[12],4*12(%rsp)
383 mov @x[13],4*13(%rsp)
384 mov @x[14],4*14(%rsp)
385 mov @x[15],4*15(%rsp)
386
387.Loop_tail:
388 movzb ($inp,%rbx),%eax
389 movzb (%rsp,%rbx),%edx
390 lea 1(%rbx),%rbx
391 xor %edx,%eax
392 mov %al,-1($out,%rbx)
393 dec %rbp
394 jnz .Loop_tail
395
396.Ldone:
Robert Sloana94fe052017-02-21 08:49:28 -0800397 lea 64+24+48(%rsp),%rsi
398 mov -48(%rsi),%r15
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800399.cfi_restore r15
Robert Sloana94fe052017-02-21 08:49:28 -0800400 mov -40(%rsi),%r14
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800401.cfi_restore r14
Robert Sloana94fe052017-02-21 08:49:28 -0800402 mov -32(%rsi),%r13
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800403.cfi_restore r13
Robert Sloana94fe052017-02-21 08:49:28 -0800404 mov -24(%rsi),%r12
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800405.cfi_restore r12
Robert Sloana94fe052017-02-21 08:49:28 -0800406 mov -16(%rsi),%rbp
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800407.cfi_restore rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800408 mov -8(%rsi),%rbx
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800409.cfi_restore rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800410 lea (%rsi),%rsp
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800411.cfi_adjust_cfa_offset `-64-24-48`
David Benjamin4969cc92016-04-22 15:02:23 -0400412.Lno_data:
413 ret
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800414.cfi_endproc
David Benjamin4969cc92016-04-22 15:02:23 -0400415.size ChaCha20_ctr32,.-ChaCha20_ctr32
416___
417
418########################################################################
419# SSSE3 code path that handles shorter lengths
420{
421my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
422
423sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
424 &paddd ($a,$b);
425 &pxor ($d,$a);
426 &pshufb ($d,$rot16);
427
428 &paddd ($c,$d);
429 &pxor ($b,$c);
430 &movdqa ($t,$b);
431 &psrld ($b,20);
432 &pslld ($t,12);
433 &por ($b,$t);
434
435 &paddd ($a,$b);
436 &pxor ($d,$a);
437 &pshufb ($d,$rot24);
438
439 &paddd ($c,$d);
440 &pxor ($b,$c);
441 &movdqa ($t,$b);
442 &psrld ($b,25);
443 &pslld ($t,7);
444 &por ($b,$t);
445}
446
Robert Sloana94fe052017-02-21 08:49:28 -0800447my $xframe = $win64 ? 32+8 : 8;
David Benjamin4969cc92016-04-22 15:02:23 -0400448
449$code.=<<___;
450.type ChaCha20_ssse3,\@function,5
451.align 32
452ChaCha20_ssse3:
453.LChaCha20_ssse3:
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800454.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -0800455 mov %rsp,%r9 # frame pointer
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800456.cfi_def_cfa_register r9
David Benjamin4969cc92016-04-22 15:02:23 -0400457___
458$code.=<<___;
459 cmp \$128,$len # we might throw away some data,
460 ja .LChaCha20_4x # but overall it won't be slower
461
462.Ldo_sse3_after_all:
David Benjamin4969cc92016-04-22 15:02:23 -0400463 sub \$64+$xframe,%rsp
464___
465$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -0800466 movaps %xmm6,-0x28(%r9)
467 movaps %xmm7,-0x18(%r9)
468.Lssse3_body:
David Benjamin4969cc92016-04-22 15:02:23 -0400469___
470$code.=<<___;
471 movdqa .Lsigma(%rip),$a
472 movdqu ($key),$b
473 movdqu 16($key),$c
474 movdqu ($counter),$d
475 movdqa .Lrot16(%rip),$rot16
476 movdqa .Lrot24(%rip),$rot24
477
478 movdqa $a,0x00(%rsp)
479 movdqa $b,0x10(%rsp)
480 movdqa $c,0x20(%rsp)
481 movdqa $d,0x30(%rsp)
Robert Sloana94fe052017-02-21 08:49:28 -0800482 mov \$10,$counter # reuse $counter
David Benjamin4969cc92016-04-22 15:02:23 -0400483 jmp .Loop_ssse3
484
485.align 32
486.Loop_outer_ssse3:
487 movdqa .Lone(%rip),$d
488 movdqa 0x00(%rsp),$a
489 movdqa 0x10(%rsp),$b
490 movdqa 0x20(%rsp),$c
491 paddd 0x30(%rsp),$d
Robert Sloana94fe052017-02-21 08:49:28 -0800492 mov \$10,$counter
David Benjamin4969cc92016-04-22 15:02:23 -0400493 movdqa $d,0x30(%rsp)
494 jmp .Loop_ssse3
495
496.align 32
497.Loop_ssse3:
498___
499 &SSSE3ROUND();
500 &pshufd ($c,$c,0b01001110);
501 &pshufd ($b,$b,0b00111001);
502 &pshufd ($d,$d,0b10010011);
503 &nop ();
504
505 &SSSE3ROUND();
506 &pshufd ($c,$c,0b01001110);
507 &pshufd ($b,$b,0b10010011);
508 &pshufd ($d,$d,0b00111001);
509
Robert Sloana94fe052017-02-21 08:49:28 -0800510 &dec ($counter);
David Benjamin4969cc92016-04-22 15:02:23 -0400511 &jnz (".Loop_ssse3");
512
513$code.=<<___;
514 paddd 0x00(%rsp),$a
515 paddd 0x10(%rsp),$b
516 paddd 0x20(%rsp),$c
517 paddd 0x30(%rsp),$d
518
519 cmp \$64,$len
520 jb .Ltail_ssse3
521
522 movdqu 0x00($inp),$t
523 movdqu 0x10($inp),$t1
524 pxor $t,$a # xor with input
525 movdqu 0x20($inp),$t
526 pxor $t1,$b
527 movdqu 0x30($inp),$t1
528 lea 0x40($inp),$inp # inp+=64
529 pxor $t,$c
530 pxor $t1,$d
531
532 movdqu $a,0x00($out) # write output
533 movdqu $b,0x10($out)
534 movdqu $c,0x20($out)
535 movdqu $d,0x30($out)
536 lea 0x40($out),$out # out+=64
537
538 sub \$64,$len
539 jnz .Loop_outer_ssse3
540
541 jmp .Ldone_ssse3
542
543.align 16
544.Ltail_ssse3:
545 movdqa $a,0x00(%rsp)
546 movdqa $b,0x10(%rsp)
547 movdqa $c,0x20(%rsp)
548 movdqa $d,0x30(%rsp)
Robert Sloana94fe052017-02-21 08:49:28 -0800549 xor $counter,$counter
David Benjamin4969cc92016-04-22 15:02:23 -0400550
551.Loop_tail_ssse3:
Robert Sloana94fe052017-02-21 08:49:28 -0800552 movzb ($inp,$counter),%eax
553 movzb (%rsp,$counter),%ecx
554 lea 1($counter),$counter
David Benjamin4969cc92016-04-22 15:02:23 -0400555 xor %ecx,%eax
Robert Sloana94fe052017-02-21 08:49:28 -0800556 mov %al,-1($out,$counter)
David Benjamin4969cc92016-04-22 15:02:23 -0400557 dec $len
558 jnz .Loop_tail_ssse3
559
560.Ldone_ssse3:
561___
562$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -0800563 movaps -0x28(%r9),%xmm6
564 movaps -0x18(%r9),%xmm7
David Benjamin4969cc92016-04-22 15:02:23 -0400565___
566$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -0800567 lea (%r9),%rsp
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800568.cfi_def_cfa_register rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800569.Lssse3_epilogue:
David Benjamin4969cc92016-04-22 15:02:23 -0400570 ret
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800571.cfi_endproc
David Benjamin4969cc92016-04-22 15:02:23 -0400572.size ChaCha20_ssse3,.-ChaCha20_ssse3
573___
574}
575
576########################################################################
577# SSSE3 code path that handles longer messages.
578{
579# assign variables to favor Atom front-end
580my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
581 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
582my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
583 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
584
585sub SSSE3_lane_ROUND {
586my ($a0,$b0,$c0,$d0)=@_;
587my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
588my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
589my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
590my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
591my @x=map("\"$_\"",@xx);
592
593 # Consider order in which variables are addressed by their
594 # index:
595 #
596 # a b c d
597 #
598 # 0 4 8 12 < even round
599 # 1 5 9 13
600 # 2 6 10 14
601 # 3 7 11 15
602 # 0 5 10 15 < odd round
603 # 1 6 11 12
604 # 2 7 8 13
605 # 3 4 9 14
606 #
607 # 'a', 'b' and 'd's are permanently allocated in registers,
608 # @x[0..7,12..15], while 'c's are maintained in memory. If
609 # you observe 'c' column, you'll notice that pair of 'c's is
610 # invariant between rounds. This means that we have to reload
611 # them once per round, in the middle. This is why you'll see
612 # bunch of 'c' stores and loads in the middle, but none in
613 # the beginning or end.
614
615 (
616 "&paddd (@x[$a0],@x[$b0])", # Q1
617 "&paddd (@x[$a1],@x[$b1])", # Q2
618 "&pxor (@x[$d0],@x[$a0])",
619 "&pxor (@x[$d1],@x[$a1])",
620 "&pshufb (@x[$d0],$t1)",
621 "&pshufb (@x[$d1],$t1)",
622
623 "&paddd ($xc,@x[$d0])",
624 "&paddd ($xc_,@x[$d1])",
625 "&pxor (@x[$b0],$xc)",
626 "&pxor (@x[$b1],$xc_)",
627 "&movdqa ($t0,@x[$b0])",
628 "&pslld (@x[$b0],12)",
629 "&psrld ($t0,20)",
630 "&movdqa ($t1,@x[$b1])",
631 "&pslld (@x[$b1],12)",
632 "&por (@x[$b0],$t0)",
633 "&psrld ($t1,20)",
634 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
635 "&por (@x[$b1],$t1)",
636
637 "&paddd (@x[$a0],@x[$b0])",
638 "&paddd (@x[$a1],@x[$b1])",
639 "&pxor (@x[$d0],@x[$a0])",
640 "&pxor (@x[$d1],@x[$a1])",
641 "&pshufb (@x[$d0],$t0)",
642 "&pshufb (@x[$d1],$t0)",
643
644 "&paddd ($xc,@x[$d0])",
645 "&paddd ($xc_,@x[$d1])",
646 "&pxor (@x[$b0],$xc)",
647 "&pxor (@x[$b1],$xc_)",
648 "&movdqa ($t1,@x[$b0])",
649 "&pslld (@x[$b0],7)",
650 "&psrld ($t1,25)",
651 "&movdqa ($t0,@x[$b1])",
652 "&pslld (@x[$b1],7)",
653 "&por (@x[$b0],$t1)",
654 "&psrld ($t0,25)",
655 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
656 "&por (@x[$b1],$t0)",
657
658 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
659 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
660 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
661 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
662
663 "&paddd (@x[$a2],@x[$b2])", # Q3
664 "&paddd (@x[$a3],@x[$b3])", # Q4
665 "&pxor (@x[$d2],@x[$a2])",
666 "&pxor (@x[$d3],@x[$a3])",
667 "&pshufb (@x[$d2],$t1)",
668 "&pshufb (@x[$d3],$t1)",
669
670 "&paddd ($xc,@x[$d2])",
671 "&paddd ($xc_,@x[$d3])",
672 "&pxor (@x[$b2],$xc)",
673 "&pxor (@x[$b3],$xc_)",
674 "&movdqa ($t0,@x[$b2])",
675 "&pslld (@x[$b2],12)",
676 "&psrld ($t0,20)",
677 "&movdqa ($t1,@x[$b3])",
678 "&pslld (@x[$b3],12)",
679 "&por (@x[$b2],$t0)",
680 "&psrld ($t1,20)",
681 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
682 "&por (@x[$b3],$t1)",
683
684 "&paddd (@x[$a2],@x[$b2])",
685 "&paddd (@x[$a3],@x[$b3])",
686 "&pxor (@x[$d2],@x[$a2])",
687 "&pxor (@x[$d3],@x[$a3])",
688 "&pshufb (@x[$d2],$t0)",
689 "&pshufb (@x[$d3],$t0)",
690
691 "&paddd ($xc,@x[$d2])",
692 "&paddd ($xc_,@x[$d3])",
693 "&pxor (@x[$b2],$xc)",
694 "&pxor (@x[$b3],$xc_)",
695 "&movdqa ($t1,@x[$b2])",
696 "&pslld (@x[$b2],7)",
697 "&psrld ($t1,25)",
698 "&movdqa ($t0,@x[$b3])",
699 "&pslld (@x[$b3],7)",
700 "&por (@x[$b2],$t1)",
701 "&psrld ($t0,25)",
702 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
703 "&por (@x[$b3],$t0)"
704 );
705}
706
Robert Sloana94fe052017-02-21 08:49:28 -0800707my $xframe = $win64 ? 0xa8 : 8;
David Benjamin4969cc92016-04-22 15:02:23 -0400708
709$code.=<<___;
710.type ChaCha20_4x,\@function,5
711.align 32
712ChaCha20_4x:
713.LChaCha20_4x:
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800714.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -0800715 mov %rsp,%r9 # frame pointer
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800716.cfi_def_cfa_register r9
David Benjamin4969cc92016-04-22 15:02:23 -0400717 mov %r10,%r11
718___
719$code.=<<___ if ($avx>1);
720 shr \$32,%r10 # OPENSSL_ia32cap_P+8
721 test \$`1<<5`,%r10 # test AVX2
722 jnz .LChaCha20_8x
723___
724$code.=<<___;
725 cmp \$192,$len
726 ja .Lproceed4x
727
728 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
729 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
730 je .Ldo_sse3_after_all # to detect Atom
731
732.Lproceed4x:
Robert Sloana94fe052017-02-21 08:49:28 -0800733 sub \$0x140+$xframe,%rsp
David Benjamin4969cc92016-04-22 15:02:23 -0400734___
735 ################ stack layout
736 # +0x00 SIMD equivalent of @x[8-12]
737 # ...
738 # +0x40 constant copy of key[0-2] smashed by lanes
739 # ...
740 # +0x100 SIMD counters (with nonce smashed by lanes)
741 # ...
742 # +0x140
743$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -0800744 movaps %xmm6,-0xa8(%r9)
745 movaps %xmm7,-0x98(%r9)
746 movaps %xmm8,-0x88(%r9)
747 movaps %xmm9,-0x78(%r9)
748 movaps %xmm10,-0x68(%r9)
749 movaps %xmm11,-0x58(%r9)
750 movaps %xmm12,-0x48(%r9)
751 movaps %xmm13,-0x38(%r9)
752 movaps %xmm14,-0x28(%r9)
753 movaps %xmm15,-0x18(%r9)
754.L4x_body:
David Benjamin4969cc92016-04-22 15:02:23 -0400755___
756$code.=<<___;
757 movdqa .Lsigma(%rip),$xa3 # key[0]
758 movdqu ($key),$xb3 # key[1]
759 movdqu 16($key),$xt3 # key[2]
760 movdqu ($counter),$xd3 # key[3]
761 lea 0x100(%rsp),%rcx # size optimization
762 lea .Lrot16(%rip),%r10
763 lea .Lrot24(%rip),%r11
764
765 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
766 pshufd \$0x55,$xa3,$xa1
767 movdqa $xa0,0x40(%rsp) # ... and offload
768 pshufd \$0xaa,$xa3,$xa2
769 movdqa $xa1,0x50(%rsp)
770 pshufd \$0xff,$xa3,$xa3
771 movdqa $xa2,0x60(%rsp)
772 movdqa $xa3,0x70(%rsp)
773
774 pshufd \$0x00,$xb3,$xb0
775 pshufd \$0x55,$xb3,$xb1
776 movdqa $xb0,0x80-0x100(%rcx)
777 pshufd \$0xaa,$xb3,$xb2
778 movdqa $xb1,0x90-0x100(%rcx)
779 pshufd \$0xff,$xb3,$xb3
780 movdqa $xb2,0xa0-0x100(%rcx)
781 movdqa $xb3,0xb0-0x100(%rcx)
782
783 pshufd \$0x00,$xt3,$xt0 # "$xc0"
784 pshufd \$0x55,$xt3,$xt1 # "$xc1"
785 movdqa $xt0,0xc0-0x100(%rcx)
786 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
787 movdqa $xt1,0xd0-0x100(%rcx)
788 pshufd \$0xff,$xt3,$xt3 # "$xc3"
789 movdqa $xt2,0xe0-0x100(%rcx)
790 movdqa $xt3,0xf0-0x100(%rcx)
791
792 pshufd \$0x00,$xd3,$xd0
793 pshufd \$0x55,$xd3,$xd1
794 paddd .Linc(%rip),$xd0 # don't save counters yet
795 pshufd \$0xaa,$xd3,$xd2
796 movdqa $xd1,0x110-0x100(%rcx)
797 pshufd \$0xff,$xd3,$xd3
798 movdqa $xd2,0x120-0x100(%rcx)
799 movdqa $xd3,0x130-0x100(%rcx)
800
801 jmp .Loop_enter4x
802
803.align 32
804.Loop_outer4x:
805 movdqa 0x40(%rsp),$xa0 # re-load smashed key
806 movdqa 0x50(%rsp),$xa1
807 movdqa 0x60(%rsp),$xa2
808 movdqa 0x70(%rsp),$xa3
809 movdqa 0x80-0x100(%rcx),$xb0
810 movdqa 0x90-0x100(%rcx),$xb1
811 movdqa 0xa0-0x100(%rcx),$xb2
812 movdqa 0xb0-0x100(%rcx),$xb3
813 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
814 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
815 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
816 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
817 movdqa 0x100-0x100(%rcx),$xd0
818 movdqa 0x110-0x100(%rcx),$xd1
819 movdqa 0x120-0x100(%rcx),$xd2
820 movdqa 0x130-0x100(%rcx),$xd3
821 paddd .Lfour(%rip),$xd0 # next SIMD counters
822
823.Loop_enter4x:
824 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
825 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
826 movdqa (%r10),$xt3 # .Lrot16(%rip)
827 mov \$10,%eax
828 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
829 jmp .Loop4x
830
831.align 32
832.Loop4x:
833___
834 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
835 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
836$code.=<<___;
837 dec %eax
838 jnz .Loop4x
839
840 paddd 0x40(%rsp),$xa0 # accumulate key material
841 paddd 0x50(%rsp),$xa1
842 paddd 0x60(%rsp),$xa2
843 paddd 0x70(%rsp),$xa3
844
845 movdqa $xa0,$xt2 # "de-interlace" data
846 punpckldq $xa1,$xa0
847 movdqa $xa2,$xt3
848 punpckldq $xa3,$xa2
849 punpckhdq $xa1,$xt2
850 punpckhdq $xa3,$xt3
851 movdqa $xa0,$xa1
852 punpcklqdq $xa2,$xa0 # "a0"
853 movdqa $xt2,$xa3
854 punpcklqdq $xt3,$xt2 # "a2"
855 punpckhqdq $xa2,$xa1 # "a1"
856 punpckhqdq $xt3,$xa3 # "a3"
857___
858 ($xa2,$xt2)=($xt2,$xa2);
859$code.=<<___;
860 paddd 0x80-0x100(%rcx),$xb0
861 paddd 0x90-0x100(%rcx),$xb1
862 paddd 0xa0-0x100(%rcx),$xb2
863 paddd 0xb0-0x100(%rcx),$xb3
864
865 movdqa $xa0,0x00(%rsp) # offload $xaN
866 movdqa $xa1,0x10(%rsp)
867 movdqa 0x20(%rsp),$xa0 # "xc2"
868 movdqa 0x30(%rsp),$xa1 # "xc3"
869
870 movdqa $xb0,$xt2
871 punpckldq $xb1,$xb0
872 movdqa $xb2,$xt3
873 punpckldq $xb3,$xb2
874 punpckhdq $xb1,$xt2
875 punpckhdq $xb3,$xt3
876 movdqa $xb0,$xb1
877 punpcklqdq $xb2,$xb0 # "b0"
878 movdqa $xt2,$xb3
879 punpcklqdq $xt3,$xt2 # "b2"
880 punpckhqdq $xb2,$xb1 # "b1"
881 punpckhqdq $xt3,$xb3 # "b3"
882___
883 ($xb2,$xt2)=($xt2,$xb2);
884 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
885$code.=<<___;
886 paddd 0xc0-0x100(%rcx),$xc0
887 paddd 0xd0-0x100(%rcx),$xc1
888 paddd 0xe0-0x100(%rcx),$xc2
889 paddd 0xf0-0x100(%rcx),$xc3
890
891 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
892 movdqa $xa3,0x30(%rsp)
893
894 movdqa $xc0,$xt2
895 punpckldq $xc1,$xc0
896 movdqa $xc2,$xt3
897 punpckldq $xc3,$xc2
898 punpckhdq $xc1,$xt2
899 punpckhdq $xc3,$xt3
900 movdqa $xc0,$xc1
901 punpcklqdq $xc2,$xc0 # "c0"
902 movdqa $xt2,$xc3
903 punpcklqdq $xt3,$xt2 # "c2"
904 punpckhqdq $xc2,$xc1 # "c1"
905 punpckhqdq $xt3,$xc3 # "c3"
906___
907 ($xc2,$xt2)=($xt2,$xc2);
908 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
909$code.=<<___;
910 paddd 0x100-0x100(%rcx),$xd0
911 paddd 0x110-0x100(%rcx),$xd1
912 paddd 0x120-0x100(%rcx),$xd2
913 paddd 0x130-0x100(%rcx),$xd3
914
915 movdqa $xd0,$xt2
916 punpckldq $xd1,$xd0
917 movdqa $xd2,$xt3
918 punpckldq $xd3,$xd2
919 punpckhdq $xd1,$xt2
920 punpckhdq $xd3,$xt3
921 movdqa $xd0,$xd1
922 punpcklqdq $xd2,$xd0 # "d0"
923 movdqa $xt2,$xd3
924 punpcklqdq $xt3,$xt2 # "d2"
925 punpckhqdq $xd2,$xd1 # "d1"
926 punpckhqdq $xt3,$xd3 # "d3"
927___
928 ($xd2,$xt2)=($xt2,$xd2);
929$code.=<<___;
930 cmp \$64*4,$len
931 jb .Ltail4x
932
933 movdqu 0x00($inp),$xt0 # xor with input
934 movdqu 0x10($inp),$xt1
935 movdqu 0x20($inp),$xt2
936 movdqu 0x30($inp),$xt3
937 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
938 pxor $xb0,$xt1
939 pxor $xc0,$xt2
940 pxor $xd0,$xt3
941
942 movdqu $xt0,0x00($out)
943 movdqu 0x40($inp),$xt0
944 movdqu $xt1,0x10($out)
945 movdqu 0x50($inp),$xt1
946 movdqu $xt2,0x20($out)
947 movdqu 0x60($inp),$xt2
948 movdqu $xt3,0x30($out)
949 movdqu 0x70($inp),$xt3
950 lea 0x80($inp),$inp # size optimization
951 pxor 0x10(%rsp),$xt0
952 pxor $xb1,$xt1
953 pxor $xc1,$xt2
954 pxor $xd1,$xt3
955
956 movdqu $xt0,0x40($out)
957 movdqu 0x00($inp),$xt0
958 movdqu $xt1,0x50($out)
959 movdqu 0x10($inp),$xt1
960 movdqu $xt2,0x60($out)
961 movdqu 0x20($inp),$xt2
962 movdqu $xt3,0x70($out)
963 lea 0x80($out),$out # size optimization
964 movdqu 0x30($inp),$xt3
965 pxor 0x20(%rsp),$xt0
966 pxor $xb2,$xt1
967 pxor $xc2,$xt2
968 pxor $xd2,$xt3
969
970 movdqu $xt0,0x00($out)
971 movdqu 0x40($inp),$xt0
972 movdqu $xt1,0x10($out)
973 movdqu 0x50($inp),$xt1
974 movdqu $xt2,0x20($out)
975 movdqu 0x60($inp),$xt2
976 movdqu $xt3,0x30($out)
977 movdqu 0x70($inp),$xt3
978 lea 0x80($inp),$inp # inp+=64*4
979 pxor 0x30(%rsp),$xt0
980 pxor $xb3,$xt1
981 pxor $xc3,$xt2
982 pxor $xd3,$xt3
983 movdqu $xt0,0x40($out)
984 movdqu $xt1,0x50($out)
985 movdqu $xt2,0x60($out)
986 movdqu $xt3,0x70($out)
987 lea 0x80($out),$out # out+=64*4
988
989 sub \$64*4,$len
990 jnz .Loop_outer4x
991
992 jmp .Ldone4x
993
994.Ltail4x:
995 cmp \$192,$len
996 jae .L192_or_more4x
997 cmp \$128,$len
998 jae .L128_or_more4x
999 cmp \$64,$len
1000 jae .L64_or_more4x
1001
1002 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1003 xor %r10,%r10
1004 #movdqa $xt0,0x00(%rsp)
1005 movdqa $xb0,0x10(%rsp)
1006 movdqa $xc0,0x20(%rsp)
1007 movdqa $xd0,0x30(%rsp)
1008 jmp .Loop_tail4x
1009
1010.align 32
1011.L64_or_more4x:
1012 movdqu 0x00($inp),$xt0 # xor with input
1013 movdqu 0x10($inp),$xt1
1014 movdqu 0x20($inp),$xt2
1015 movdqu 0x30($inp),$xt3
1016 pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember?
1017 pxor $xb0,$xt1
1018 pxor $xc0,$xt2
1019 pxor $xd0,$xt3
1020 movdqu $xt0,0x00($out)
1021 movdqu $xt1,0x10($out)
1022 movdqu $xt2,0x20($out)
1023 movdqu $xt3,0x30($out)
1024 je .Ldone4x
1025
1026 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1027 lea 0x40($inp),$inp # inp+=64*1
1028 xor %r10,%r10
1029 movdqa $xt0,0x00(%rsp)
1030 movdqa $xb1,0x10(%rsp)
1031 lea 0x40($out),$out # out+=64*1
1032 movdqa $xc1,0x20(%rsp)
1033 sub \$64,$len # len-=64*1
1034 movdqa $xd1,0x30(%rsp)
1035 jmp .Loop_tail4x
1036
1037.align 32
1038.L128_or_more4x:
1039 movdqu 0x00($inp),$xt0 # xor with input
1040 movdqu 0x10($inp),$xt1
1041 movdqu 0x20($inp),$xt2
1042 movdqu 0x30($inp),$xt3
1043 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1044 pxor $xb0,$xt1
1045 pxor $xc0,$xt2
1046 pxor $xd0,$xt3
1047
1048 movdqu $xt0,0x00($out)
1049 movdqu 0x40($inp),$xt0
1050 movdqu $xt1,0x10($out)
1051 movdqu 0x50($inp),$xt1
1052 movdqu $xt2,0x20($out)
1053 movdqu 0x60($inp),$xt2
1054 movdqu $xt3,0x30($out)
1055 movdqu 0x70($inp),$xt3
1056 pxor 0x10(%rsp),$xt0
1057 pxor $xb1,$xt1
1058 pxor $xc1,$xt2
1059 pxor $xd1,$xt3
1060 movdqu $xt0,0x40($out)
1061 movdqu $xt1,0x50($out)
1062 movdqu $xt2,0x60($out)
1063 movdqu $xt3,0x70($out)
1064 je .Ldone4x
1065
1066 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1067 lea 0x80($inp),$inp # inp+=64*2
1068 xor %r10,%r10
1069 movdqa $xt0,0x00(%rsp)
1070 movdqa $xb2,0x10(%rsp)
1071 lea 0x80($out),$out # out+=64*2
1072 movdqa $xc2,0x20(%rsp)
1073 sub \$128,$len # len-=64*2
1074 movdqa $xd2,0x30(%rsp)
1075 jmp .Loop_tail4x
1076
1077.align 32
1078.L192_or_more4x:
1079 movdqu 0x00($inp),$xt0 # xor with input
1080 movdqu 0x10($inp),$xt1
1081 movdqu 0x20($inp),$xt2
1082 movdqu 0x30($inp),$xt3
1083 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1084 pxor $xb0,$xt1
1085 pxor $xc0,$xt2
1086 pxor $xd0,$xt3
1087
1088 movdqu $xt0,0x00($out)
1089 movdqu 0x40($inp),$xt0
1090 movdqu $xt1,0x10($out)
1091 movdqu 0x50($inp),$xt1
1092 movdqu $xt2,0x20($out)
1093 movdqu 0x60($inp),$xt2
1094 movdqu $xt3,0x30($out)
1095 movdqu 0x70($inp),$xt3
1096 lea 0x80($inp),$inp # size optimization
1097 pxor 0x10(%rsp),$xt0
1098 pxor $xb1,$xt1
1099 pxor $xc1,$xt2
1100 pxor $xd1,$xt3
1101
1102 movdqu $xt0,0x40($out)
1103 movdqu 0x00($inp),$xt0
1104 movdqu $xt1,0x50($out)
1105 movdqu 0x10($inp),$xt1
1106 movdqu $xt2,0x60($out)
1107 movdqu 0x20($inp),$xt2
1108 movdqu $xt3,0x70($out)
1109 lea 0x80($out),$out # size optimization
1110 movdqu 0x30($inp),$xt3
1111 pxor 0x20(%rsp),$xt0
1112 pxor $xb2,$xt1
1113 pxor $xc2,$xt2
1114 pxor $xd2,$xt3
1115 movdqu $xt0,0x00($out)
1116 movdqu $xt1,0x10($out)
1117 movdqu $xt2,0x20($out)
1118 movdqu $xt3,0x30($out)
1119 je .Ldone4x
1120
1121 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1122 lea 0x40($inp),$inp # inp+=64*3
1123 xor %r10,%r10
1124 movdqa $xt0,0x00(%rsp)
1125 movdqa $xb3,0x10(%rsp)
1126 lea 0x40($out),$out # out+=64*3
1127 movdqa $xc3,0x20(%rsp)
1128 sub \$192,$len # len-=64*3
1129 movdqa $xd3,0x30(%rsp)
1130
1131.Loop_tail4x:
1132 movzb ($inp,%r10),%eax
1133 movzb (%rsp,%r10),%ecx
1134 lea 1(%r10),%r10
1135 xor %ecx,%eax
1136 mov %al,-1($out,%r10)
1137 dec $len
1138 jnz .Loop_tail4x
1139
1140.Ldone4x:
1141___
1142$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001143 movaps -0xa8(%r9),%xmm6
1144 movaps -0x98(%r9),%xmm7
1145 movaps -0x88(%r9),%xmm8
1146 movaps -0x78(%r9),%xmm9
1147 movaps -0x68(%r9),%xmm10
1148 movaps -0x58(%r9),%xmm11
1149 movaps -0x48(%r9),%xmm12
1150 movaps -0x38(%r9),%xmm13
1151 movaps -0x28(%r9),%xmm14
1152 movaps -0x18(%r9),%xmm15
David Benjamin4969cc92016-04-22 15:02:23 -04001153___
1154$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08001155 lea (%r9),%rsp
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001156.cfi_def_cfa_register rsp
Robert Sloana94fe052017-02-21 08:49:28 -08001157.L4x_epilogue:
David Benjamin4969cc92016-04-22 15:02:23 -04001158 ret
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001159.cfi_endproc
David Benjamin4969cc92016-04-22 15:02:23 -04001160.size ChaCha20_4x,.-ChaCha20_4x
1161___
1162}
1163
1164########################################################################
1165# AVX2 code path
1166if ($avx>1) {
1167my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1168 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1169my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1170 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1171
1172sub AVX2_lane_ROUND {
1173my ($a0,$b0,$c0,$d0)=@_;
1174my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1175my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1176my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1177my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1178my @x=map("\"$_\"",@xx);
1179
1180 # Consider order in which variables are addressed by their
1181 # index:
1182 #
1183 # a b c d
1184 #
1185 # 0 4 8 12 < even round
1186 # 1 5 9 13
1187 # 2 6 10 14
1188 # 3 7 11 15
1189 # 0 5 10 15 < odd round
1190 # 1 6 11 12
1191 # 2 7 8 13
1192 # 3 4 9 14
1193 #
1194 # 'a', 'b' and 'd's are permanently allocated in registers,
1195 # @x[0..7,12..15], while 'c's are maintained in memory. If
1196 # you observe 'c' column, you'll notice that pair of 'c's is
1197 # invariant between rounds. This means that we have to reload
1198 # them once per round, in the middle. This is why you'll see
1199 # bunch of 'c' stores and loads in the middle, but none in
1200 # the beginning or end.
1201
1202 (
1203 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1204 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1205 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1206 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1207 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1208 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1209
1210 "&vpaddd ($xc,$xc,@x[$d0])",
1211 "&vpxor (@x[$b0],$xc,@x[$b0])",
1212 "&vpslld ($t0,@x[$b0],12)",
1213 "&vpsrld (@x[$b0],@x[$b0],20)",
1214 "&vpor (@x[$b0],$t0,@x[$b0])",
1215 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1216 "&vpaddd ($xc_,$xc_,@x[$d1])",
1217 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1218 "&vpslld ($t1,@x[$b1],12)",
1219 "&vpsrld (@x[$b1],@x[$b1],20)",
1220 "&vpor (@x[$b1],$t1,@x[$b1])",
1221
1222 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1223 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1224 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1225 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1226 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1227 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1228
1229 "&vpaddd ($xc,$xc,@x[$d0])",
1230 "&vpxor (@x[$b0],$xc,@x[$b0])",
1231 "&vpslld ($t1,@x[$b0],7)",
1232 "&vpsrld (@x[$b0],@x[$b0],25)",
1233 "&vpor (@x[$b0],$t1,@x[$b0])",
1234 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1235 "&vpaddd ($xc_,$xc_,@x[$d1])",
1236 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1237 "&vpslld ($t0,@x[$b1],7)",
1238 "&vpsrld (@x[$b1],@x[$b1],25)",
1239 "&vpor (@x[$b1],$t0,@x[$b1])",
1240
1241 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1242 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1243 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1244 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1245
1246 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1247 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1248 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1249 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1250 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1251 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1252
1253 "&vpaddd ($xc,$xc,@x[$d2])",
1254 "&vpxor (@x[$b2],$xc,@x[$b2])",
1255 "&vpslld ($t0,@x[$b2],12)",
1256 "&vpsrld (@x[$b2],@x[$b2],20)",
1257 "&vpor (@x[$b2],$t0,@x[$b2])",
1258 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1259 "&vpaddd ($xc_,$xc_,@x[$d3])",
1260 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1261 "&vpslld ($t1,@x[$b3],12)",
1262 "&vpsrld (@x[$b3],@x[$b3],20)",
1263 "&vpor (@x[$b3],$t1,@x[$b3])",
1264
1265 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1266 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1267 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1268 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1269 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1270 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1271
1272 "&vpaddd ($xc,$xc,@x[$d2])",
1273 "&vpxor (@x[$b2],$xc,@x[$b2])",
1274 "&vpslld ($t1,@x[$b2],7)",
1275 "&vpsrld (@x[$b2],@x[$b2],25)",
1276 "&vpor (@x[$b2],$t1,@x[$b2])",
1277 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1278 "&vpaddd ($xc_,$xc_,@x[$d3])",
1279 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1280 "&vpslld ($t0,@x[$b3],7)",
1281 "&vpsrld (@x[$b3],@x[$b3],25)",
1282 "&vpor (@x[$b3],$t0,@x[$b3])"
1283 );
1284}
1285
Robert Sloana94fe052017-02-21 08:49:28 -08001286my $xframe = $win64 ? 0xa8 : 8;
David Benjamin4969cc92016-04-22 15:02:23 -04001287
1288$code.=<<___;
1289.type ChaCha20_8x,\@function,5
1290.align 32
1291ChaCha20_8x:
1292.LChaCha20_8x:
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001293.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08001294 mov %rsp,%r9 # frame register
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001295.cfi_def_cfa_register r9
David Benjamin4969cc92016-04-22 15:02:23 -04001296 sub \$0x280+$xframe,%rsp
1297 and \$-32,%rsp
1298___
1299$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001300 movaps %xmm6,-0xa8(%r9)
1301 movaps %xmm7,-0x98(%r9)
1302 movaps %xmm8,-0x88(%r9)
1303 movaps %xmm9,-0x78(%r9)
1304 movaps %xmm10,-0x68(%r9)
1305 movaps %xmm11,-0x58(%r9)
1306 movaps %xmm12,-0x48(%r9)
1307 movaps %xmm13,-0x38(%r9)
1308 movaps %xmm14,-0x28(%r9)
1309 movaps %xmm15,-0x18(%r9)
1310.L8x_body:
David Benjamin4969cc92016-04-22 15:02:23 -04001311___
1312$code.=<<___;
1313 vzeroupper
David Benjamin4969cc92016-04-22 15:02:23 -04001314
1315 ################ stack layout
1316 # +0x00 SIMD equivalent of @x[8-12]
1317 # ...
1318 # +0x80 constant copy of key[0-2] smashed by lanes
1319 # ...
1320 # +0x200 SIMD counters (with nonce smashed by lanes)
1321 # ...
Robert Sloana94fe052017-02-21 08:49:28 -08001322 # +0x280
David Benjamin4969cc92016-04-22 15:02:23 -04001323
1324 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1325 vbroadcasti128 ($key),$xb3 # key[1]
1326 vbroadcasti128 16($key),$xt3 # key[2]
1327 vbroadcasti128 ($counter),$xd3 # key[3]
1328 lea 0x100(%rsp),%rcx # size optimization
1329 lea 0x200(%rsp),%rax # size optimization
1330 lea .Lrot16(%rip),%r10
1331 lea .Lrot24(%rip),%r11
1332
1333 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1334 vpshufd \$0x55,$xa3,$xa1
1335 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1336 vpshufd \$0xaa,$xa3,$xa2
1337 vmovdqa $xa1,0xa0-0x100(%rcx)
1338 vpshufd \$0xff,$xa3,$xa3
1339 vmovdqa $xa2,0xc0-0x100(%rcx)
1340 vmovdqa $xa3,0xe0-0x100(%rcx)
1341
1342 vpshufd \$0x00,$xb3,$xb0
1343 vpshufd \$0x55,$xb3,$xb1
1344 vmovdqa $xb0,0x100-0x100(%rcx)
1345 vpshufd \$0xaa,$xb3,$xb2
1346 vmovdqa $xb1,0x120-0x100(%rcx)
1347 vpshufd \$0xff,$xb3,$xb3
1348 vmovdqa $xb2,0x140-0x100(%rcx)
1349 vmovdqa $xb3,0x160-0x100(%rcx)
1350
1351 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1352 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1353 vmovdqa $xt0,0x180-0x200(%rax)
1354 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1355 vmovdqa $xt1,0x1a0-0x200(%rax)
1356 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1357 vmovdqa $xt2,0x1c0-0x200(%rax)
1358 vmovdqa $xt3,0x1e0-0x200(%rax)
1359
1360 vpshufd \$0x00,$xd3,$xd0
1361 vpshufd \$0x55,$xd3,$xd1
1362 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1363 vpshufd \$0xaa,$xd3,$xd2
1364 vmovdqa $xd1,0x220-0x200(%rax)
1365 vpshufd \$0xff,$xd3,$xd3
1366 vmovdqa $xd2,0x240-0x200(%rax)
1367 vmovdqa $xd3,0x260-0x200(%rax)
1368
1369 jmp .Loop_enter8x
1370
1371.align 32
1372.Loop_outer8x:
1373 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1374 vmovdqa 0xa0-0x100(%rcx),$xa1
1375 vmovdqa 0xc0-0x100(%rcx),$xa2
1376 vmovdqa 0xe0-0x100(%rcx),$xa3
1377 vmovdqa 0x100-0x100(%rcx),$xb0
1378 vmovdqa 0x120-0x100(%rcx),$xb1
1379 vmovdqa 0x140-0x100(%rcx),$xb2
1380 vmovdqa 0x160-0x100(%rcx),$xb3
1381 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1382 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1383 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1384 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1385 vmovdqa 0x200-0x200(%rax),$xd0
1386 vmovdqa 0x220-0x200(%rax),$xd1
1387 vmovdqa 0x240-0x200(%rax),$xd2
1388 vmovdqa 0x260-0x200(%rax),$xd3
1389 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1390
1391.Loop_enter8x:
1392 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1393 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1394 vbroadcasti128 (%r10),$xt3
1395 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1396 mov \$10,%eax
1397 jmp .Loop8x
1398
1399.align 32
1400.Loop8x:
1401___
1402 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1403 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1404$code.=<<___;
1405 dec %eax
1406 jnz .Loop8x
1407
1408 lea 0x200(%rsp),%rax # size optimization
1409 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1410 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1411 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1412 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1413
1414 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1415 vpunpckldq $xa3,$xa2,$xt3
1416 vpunpckhdq $xa1,$xa0,$xa0
1417 vpunpckhdq $xa3,$xa2,$xa2
1418 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1419 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1420 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1421 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1422___
1423 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1424$code.=<<___;
1425 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1426 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1427 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1428 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1429
1430 vpunpckldq $xb1,$xb0,$xt2
1431 vpunpckldq $xb3,$xb2,$xt3
1432 vpunpckhdq $xb1,$xb0,$xb0
1433 vpunpckhdq $xb3,$xb2,$xb2
1434 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1435 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1436 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1437 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1438___
1439 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1440$code.=<<___;
1441 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1442 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1443 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1444 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1445 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1446 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1447 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1448 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1449___
1450 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1451 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1452$code.=<<___;
1453 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1454 vmovdqa $xa1,0x20(%rsp)
1455 vmovdqa 0x40(%rsp),$xc2 # $xa0
1456 vmovdqa 0x60(%rsp),$xc3 # $xa1
1457
1458 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1459 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1460 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1461 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1462
1463 vpunpckldq $xc1,$xc0,$xt2
1464 vpunpckldq $xc3,$xc2,$xt3
1465 vpunpckhdq $xc1,$xc0,$xc0
1466 vpunpckhdq $xc3,$xc2,$xc2
1467 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1468 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1469 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1470 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1471___
1472 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1473$code.=<<___;
1474 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1475 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1476 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1477 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1478
1479 vpunpckldq $xd1,$xd0,$xt2
1480 vpunpckldq $xd3,$xd2,$xt3
1481 vpunpckhdq $xd1,$xd0,$xd0
1482 vpunpckhdq $xd3,$xd2,$xd2
1483 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1484 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1485 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1486 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1487___
1488 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1489$code.=<<___;
1490 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1491 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1492 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1493 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1494 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1495 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1496 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1497 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1498___
1499 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1500 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1501 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1502 ($xa0,$xa1)=($xt2,$xt3);
1503$code.=<<___;
1504 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1505 vmovdqa 0x20(%rsp),$xa1
1506
1507 cmp \$64*8,$len
1508 jb .Ltail8x
1509
1510 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1511 vpxor 0x20($inp),$xb0,$xb0
1512 vpxor 0x40($inp),$xc0,$xc0
1513 vpxor 0x60($inp),$xd0,$xd0
1514 lea 0x80($inp),$inp # size optimization
1515 vmovdqu $xa0,0x00($out)
1516 vmovdqu $xb0,0x20($out)
1517 vmovdqu $xc0,0x40($out)
1518 vmovdqu $xd0,0x60($out)
1519 lea 0x80($out),$out # size optimization
1520
1521 vpxor 0x00($inp),$xa1,$xa1
1522 vpxor 0x20($inp),$xb1,$xb1
1523 vpxor 0x40($inp),$xc1,$xc1
1524 vpxor 0x60($inp),$xd1,$xd1
1525 lea 0x80($inp),$inp # size optimization
1526 vmovdqu $xa1,0x00($out)
1527 vmovdqu $xb1,0x20($out)
1528 vmovdqu $xc1,0x40($out)
1529 vmovdqu $xd1,0x60($out)
1530 lea 0x80($out),$out # size optimization
1531
1532 vpxor 0x00($inp),$xa2,$xa2
1533 vpxor 0x20($inp),$xb2,$xb2
1534 vpxor 0x40($inp),$xc2,$xc2
1535 vpxor 0x60($inp),$xd2,$xd2
1536 lea 0x80($inp),$inp # size optimization
1537 vmovdqu $xa2,0x00($out)
1538 vmovdqu $xb2,0x20($out)
1539 vmovdqu $xc2,0x40($out)
1540 vmovdqu $xd2,0x60($out)
1541 lea 0x80($out),$out # size optimization
1542
1543 vpxor 0x00($inp),$xa3,$xa3
1544 vpxor 0x20($inp),$xb3,$xb3
1545 vpxor 0x40($inp),$xc3,$xc3
1546 vpxor 0x60($inp),$xd3,$xd3
1547 lea 0x80($inp),$inp # size optimization
1548 vmovdqu $xa3,0x00($out)
1549 vmovdqu $xb3,0x20($out)
1550 vmovdqu $xc3,0x40($out)
1551 vmovdqu $xd3,0x60($out)
1552 lea 0x80($out),$out # size optimization
1553
1554 sub \$64*8,$len
1555 jnz .Loop_outer8x
1556
1557 jmp .Ldone8x
1558
1559.Ltail8x:
1560 cmp \$448,$len
1561 jae .L448_or_more8x
1562 cmp \$384,$len
1563 jae .L384_or_more8x
1564 cmp \$320,$len
1565 jae .L320_or_more8x
1566 cmp \$256,$len
1567 jae .L256_or_more8x
1568 cmp \$192,$len
1569 jae .L192_or_more8x
1570 cmp \$128,$len
1571 jae .L128_or_more8x
1572 cmp \$64,$len
1573 jae .L64_or_more8x
1574
1575 xor %r10,%r10
1576 vmovdqa $xa0,0x00(%rsp)
1577 vmovdqa $xb0,0x20(%rsp)
1578 jmp .Loop_tail8x
1579
1580.align 32
1581.L64_or_more8x:
1582 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1583 vpxor 0x20($inp),$xb0,$xb0
1584 vmovdqu $xa0,0x00($out)
1585 vmovdqu $xb0,0x20($out)
1586 je .Ldone8x
1587
1588 lea 0x40($inp),$inp # inp+=64*1
1589 xor %r10,%r10
1590 vmovdqa $xc0,0x00(%rsp)
1591 lea 0x40($out),$out # out+=64*1
1592 sub \$64,$len # len-=64*1
1593 vmovdqa $xd0,0x20(%rsp)
1594 jmp .Loop_tail8x
1595
1596.align 32
1597.L128_or_more8x:
1598 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1599 vpxor 0x20($inp),$xb0,$xb0
1600 vpxor 0x40($inp),$xc0,$xc0
1601 vpxor 0x60($inp),$xd0,$xd0
1602 vmovdqu $xa0,0x00($out)
1603 vmovdqu $xb0,0x20($out)
1604 vmovdqu $xc0,0x40($out)
1605 vmovdqu $xd0,0x60($out)
1606 je .Ldone8x
1607
1608 lea 0x80($inp),$inp # inp+=64*2
1609 xor %r10,%r10
1610 vmovdqa $xa1,0x00(%rsp)
1611 lea 0x80($out),$out # out+=64*2
1612 sub \$128,$len # len-=64*2
1613 vmovdqa $xb1,0x20(%rsp)
1614 jmp .Loop_tail8x
1615
1616.align 32
1617.L192_or_more8x:
1618 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1619 vpxor 0x20($inp),$xb0,$xb0
1620 vpxor 0x40($inp),$xc0,$xc0
1621 vpxor 0x60($inp),$xd0,$xd0
1622 vpxor 0x80($inp),$xa1,$xa1
1623 vpxor 0xa0($inp),$xb1,$xb1
1624 vmovdqu $xa0,0x00($out)
1625 vmovdqu $xb0,0x20($out)
1626 vmovdqu $xc0,0x40($out)
1627 vmovdqu $xd0,0x60($out)
1628 vmovdqu $xa1,0x80($out)
1629 vmovdqu $xb1,0xa0($out)
1630 je .Ldone8x
1631
1632 lea 0xc0($inp),$inp # inp+=64*3
1633 xor %r10,%r10
1634 vmovdqa $xc1,0x00(%rsp)
1635 lea 0xc0($out),$out # out+=64*3
1636 sub \$192,$len # len-=64*3
1637 vmovdqa $xd1,0x20(%rsp)
1638 jmp .Loop_tail8x
1639
1640.align 32
1641.L256_or_more8x:
1642 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1643 vpxor 0x20($inp),$xb0,$xb0
1644 vpxor 0x40($inp),$xc0,$xc0
1645 vpxor 0x60($inp),$xd0,$xd0
1646 vpxor 0x80($inp),$xa1,$xa1
1647 vpxor 0xa0($inp),$xb1,$xb1
1648 vpxor 0xc0($inp),$xc1,$xc1
1649 vpxor 0xe0($inp),$xd1,$xd1
1650 vmovdqu $xa0,0x00($out)
1651 vmovdqu $xb0,0x20($out)
1652 vmovdqu $xc0,0x40($out)
1653 vmovdqu $xd0,0x60($out)
1654 vmovdqu $xa1,0x80($out)
1655 vmovdqu $xb1,0xa0($out)
1656 vmovdqu $xc1,0xc0($out)
1657 vmovdqu $xd1,0xe0($out)
1658 je .Ldone8x
1659
1660 lea 0x100($inp),$inp # inp+=64*4
1661 xor %r10,%r10
1662 vmovdqa $xa2,0x00(%rsp)
1663 lea 0x100($out),$out # out+=64*4
1664 sub \$256,$len # len-=64*4
1665 vmovdqa $xb2,0x20(%rsp)
1666 jmp .Loop_tail8x
1667
1668.align 32
1669.L320_or_more8x:
1670 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1671 vpxor 0x20($inp),$xb0,$xb0
1672 vpxor 0x40($inp),$xc0,$xc0
1673 vpxor 0x60($inp),$xd0,$xd0
1674 vpxor 0x80($inp),$xa1,$xa1
1675 vpxor 0xa0($inp),$xb1,$xb1
1676 vpxor 0xc0($inp),$xc1,$xc1
1677 vpxor 0xe0($inp),$xd1,$xd1
1678 vpxor 0x100($inp),$xa2,$xa2
1679 vpxor 0x120($inp),$xb2,$xb2
1680 vmovdqu $xa0,0x00($out)
1681 vmovdqu $xb0,0x20($out)
1682 vmovdqu $xc0,0x40($out)
1683 vmovdqu $xd0,0x60($out)
1684 vmovdqu $xa1,0x80($out)
1685 vmovdqu $xb1,0xa0($out)
1686 vmovdqu $xc1,0xc0($out)
1687 vmovdqu $xd1,0xe0($out)
1688 vmovdqu $xa2,0x100($out)
1689 vmovdqu $xb2,0x120($out)
1690 je .Ldone8x
1691
1692 lea 0x140($inp),$inp # inp+=64*5
1693 xor %r10,%r10
1694 vmovdqa $xc2,0x00(%rsp)
1695 lea 0x140($out),$out # out+=64*5
1696 sub \$320,$len # len-=64*5
1697 vmovdqa $xd2,0x20(%rsp)
1698 jmp .Loop_tail8x
1699
1700.align 32
1701.L384_or_more8x:
1702 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1703 vpxor 0x20($inp),$xb0,$xb0
1704 vpxor 0x40($inp),$xc0,$xc0
1705 vpxor 0x60($inp),$xd0,$xd0
1706 vpxor 0x80($inp),$xa1,$xa1
1707 vpxor 0xa0($inp),$xb1,$xb1
1708 vpxor 0xc0($inp),$xc1,$xc1
1709 vpxor 0xe0($inp),$xd1,$xd1
1710 vpxor 0x100($inp),$xa2,$xa2
1711 vpxor 0x120($inp),$xb2,$xb2
1712 vpxor 0x140($inp),$xc2,$xc2
1713 vpxor 0x160($inp),$xd2,$xd2
1714 vmovdqu $xa0,0x00($out)
1715 vmovdqu $xb0,0x20($out)
1716 vmovdqu $xc0,0x40($out)
1717 vmovdqu $xd0,0x60($out)
1718 vmovdqu $xa1,0x80($out)
1719 vmovdqu $xb1,0xa0($out)
1720 vmovdqu $xc1,0xc0($out)
1721 vmovdqu $xd1,0xe0($out)
1722 vmovdqu $xa2,0x100($out)
1723 vmovdqu $xb2,0x120($out)
1724 vmovdqu $xc2,0x140($out)
1725 vmovdqu $xd2,0x160($out)
1726 je .Ldone8x
1727
1728 lea 0x180($inp),$inp # inp+=64*6
1729 xor %r10,%r10
1730 vmovdqa $xa3,0x00(%rsp)
1731 lea 0x180($out),$out # out+=64*6
1732 sub \$384,$len # len-=64*6
1733 vmovdqa $xb3,0x20(%rsp)
1734 jmp .Loop_tail8x
1735
1736.align 32
1737.L448_or_more8x:
1738 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1739 vpxor 0x20($inp),$xb0,$xb0
1740 vpxor 0x40($inp),$xc0,$xc0
1741 vpxor 0x60($inp),$xd0,$xd0
1742 vpxor 0x80($inp),$xa1,$xa1
1743 vpxor 0xa0($inp),$xb1,$xb1
1744 vpxor 0xc0($inp),$xc1,$xc1
1745 vpxor 0xe0($inp),$xd1,$xd1
1746 vpxor 0x100($inp),$xa2,$xa2
1747 vpxor 0x120($inp),$xb2,$xb2
1748 vpxor 0x140($inp),$xc2,$xc2
1749 vpxor 0x160($inp),$xd2,$xd2
1750 vpxor 0x180($inp),$xa3,$xa3
1751 vpxor 0x1a0($inp),$xb3,$xb3
1752 vmovdqu $xa0,0x00($out)
1753 vmovdqu $xb0,0x20($out)
1754 vmovdqu $xc0,0x40($out)
1755 vmovdqu $xd0,0x60($out)
1756 vmovdqu $xa1,0x80($out)
1757 vmovdqu $xb1,0xa0($out)
1758 vmovdqu $xc1,0xc0($out)
1759 vmovdqu $xd1,0xe0($out)
1760 vmovdqu $xa2,0x100($out)
1761 vmovdqu $xb2,0x120($out)
1762 vmovdqu $xc2,0x140($out)
1763 vmovdqu $xd2,0x160($out)
1764 vmovdqu $xa3,0x180($out)
1765 vmovdqu $xb3,0x1a0($out)
1766 je .Ldone8x
1767
1768 lea 0x1c0($inp),$inp # inp+=64*7
1769 xor %r10,%r10
1770 vmovdqa $xc3,0x00(%rsp)
1771 lea 0x1c0($out),$out # out+=64*7
1772 sub \$448,$len # len-=64*7
1773 vmovdqa $xd3,0x20(%rsp)
1774
1775.Loop_tail8x:
1776 movzb ($inp,%r10),%eax
1777 movzb (%rsp,%r10),%ecx
1778 lea 1(%r10),%r10
1779 xor %ecx,%eax
1780 mov %al,-1($out,%r10)
1781 dec $len
1782 jnz .Loop_tail8x
1783
1784.Ldone8x:
1785 vzeroall
1786___
1787$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001788 movaps -0xa8(%r9),%xmm6
1789 movaps -0x98(%r9),%xmm7
1790 movaps -0x88(%r9),%xmm8
1791 movaps -0x78(%r9),%xmm9
1792 movaps -0x68(%r9),%xmm10
1793 movaps -0x58(%r9),%xmm11
1794 movaps -0x48(%r9),%xmm12
1795 movaps -0x38(%r9),%xmm13
1796 movaps -0x28(%r9),%xmm14
1797 movaps -0x18(%r9),%xmm15
David Benjamin4969cc92016-04-22 15:02:23 -04001798___
1799$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08001800 lea (%r9),%rsp
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001801.cfi_def_cfa_register rsp
Robert Sloana94fe052017-02-21 08:49:28 -08001802.L8x_epilogue:
David Benjamin4969cc92016-04-22 15:02:23 -04001803 ret
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001804.cfi_endproc
David Benjamin4969cc92016-04-22 15:02:23 -04001805.size ChaCha20_8x,.-ChaCha20_8x
1806___
1807}
1808
Robert Sloana94fe052017-02-21 08:49:28 -08001809########################################################################
1810# AVX512 code paths
1811if ($avx>2) {
1812# This one handles shorter inputs...
David Benjamin4969cc92016-04-22 15:02:23 -04001813
Robert Sloana94fe052017-02-21 08:49:28 -08001814my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
1815my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1816
1817sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
1818 &vpaddd ($a,$a,$b);
1819 &vpxord ($d,$d,$a);
1820 &vprold ($d,$d,16);
1821
1822 &vpaddd ($c,$c,$d);
1823 &vpxord ($b,$b,$c);
1824 &vprold ($b,$b,12);
1825
1826 &vpaddd ($a,$a,$b);
1827 &vpxord ($d,$d,$a);
1828 &vprold ($d,$d,8);
1829
1830 &vpaddd ($c,$c,$d);
1831 &vpxord ($b,$b,$c);
1832 &vprold ($b,$b,7);
1833}
1834
1835my $xframe = $win64 ? 32+8 : 8;
1836
1837$code.=<<___;
1838.type ChaCha20_avx512,\@function,5
1839.align 32
1840ChaCha20_avx512:
1841.LChaCha20_avx512:
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001842.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08001843 mov %rsp,%r9 # frame pointer
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001844.cfi_def_cfa_register r9
Robert Sloana94fe052017-02-21 08:49:28 -08001845 cmp \$512,$len
1846 ja .LChaCha20_16x
1847
1848 sub \$64+$xframe,%rsp
1849___
1850$code.=<<___ if ($win64);
1851 movaps %xmm6,-0x28(%r9)
1852 movaps %xmm7,-0x18(%r9)
1853.Lavx512_body:
1854___
1855$code.=<<___;
1856 vbroadcasti32x4 .Lsigma(%rip),$a
1857 vbroadcasti32x4 ($key),$b
1858 vbroadcasti32x4 16($key),$c
1859 vbroadcasti32x4 ($counter),$d
1860
1861 vmovdqa32 $a,$a_
1862 vmovdqa32 $b,$b_
1863 vmovdqa32 $c,$c_
1864 vpaddd .Lzeroz(%rip),$d,$d
1865 vmovdqa32 .Lfourz(%rip),$fourz
1866 mov \$10,$counter # reuse $counter
1867 vmovdqa32 $d,$d_
1868 jmp .Loop_avx512
1869
1870.align 16
1871.Loop_outer_avx512:
1872 vmovdqa32 $a_,$a
1873 vmovdqa32 $b_,$b
1874 vmovdqa32 $c_,$c
1875 vpaddd $fourz,$d_,$d
1876 mov \$10,$counter
1877 vmovdqa32 $d,$d_
1878 jmp .Loop_avx512
1879
1880.align 32
1881.Loop_avx512:
1882___
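	# Between the two quarter-round batches the b/c/d rows are rotated
	# within each 128-bit lane (vpshufd) so that the second AVX512ROUND
	# operates on the diagonals of the state matrix; the second set of
	# shuffles restores column order before the next iteration.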
1883 &AVX512ROUND();
1884 &vpshufd ($c,$c,0b01001110);
1885 &vpshufd ($b,$b,0b00111001);
1886 &vpshufd ($d,$d,0b10010011);
1887
1888 &AVX512ROUND();
1889 &vpshufd ($c,$c,0b01001110);
1890 &vpshufd ($b,$b,0b10010011);
1891 &vpshufd ($d,$d,0b00111001);
1892
1893 &dec ($counter);
1894 &jnz (".Loop_avx512");
1895
1896$code.=<<___;
1897 vpaddd $a_,$a,$a
1898 vpaddd $b_,$b,$b
1899 vpaddd $c_,$c,$c
1900 vpaddd $d_,$d,$d
1901
1902 sub \$64,$len
1903 jb .Ltail64_avx512
1904
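# at least one full block to go: xor the low 128-bit lane of each row with
# the input and store; the upper three 64-byte blocks are peeled off below
# with vextracti32x4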
1905 vpxor 0x00($inp),%x#$a,$t0 # xor with input
1906 vpxor 0x10($inp),%x#$b,$t1
1907 vpxor 0x20($inp),%x#$c,$t2
1908 vpxor 0x30($inp),%x#$d,$t3
1909 lea 0x40($inp),$inp # inp+=64
1910
1911 vmovdqu $t0,0x00($out) # write output
1912 vmovdqu $t1,0x10($out)
1913 vmovdqu $t2,0x20($out)
1914 vmovdqu $t3,0x30($out)
1915 lea 0x40($out),$out # out+=64
1916
1917 jz .Ldone_avx512
1918
1919 vextracti32x4 \$1,$a,$t0
1920 vextracti32x4 \$1,$b,$t1
1921 vextracti32x4 \$1,$c,$t2
1922 vextracti32x4 \$1,$d,$t3
1923
1924 sub \$64,$len
1925 jb .Ltail_avx512
1926
1927 vpxor 0x00($inp),$t0,$t0 # xor with input
1928 vpxor 0x10($inp),$t1,$t1
1929 vpxor 0x20($inp),$t2,$t2
1930 vpxor 0x30($inp),$t3,$t3
1931 lea 0x40($inp),$inp # inp+=64
1932
1933 vmovdqu $t0,0x00($out) # write output
1934 vmovdqu $t1,0x10($out)
1935 vmovdqu $t2,0x20($out)
1936 vmovdqu $t3,0x30($out)
1937 lea 0x40($out),$out # out+=64
1938
1939 jz .Ldone_avx512
1940
1941 vextracti32x4 \$2,$a,$t0
1942 vextracti32x4 \$2,$b,$t1
1943 vextracti32x4 \$2,$c,$t2
1944 vextracti32x4 \$2,$d,$t3
1945
1946 sub \$64,$len
1947 jb .Ltail_avx512
1948
1949 vpxor 0x00($inp),$t0,$t0 # xor with input
1950 vpxor 0x10($inp),$t1,$t1
1951 vpxor 0x20($inp),$t2,$t2
1952 vpxor 0x30($inp),$t3,$t3
1953 lea 0x40($inp),$inp # inp+=64
1954
1955 vmovdqu $t0,0x00($out) # write output
1956 vmovdqu $t1,0x10($out)
1957 vmovdqu $t2,0x20($out)
1958 vmovdqu $t3,0x30($out)
1959 lea 0x40($out),$out # out+=64
1960
1961 jz .Ldone_avx512
1962
1963 vextracti32x4 \$3,$a,$t0
1964 vextracti32x4 \$3,$b,$t1
1965 vextracti32x4 \$3,$c,$t2
1966 vextracti32x4 \$3,$d,$t3
1967
1968 sub \$64,$len
1969 jb .Ltail_avx512
1970
1971 vpxor 0x00($inp),$t0,$t0 # xor with input
1972 vpxor 0x10($inp),$t1,$t1
1973 vpxor 0x20($inp),$t2,$t2
1974 vpxor 0x30($inp),$t3,$t3
1975 lea 0x40($inp),$inp # inp+=64
1976
1977 vmovdqu $t0,0x00($out) # write output
1978 vmovdqu $t1,0x10($out)
1979 vmovdqu $t2,0x20($out)
1980 vmovdqu $t3,0x30($out)
1981 lea 0x40($out),$out # out+=64
1982
1983 jnz .Loop_outer_avx512
1984
1985 jmp .Ldone_avx512
1986
1987.align 16
1988.Ltail64_avx512:
1989 vmovdqa %x#$a,0x00(%rsp)
1990 vmovdqa %x#$b,0x10(%rsp)
1991 vmovdqa %x#$c,0x20(%rsp)
1992 vmovdqa %x#$d,0x30(%rsp)
1993 add \$64,$len
1994 jmp .Loop_tail_avx512
1995
1996.align 16
1997.Ltail_avx512:
1998 vmovdqa $t0,0x00(%rsp)
1999 vmovdqa $t1,0x10(%rsp)
2000 vmovdqa $t2,0x20(%rsp)
2001 vmovdqa $t3,0x30(%rsp)
2002 add \$64,$len
2003
2004.Loop_tail_avx512:
2005 movzb ($inp,$counter),%eax
2006 movzb (%rsp,$counter),%ecx
2007 lea 1($counter),$counter
2008 xor %ecx,%eax
2009 mov %al,-1($out,$counter)
2010 dec $len
2011 jnz .Loop_tail_avx512
2012
2013 vmovdqa32 $a_,0x00(%rsp)
2014
2015.Ldone_avx512:
2016 vzeroall
2017___
2018$code.=<<___ if ($win64);
2019 movaps -0x28(%r9),%xmm6
2020 movaps -0x18(%r9),%xmm7
2021___
2022$code.=<<___;
2023 lea (%r9),%rsp
Robert Sloan4c22c5f2019-03-01 15:53:37 -08002024.cfi_def_cfa_register rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002025.Lavx512_epilogue:
2026 ret
Robert Sloan4c22c5f2019-03-01 15:53:37 -08002027.cfi_endproc
Robert Sloana94fe052017-02-21 08:49:28 -08002028.size ChaCha20_avx512,.-ChaCha20_avx512
2029___
2030}
2031if ($avx>2) {
2032# This one handles longer inputs, more than 512 bytes...
2033
2034my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2035 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2036my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2037 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2038my @key=map("%zmm$_",(16..31));
2039my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2040
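# The 16x path keeps the state "lane-sliced": each of the 16 zmm registers
# holds one state word for 16 independent blocks (the vpshufd
# 0x00/0x55/0xaa/0xff broadcasts below), while the original key material is
# parked in zmm16-zmm31 (@key) and re-added after the rounds.
# AVX512_lane_ROUND emits four quarter-rounds in parallel; the map() lines
# derive the other three index quadruples by rotating within each row
# group, so a call with (0,4,8,12) covers all four columns and a call with
# (0,5,10,15) all four diagonals.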
2041sub AVX512_lane_ROUND {
2042my ($a0,$b0,$c0,$d0)=@_;
2043my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2044my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2045my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2046my @x=map("\"$_\"",@xx);
2047
2048 (
2049 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2050 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2051 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2052 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2053 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2054 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2055 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2056 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2057 "&vprold (@x[$d0],@x[$d0],16)",
2058 "&vprold (@x[$d1],@x[$d1],16)",
2059 "&vprold (@x[$d2],@x[$d2],16)",
2060 "&vprold (@x[$d3],@x[$d3],16)",
2061
2062 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2063 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2064 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2065 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2066 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2067 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2068 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2069 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2070 "&vprold (@x[$b0],@x[$b0],12)",
2071 "&vprold (@x[$b1],@x[$b1],12)",
2072 "&vprold (@x[$b2],@x[$b2],12)",
2073 "&vprold (@x[$b3],@x[$b3],12)",
2074
2075 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2076 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2077 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2078 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2079 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2080 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2081 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2082 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2083 "&vprold (@x[$d0],@x[$d0],8)",
2084 "&vprold (@x[$d1],@x[$d1],8)",
2085 "&vprold (@x[$d2],@x[$d2],8)",
2086 "&vprold (@x[$d3],@x[$d3],8)",
2087
2088 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2089 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2090 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2091 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2092 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2093 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2094 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2095 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2096 "&vprold (@x[$b0],@x[$b0],7)",
2097 "&vprold (@x[$b1],@x[$b1],7)",
2098 "&vprold (@x[$b2],@x[$b2],7)",
2099 "&vprold (@x[$b3],@x[$b3],7)"
2100 );
2101}
2102
2103my $xframe = $win64 ? 0xa8 : 8;
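# 64 bytes of aligned scratch for the partial-block buffer, plus room to
# spill xmm6-xmm15 on win64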
2104
2105$code.=<<___;
2106.type ChaCha20_16x,\@function,5
2107.align 32
2108ChaCha20_16x:
2109.LChaCha20_16x:
Robert Sloan4c22c5f2019-03-01 15:53:37 -08002110.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08002111	mov		%rsp,%r9		# frame pointer
Robert Sloan4c22c5f2019-03-01 15:53:37 -08002112.cfi_def_cfa_register r9
Robert Sloana94fe052017-02-21 08:49:28 -08002113 sub \$64+$xframe,%rsp
2114 and \$-64,%rsp
2115___
2116$code.=<<___ if ($win64);
2117 movaps %xmm6,-0xa8(%r9)
2118 movaps %xmm7,-0x98(%r9)
2119 movaps %xmm8,-0x88(%r9)
2120 movaps %xmm9,-0x78(%r9)
2121 movaps %xmm10,-0x68(%r9)
2122 movaps %xmm11,-0x58(%r9)
2123 movaps %xmm12,-0x48(%r9)
2124 movaps %xmm13,-0x38(%r9)
2125 movaps %xmm14,-0x28(%r9)
2126 movaps %xmm15,-0x18(%r9)
2127.L16x_body:
2128___
2129$code.=<<___;
2130 vzeroupper
2131
2132 lea .Lsigma(%rip),%r10
2133 vbroadcasti32x4 (%r10),$xa3 # key[0]
2134 vbroadcasti32x4 ($key),$xb3 # key[1]
2135 vbroadcasti32x4 16($key),$xc3 # key[2]
2136 vbroadcasti32x4 ($counter),$xd3 # key[3]
2137
2138 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2139 vpshufd \$0x55,$xa3,$xa1
2140 vpshufd \$0xaa,$xa3,$xa2
2141 vpshufd \$0xff,$xa3,$xa3
2142 vmovdqa64 $xa0,@key[0]
2143 vmovdqa64 $xa1,@key[1]
2144 vmovdqa64 $xa2,@key[2]
2145 vmovdqa64 $xa3,@key[3]
2146
2147 vpshufd \$0x00,$xb3,$xb0
2148 vpshufd \$0x55,$xb3,$xb1
2149 vpshufd \$0xaa,$xb3,$xb2
2150 vpshufd \$0xff,$xb3,$xb3
2151 vmovdqa64 $xb0,@key[4]
2152 vmovdqa64 $xb1,@key[5]
2153 vmovdqa64 $xb2,@key[6]
2154 vmovdqa64 $xb3,@key[7]
2155
2156 vpshufd \$0x00,$xc3,$xc0
2157 vpshufd \$0x55,$xc3,$xc1
2158 vpshufd \$0xaa,$xc3,$xc2
2159 vpshufd \$0xff,$xc3,$xc3
2160 vmovdqa64 $xc0,@key[8]
2161 vmovdqa64 $xc1,@key[9]
2162 vmovdqa64 $xc2,@key[10]
2163 vmovdqa64 $xc3,@key[11]
2164
2165 vpshufd \$0x00,$xd3,$xd0
2166 vpshufd \$0x55,$xd3,$xd1
2167 vpshufd \$0xaa,$xd3,$xd2
2168 vpshufd \$0xff,$xd3,$xd3
2169 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
2170 vmovdqa64 $xd0,@key[12]
2171 vmovdqa64 $xd1,@key[13]
2172 vmovdqa64 $xd2,@key[14]
2173 vmovdqa64 $xd3,@key[15]
2174
2175 mov \$10,%eax
2176 jmp .Loop16x
2177
2178.align 32
2179.Loop_outer16x:
2180 vpbroadcastd 0(%r10),$xa0 # reload key
2181 vpbroadcastd 4(%r10),$xa1
2182 vpbroadcastd 8(%r10),$xa2
2183 vpbroadcastd 12(%r10),$xa3
2184 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
2185 vmovdqa64 @key[4],$xb0
2186 vmovdqa64 @key[5],$xb1
2187 vmovdqa64 @key[6],$xb2
2188 vmovdqa64 @key[7],$xb3
2189 vmovdqa64 @key[8],$xc0
2190 vmovdqa64 @key[9],$xc1
2191 vmovdqa64 @key[10],$xc2
2192 vmovdqa64 @key[11],$xc3
2193 vmovdqa64 @key[12],$xd0
2194 vmovdqa64 @key[13],$xd1
2195 vmovdqa64 @key[14],$xd2
2196 vmovdqa64 @key[15],$xd3
2197
2198 vmovdqa64 $xa0,@key[0]
2199 vmovdqa64 $xa1,@key[1]
2200 vmovdqa64 $xa2,@key[2]
2201 vmovdqa64 $xa3,@key[3]
2202
2203 mov \$10,%eax
2204 jmp .Loop16x
2205
2206.align 32
2207.Loop16x:
2208___
2209 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2210 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
2211$code.=<<___;
2212 dec %eax
2213 jnz .Loop16x
2214
2215 vpaddd @key[0],$xa0,$xa0 # accumulate key
2216 vpaddd @key[1],$xa1,$xa1
2217 vpaddd @key[2],$xa2,$xa2
2218 vpaddd @key[3],$xa3,$xa3
2219
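# After the rounds each register still holds one state word for all 16
# blocks.  The vpunpck/vshufi32x4 sequences below transpose that layout so
# that every register ends up holding 64 contiguous keystream bytes, ready
# to be xored with the input.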
2220 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2221 vpunpckldq $xa3,$xa2,$xt3
2222 vpunpckhdq $xa1,$xa0,$xa0
2223 vpunpckhdq $xa3,$xa2,$xa2
2224 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2225 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2226 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2227 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2228___
2229 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2230$code.=<<___;
2231 vpaddd @key[4],$xb0,$xb0
2232 vpaddd @key[5],$xb1,$xb1
2233 vpaddd @key[6],$xb2,$xb2
2234 vpaddd @key[7],$xb3,$xb3
2235
2236 vpunpckldq $xb1,$xb0,$xt2
2237 vpunpckldq $xb3,$xb2,$xt3
2238 vpunpckhdq $xb1,$xb0,$xb0
2239 vpunpckhdq $xb3,$xb2,$xb2
2240 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2241 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2242 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2243 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2244___
2245 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2246$code.=<<___;
2247 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
2248 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
2249 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
2250 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
2251 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
2252 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
2253 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
2254 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
2255___
2256 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2257$code.=<<___;
2258 vpaddd @key[8],$xc0,$xc0
2259 vpaddd @key[9],$xc1,$xc1
2260 vpaddd @key[10],$xc2,$xc2
2261 vpaddd @key[11],$xc3,$xc3
2262
2263 vpunpckldq $xc1,$xc0,$xt2
2264 vpunpckldq $xc3,$xc2,$xt3
2265 vpunpckhdq $xc1,$xc0,$xc0
2266 vpunpckhdq $xc3,$xc2,$xc2
2267 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2268 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2269 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2270 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2271___
2272 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2273$code.=<<___;
2274 vpaddd @key[12],$xd0,$xd0
2275 vpaddd @key[13],$xd1,$xd1
2276 vpaddd @key[14],$xd2,$xd2
2277 vpaddd @key[15],$xd3,$xd3
2278
2279 vpunpckldq $xd1,$xd0,$xt2
2280 vpunpckldq $xd3,$xd2,$xt3
2281 vpunpckhdq $xd1,$xd0,$xd0
2282 vpunpckhdq $xd3,$xd2,$xd2
2283 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2284 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2285 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2286 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2287___
2288 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2289$code.=<<___;
2290 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
2291 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
2292 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
2293 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
2294 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
2295 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
2296 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
2297 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
2298___
2299 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2300$code.=<<___;
2301 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
2302 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
2303 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
2304 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
2305 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
2306 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
2307 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
2308 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
2309 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
2310 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
2311 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
2312 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
2313 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
2314 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
2315 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
2316 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
2317___
2318 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2319 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2320
2321 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2322 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2323 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2324 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2325$code.=<<___;
2326 cmp \$64*16,$len
2327 jb .Ltail16x
2328
2329 vpxord 0x00($inp),$xa0,$xa0 # xor with input
2330 vpxord 0x40($inp),$xb0,$xb0
2331 vpxord 0x80($inp),$xc0,$xc0
2332 vpxord 0xc0($inp),$xd0,$xd0
2333 vmovdqu32 $xa0,0x00($out)
2334 vmovdqu32 $xb0,0x40($out)
2335 vmovdqu32 $xc0,0x80($out)
2336 vmovdqu32 $xd0,0xc0($out)
2337
2338 vpxord 0x100($inp),$xa1,$xa1
2339 vpxord 0x140($inp),$xb1,$xb1
2340 vpxord 0x180($inp),$xc1,$xc1
2341 vpxord 0x1c0($inp),$xd1,$xd1
2342 vmovdqu32 $xa1,0x100($out)
2343 vmovdqu32 $xb1,0x140($out)
2344 vmovdqu32 $xc1,0x180($out)
2345 vmovdqu32 $xd1,0x1c0($out)
2346
2347 vpxord 0x200($inp),$xa2,$xa2
2348 vpxord 0x240($inp),$xb2,$xb2
2349 vpxord 0x280($inp),$xc2,$xc2
2350 vpxord 0x2c0($inp),$xd2,$xd2
2351 vmovdqu32 $xa2,0x200($out)
2352 vmovdqu32 $xb2,0x240($out)
2353 vmovdqu32 $xc2,0x280($out)
2354 vmovdqu32 $xd2,0x2c0($out)
2355
2356 vpxord 0x300($inp),$xa3,$xa3
2357 vpxord 0x340($inp),$xb3,$xb3
2358 vpxord 0x380($inp),$xc3,$xc3
2359 vpxord 0x3c0($inp),$xd3,$xd3
2360 lea 0x400($inp),$inp
2361 vmovdqu32 $xa3,0x300($out)
2362 vmovdqu32 $xb3,0x340($out)
2363 vmovdqu32 $xc3,0x380($out)
2364 vmovdqu32 $xd3,0x3c0($out)
2365 lea 0x400($out),$out
2366
2367 sub \$64*16,$len
2368 jnz .Loop_outer16x
2369
2370 jmp .Ldone16x
2371
2372.align 32
2373.Ltail16x:
2374 xor %r10,%r10
2375 sub $inp,$out
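	# the output pointer now holds the distance out-inp, so the base+index
	# stores below keep hitting the right output address while only the
	# input pointer is advanced 64 bytes at a time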
2376 cmp \$64*1,$len
2377 jb .Less_than_64_16x
2378 vpxord ($inp),$xa0,$xa0 # xor with input
2379 vmovdqu32 $xa0,($out,$inp)
2380 je .Ldone16x
2381 vmovdqa32 $xb0,$xa0
2382 lea 64($inp),$inp
2383
2384 cmp \$64*2,$len
2385 jb .Less_than_64_16x
2386 vpxord ($inp),$xb0,$xb0
2387 vmovdqu32 $xb0,($out,$inp)
2388 je .Ldone16x
2389 vmovdqa32 $xc0,$xa0
2390 lea 64($inp),$inp
2391
2392 cmp \$64*3,$len
2393 jb .Less_than_64_16x
2394 vpxord ($inp),$xc0,$xc0
2395 vmovdqu32 $xc0,($out,$inp)
2396 je .Ldone16x
2397 vmovdqa32 $xd0,$xa0
2398 lea 64($inp),$inp
2399
2400 cmp \$64*4,$len
2401 jb .Less_than_64_16x
2402 vpxord ($inp),$xd0,$xd0
2403 vmovdqu32 $xd0,($out,$inp)
2404 je .Ldone16x
2405 vmovdqa32 $xa1,$xa0
2406 lea 64($inp),$inp
2407
2408 cmp \$64*5,$len
2409 jb .Less_than_64_16x
2410 vpxord ($inp),$xa1,$xa1
2411 vmovdqu32 $xa1,($out,$inp)
2412 je .Ldone16x
2413 vmovdqa32 $xb1,$xa0
2414 lea 64($inp),$inp
2415
2416 cmp \$64*6,$len
2417 jb .Less_than_64_16x
2418 vpxord ($inp),$xb1,$xb1
2419 vmovdqu32 $xb1,($out,$inp)
2420 je .Ldone16x
2421 vmovdqa32 $xc1,$xa0
2422 lea 64($inp),$inp
2423
2424 cmp \$64*7,$len
2425 jb .Less_than_64_16x
2426 vpxord ($inp),$xc1,$xc1
2427 vmovdqu32 $xc1,($out,$inp)
2428 je .Ldone16x
2429 vmovdqa32 $xd1,$xa0
2430 lea 64($inp),$inp
2431
2432 cmp \$64*8,$len
2433 jb .Less_than_64_16x
2434 vpxord ($inp),$xd1,$xd1
2435 vmovdqu32 $xd1,($out,$inp)
2436 je .Ldone16x
2437 vmovdqa32 $xa2,$xa0
2438 lea 64($inp),$inp
2439
2440 cmp \$64*9,$len
2441 jb .Less_than_64_16x
2442 vpxord ($inp),$xa2,$xa2
2443 vmovdqu32 $xa2,($out,$inp)
2444 je .Ldone16x
2445 vmovdqa32 $xb2,$xa0
2446 lea 64($inp),$inp
2447
2448 cmp \$64*10,$len
2449 jb .Less_than_64_16x
2450 vpxord ($inp),$xb2,$xb2
2451 vmovdqu32 $xb2,($out,$inp)
2452 je .Ldone16x
2453 vmovdqa32 $xc2,$xa0
2454 lea 64($inp),$inp
2455
2456 cmp \$64*11,$len
2457 jb .Less_than_64_16x
2458 vpxord ($inp),$xc2,$xc2
2459 vmovdqu32 $xc2,($out,$inp)
2460 je .Ldone16x
2461 vmovdqa32 $xd2,$xa0
2462 lea 64($inp),$inp
2463
2464 cmp \$64*12,$len
2465 jb .Less_than_64_16x
2466 vpxord ($inp),$xd2,$xd2
2467 vmovdqu32 $xd2,($out,$inp)
2468 je .Ldone16x
2469 vmovdqa32 $xa3,$xa0
2470 lea 64($inp),$inp
2471
2472 cmp \$64*13,$len
2473 jb .Less_than_64_16x
2474 vpxord ($inp),$xa3,$xa3
2475 vmovdqu32 $xa3,($out,$inp)
2476 je .Ldone16x
2477 vmovdqa32 $xb3,$xa0
2478 lea 64($inp),$inp
2479
2480 cmp \$64*14,$len
2481 jb .Less_than_64_16x
2482 vpxord ($inp),$xb3,$xb3
2483 vmovdqu32 $xb3,($out,$inp)
2484 je .Ldone16x
2485 vmovdqa32 $xc3,$xa0
2486 lea 64($inp),$inp
2487
2488 cmp \$64*15,$len
2489 jb .Less_than_64_16x
2490 vpxord ($inp),$xc3,$xc3
2491 vmovdqu32 $xc3,($out,$inp)
2492 je .Ldone16x
2493 vmovdqa32 $xd3,$xa0
2494 lea 64($inp),$inp
2495
2496.Less_than_64_16x:
2497 vmovdqa32 $xa0,0x00(%rsp)
2498 lea ($out,$inp),$out
2499 and \$63,$len
2500
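# xor the final 1..63 bytes against the keystream block saved on the stack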
2501.Loop_tail16x:
2502 movzb ($inp,%r10),%eax
2503 movzb (%rsp,%r10),%ecx
2504 lea 1(%r10),%r10
2505 xor %ecx,%eax
2506 mov %al,-1($out,%r10)
2507 dec $len
2508 jnz .Loop_tail16x
2509
2510 vpxord $xa0,$xa0,$xa0
2511 vmovdqa32 $xa0,0(%rsp)
2512
2513.Ldone16x:
2514 vzeroall
2515___
2516$code.=<<___ if ($win64);
2517 movaps -0xa8(%r9),%xmm6
2518 movaps -0x98(%r9),%xmm7
2519 movaps -0x88(%r9),%xmm8
2520 movaps -0x78(%r9),%xmm9
2521 movaps -0x68(%r9),%xmm10
2522 movaps -0x58(%r9),%xmm11
2523 movaps -0x48(%r9),%xmm12
2524 movaps -0x38(%r9),%xmm13
2525 movaps -0x28(%r9),%xmm14
2526 movaps -0x18(%r9),%xmm15
2527___
2528$code.=<<___;
2529 lea (%r9),%rsp
Robert Sloan4c22c5f2019-03-01 15:53:37 -08002530.cfi_def_cfa_register rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002531.L16x_epilogue:
2532 ret
Robert Sloan4c22c5f2019-03-01 15:53:37 -08002533.cfi_endproc
Robert Sloana94fe052017-02-21 08:49:28 -08002534.size ChaCha20_16x,.-ChaCha20_16x
2535___
2536}
2537
2538# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2539# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2540if ($win64) {
2541$rec="%rcx";
2542$frame="%rdx";
2543$context="%r8";
2544$disp="%r9";
2545
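# se_handler unwinds the integer-only ChaCha20_ctr32 frame; ssse3_handler
# and full_handler additionally restore the 2 and 10 XMM registers that the
# SIMD prologues saved relative to the frame pointer kept in %r9.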
2546$code.=<<___;
2547.extern __imp_RtlVirtualUnwind
2548.type se_handler,\@abi-omnipotent
2549.align 16
2550se_handler:
2551 push %rsi
2552 push %rdi
2553 push %rbx
2554 push %rbp
2555 push %r12
2556 push %r13
2557 push %r14
2558 push %r15
2559 pushfq
2560 sub \$64,%rsp
2561
2562 mov 120($context),%rax # pull context->Rax
2563 mov 248($context),%rbx # pull context->Rip
2564
2565 mov 8($disp),%rsi # disp->ImageBase
2566 mov 56($disp),%r11 # disp->HandlerData
2567
2568 lea .Lctr32_body(%rip),%r10
2569	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
2570 jb .Lcommon_seh_tail
2571
2572 mov 152($context),%rax # pull context->Rsp
2573
2574 lea .Lno_data(%rip),%r10 # epilogue label
2575	cmp	%r10,%rbx		# context->Rip>=.Lno_data
2576 jae .Lcommon_seh_tail
2577
2578 lea 64+24+48(%rax),%rax
2579
2580 mov -8(%rax),%rbx
2581 mov -16(%rax),%rbp
2582 mov -24(%rax),%r12
2583 mov -32(%rax),%r13
2584 mov -40(%rax),%r14
2585 mov -48(%rax),%r15
2586 mov %rbx,144($context) # restore context->Rbx
2587 mov %rbp,160($context) # restore context->Rbp
2588 mov %r12,216($context) # restore context->R12
2589 mov %r13,224($context) # restore context->R13
2590 mov %r14,232($context) # restore context->R14
2591	mov	%r15,240($context)	# restore context->R15
2592
2593.Lcommon_seh_tail:
2594 mov 8(%rax),%rdi
2595 mov 16(%rax),%rsi
2596 mov %rax,152($context) # restore context->Rsp
2597 mov %rsi,168($context) # restore context->Rsi
2598 mov %rdi,176($context) # restore context->Rdi
2599
2600 mov 40($disp),%rdi # disp->ContextRecord
2601 mov $context,%rsi # context
2602	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
2603 .long 0xa548f3fc # cld; rep movsq
2604
2605 mov $disp,%rsi
2606 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2607 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2608 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2609 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2610 mov 40(%rsi),%r10 # disp->ContextRecord
2611 lea 56(%rsi),%r11 # &disp->HandlerData
2612 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2613 mov %r10,32(%rsp) # arg5
2614 mov %r11,40(%rsp) # arg6
2615 mov %r12,48(%rsp) # arg7
2616 mov %rcx,56(%rsp) # arg8, (NULL)
2617 call *__imp_RtlVirtualUnwind(%rip)
2618
2619 mov \$1,%eax # ExceptionContinueSearch
2620 add \$64,%rsp
2621 popfq
2622 pop %r15
2623 pop %r14
2624 pop %r13
2625 pop %r12
2626 pop %rbp
2627 pop %rbx
2628 pop %rdi
2629 pop %rsi
2630 ret
2631.size se_handler,.-se_handler
2632
2633.type ssse3_handler,\@abi-omnipotent
2634.align 16
2635ssse3_handler:
2636 push %rsi
2637 push %rdi
2638 push %rbx
2639 push %rbp
2640 push %r12
2641 push %r13
2642 push %r14
2643 push %r15
2644 pushfq
2645 sub \$64,%rsp
2646
2647 mov 120($context),%rax # pull context->Rax
2648 mov 248($context),%rbx # pull context->Rip
2649
2650 mov 8($disp),%rsi # disp->ImageBase
2651 mov 56($disp),%r11 # disp->HandlerData
2652
2653 mov 0(%r11),%r10d # HandlerData[0]
2654 lea (%rsi,%r10),%r10 # prologue label
2655 cmp %r10,%rbx # context->Rip<prologue label
2656 jb .Lcommon_seh_tail
2657
2658 mov 192($context),%rax # pull context->R9
2659
2660 mov 4(%r11),%r10d # HandlerData[1]
2661 lea (%rsi,%r10),%r10 # epilogue label
2662 cmp %r10,%rbx # context->Rip>=epilogue label
2663 jae .Lcommon_seh_tail
2664
2665 lea -0x28(%rax),%rsi
2666 lea 512($context),%rdi # &context.Xmm6
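	# copy the two spilled registers (4 quadwords) back into context->Xmm6/7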
2667 mov \$4,%ecx
2668 .long 0xa548f3fc # cld; rep movsq
2669
2670 jmp .Lcommon_seh_tail
2671.size ssse3_handler,.-ssse3_handler
2672
2673.type full_handler,\@abi-omnipotent
2674.align 16
2675full_handler:
2676 push %rsi
2677 push %rdi
2678 push %rbx
2679 push %rbp
2680 push %r12
2681 push %r13
2682 push %r14
2683 push %r15
2684 pushfq
2685 sub \$64,%rsp
2686
2687 mov 120($context),%rax # pull context->Rax
2688 mov 248($context),%rbx # pull context->Rip
2689
2690 mov 8($disp),%rsi # disp->ImageBase
2691 mov 56($disp),%r11 # disp->HandlerData
2692
2693 mov 0(%r11),%r10d # HandlerData[0]
2694 lea (%rsi,%r10),%r10 # prologue label
2695 cmp %r10,%rbx # context->Rip<prologue label
2696 jb .Lcommon_seh_tail
2697
2698 mov 192($context),%rax # pull context->R9
2699
2700 mov 4(%r11),%r10d # HandlerData[1]
2701 lea (%rsi,%r10),%r10 # epilogue label
2702 cmp %r10,%rbx # context->Rip>=epilogue label
2703 jae .Lcommon_seh_tail
2704
2705 lea -0xa8(%rax),%rsi
2706 lea 512($context),%rdi # &context.Xmm6
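	# copy the ten spilled registers (20 quadwords) back into context->Xmm6-15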
2707 mov \$20,%ecx
2708 .long 0xa548f3fc # cld; rep movsq
2709
2710 jmp .Lcommon_seh_tail
2711.size full_handler,.-full_handler
2712
2713.section .pdata
2714.align 4
2715 .rva .LSEH_begin_ChaCha20_ctr32
2716 .rva .LSEH_end_ChaCha20_ctr32
2717 .rva .LSEH_info_ChaCha20_ctr32
2718
2719 .rva .LSEH_begin_ChaCha20_ssse3
2720 .rva .LSEH_end_ChaCha20_ssse3
2721 .rva .LSEH_info_ChaCha20_ssse3
2722
2723 .rva .LSEH_begin_ChaCha20_4x
2724 .rva .LSEH_end_ChaCha20_4x
2725 .rva .LSEH_info_ChaCha20_4x
2726___
2727$code.=<<___ if ($avx>1);
2728 .rva .LSEH_begin_ChaCha20_8x
2729 .rva .LSEH_end_ChaCha20_8x
2730 .rva .LSEH_info_ChaCha20_8x
2731___
2732$code.=<<___ if ($avx>2);
2733 .rva .LSEH_begin_ChaCha20_avx512
2734 .rva .LSEH_end_ChaCha20_avx512
2735 .rva .LSEH_info_ChaCha20_avx512
2736
2737 .rva .LSEH_begin_ChaCha20_16x
2738 .rva .LSEH_end_ChaCha20_16x
2739 .rva .LSEH_info_ChaCha20_16x
2740___
2741$code.=<<___;
2742.section .xdata
2743.align 8
2744.LSEH_info_ChaCha20_ctr32:
2745 .byte 9,0,0,0
2746 .rva se_handler
2747
2748.LSEH_info_ChaCha20_ssse3:
2749 .byte 9,0,0,0
2750 .rva ssse3_handler
2751 .rva .Lssse3_body,.Lssse3_epilogue
2752
2753.LSEH_info_ChaCha20_4x:
2754 .byte 9,0,0,0
2755 .rva full_handler
2756 .rva .L4x_body,.L4x_epilogue
2757___
2758$code.=<<___ if ($avx>1);
2759.LSEH_info_ChaCha20_8x:
2760 .byte 9,0,0,0
2761 .rva full_handler
2762 .rva .L8x_body,.L8x_epilogue # HandlerData[]
2763___
2764$code.=<<___ if ($avx>2);
2765.LSEH_info_ChaCha20_avx512:
2766 .byte 9,0,0,0
2767 .rva ssse3_handler
2768 .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
2769
2770.LSEH_info_ChaCha20_16x:
2771 .byte 9,0,0,0
2772 .rva full_handler
2773 .rva .L16x_body,.L16x_epilogue # HandlerData[]
2774___
2775}
2776
2777foreach (split("\n",$code)) {
2778 s/\`([^\`]*)\`/eval $1/ge;
2779
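	# "%x#%ymmN"/"%x#%zmmN" denote the 128-bit alias of a wider register;
	# rewrite them as plain "%xmmN" for the assembler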
2780 s/%x#%[yz]/%x/g; # "down-shift"
David Benjamin4969cc92016-04-22 15:02:23 -04002781
2782 print $_,"\n";
2783}
2784
Srinivas Paladugudd42a612019-08-09 19:30:39 +00002785close STDOUT or die "error closing STDOUT: $!";