#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#               1xIALU/gcc      4xSSSE3
# Pentium       17.5/+80%
# PIII          14.2/+60%
# P4            18.6/+84%
# Core2         9.56/+89%       4.83
# Westmere      9.50/+45%       3.35
# Sandy Bridge  10.5/+47%       3.20
# Haswell       8.15/+50%       2.83
# Skylake       7.53/+22%       2.75
# Silvermont    17.4/+36%       8.35
# Goldmont      13.4/+40%       4.36
# Sledgehammer  10.2/+54%
# Bulldozer     13.4/+50%       4.38(*)
#
# (*)   Bulldozer actually executes 4xXOP code path that delivers 3.55;
#
# Modified from upstream OpenSSL to remove the XOP code.

# Locate the directory this script lives in so that the perlasm support
# module can be found both next to the script and in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; it is removed
# from @ARGV so that the checks below look at the remaining arguments.
# The two-argument open form is kept on purpose so that special names
# understood by it (e.g. "-") keep working; failure is now fatal instead
# of silently leaving STDOUT closed.
$output=pop;
open STDOUT,">$output" or die "can't open $output: $!";

# First argument is handed to asm_init as-is; the second tells it whether
# the last remaining argument is the literal "386".
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

$xmm=$ymm=1;		# $xmm gates emission of the SSSE3 code path below
$gasver=999;		# enable everything

# Register allocation for the integer code path.  Each of b, c and d has
# a "shadow" register (the *_ variant); QUARTERROUND swaps the two names
# after every call so the next quarter round works on preloaded values.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

# QUARTERROUND($ai,$bi,$ci,$di,$i)
#
# Emits the integer instructions for one ChaCha quarter round.
#
#   $ai,$bi,$ci,$di	indices (0..15) of the four state words involved;
#			word N lives at 4*N(%esp)
#   $i			position (0..7) of this quarter round inside the
#			double round; it selects which neighbouring words
#			are spilled/reloaded while this round executes
#
# The arithmetic of this quarter round is interleaved with stores of the
# previous quarter round's results and loads for the next one.  The very
# first "add a,b" is not emitted here: it is issued by the preceding call
# (or by the loop prologue for $i==0).  On exit the Perl-level register
# names are swapped with their shadows so that the following call picks
# up the values that were just preloaded.
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the boundaries between even and odd rounds the simple
	# "+1/-1 within the row" rule does not give the right neighbour,
	# so the previous/next indices are rotated per column.
	if ($i==0) {
		my $j=4;
		($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
		my $j=0;
		($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
		my $j=4;
		($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
		my $j=0;
		($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&add	($a,$b);			# see elsewhere
	&xor	($d,$a);
	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
	&rol	($d,16);
	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
	&add	($c,$d);
	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
	&xor	($b,$c);
	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
	&rol	($b,12);
	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
	&add	($a,$b);
	&xor	($d,$a);
	&mov	(&DWP(4*$ai,"esp"),$a);
	&rol	($d,8);
	&mov	($a,&DWP(4*$an,"esp"));
	&add	($c,$d);
	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
	&mov	($d_,$d)			if ($di==$dn);
	&xor	($b,$c);
	 &add	($a,$b_)			if ($i<7);	# elsewhere
	&rol	($b,7);

	# Hand the preloaded shadow registers to the next quarter round.
	($b,$b_)=($b_,$b);
	($c,$c_)=($c_,$c);
	($d,$d_)=($d_,$d);
}

&static_label("ssse3_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

# ChaCha20_ctr32: integer-only code path plus run-time dispatch.
#
# Arguments as used below: wparam(0) output pointer, wparam(1) input
# pointer, wparam(2) length in bytes, wparam(3) key, wparam(4) counter
# and nonce.
#
# Stack frame (allocated with stack_push(33)):
#	  0.. 63(%esp)	working copy of the 16-word state
#	 64..127(%esp)	input block: words 4-11 key, 12-15 counter/nonce
#			(words 0-3 are the constants, used as immediates)
#	128(%esp)	saved round-loop counter
&function_begin("ChaCha20_ctr32");
	&xor	("eax","eax");
	&cmp	("eax",&wparam(2));		# len==0?
	&je	(&label("no_data"));
if ($xmm) {
	# Obtain the address of pic_point in %eax; the SSSE3 path relies on
	# %eax still holding it when it computes the address of ssse3_data.
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
	&jz	(&label("x86"));
	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
	&jz	(&label("x86"));
	&jmp	(&label("ssse3_shortcut"));
&set_label("x86");
}
	&mov	("esi",&wparam(3));		# key
	&mov	("edi",&wparam(4));		# counter and nonce

	&stack_push(33);

	&mov	("eax",&DWP(4*0,"esi"));	# copy key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&mov	(&DWP(64+4*4,"esp"),"eax");
	&mov	(&DWP(64+4*5,"esp"),"ebx");
	&mov	(&DWP(64+4*6,"esp"),"ecx");
	&mov	(&DWP(64+4*7,"esp"),"edx");
	&mov	("eax",&DWP(4*4,"esi"));
	&mov	("ebx",&DWP(4*5,"esi"));
	&mov	("ecx",&DWP(4*6,"esi"));
	&mov	("edx",&DWP(4*7,"esi"));
	&mov	(&DWP(64+4*8,"esp"),"eax");
	&mov	(&DWP(64+4*9,"esp"),"ebx");
	&mov	(&DWP(64+4*10,"esp"),"ecx");
	&mov	(&DWP(64+4*11,"esp"),"edx");
	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("edx",&DWP(4*3,"edi"));
	&sub	("eax",1);			# pre-decrement: "entry" adds 1 back
	&mov	(&DWP(64+4*12,"esp"),"eax");
	&mov	(&DWP(64+4*13,"esp"),"ebx");
	&mov	(&DWP(64+4*14,"esp"),"ecx");
	&mov	(&DWP(64+4*15,"esp"),"edx");
	&jmp	(&label("entry"));

&set_label("outer_loop",16);
	&mov	(&wparam(1),$b);		# save input
	&mov	(&wparam(0),$a);		# save output
	&mov	(&wparam(2),$c);		# save len
&set_label("entry");
	# Initialise the working state for one 64-byte block.  Word 0 is
	# kept in a register, words 1-3 are the remaining constants.
	&mov	($a,0x61707865);
	&mov	(&DWP(4*1,"esp"),0x3320646e);
	&mov	(&DWP(4*2,"esp"),0x79622d32);
	&mov	(&DWP(4*3,"esp"),0x6b206574);

	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
	&mov	($b_,&DWP(64+4*6,"esp"));
	&mov	($c, &DWP(64+4*10,"esp"));
	&mov	($c_,&DWP(64+4*11,"esp"));
	&mov	($d, &DWP(64+4*13,"esp"));
	&mov	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*5,"esp"),$b);
	&mov	(&DWP(4*6,"esp"),$b_);
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	(&DWP(4*11,"esp"),$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	# Words 4, 8, 9, 12 (and 15 via $d_) stay in registers for the
	# first quarter round instead of being written to the stack.
	&mov	($b, &DWP(64+4*7,"esp"));
	&mov	($d_,&DWP(64+4*15,"esp"));
	&mov	($d, &DWP(64+4*12,"esp"));
	&mov	($b_,&DWP(64+4*4,"esp"));
	&mov	($c, &DWP(64+4*8,"esp"));
	&mov	($c_,&DWP(64+4*9,"esp"));
	&add	($d,1);				# counter value
	&mov	(&DWP(4*7,"esp"),$b);
	&mov	(&DWP(4*15,"esp"),$d_);
	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value

	&mov	($b,10);			# loop counter
	&jmp	(&label("loop"));

&set_label("loop",16);
	&add	($a,$b_);			# elsewhere
	&mov	(&DWP(128,"esp"),$b);		# save loop counter
	&mov	($b,$b_);
	# One double round.  Each call swaps the register names with their
	# shadows; after eight calls the names are back where they started.
	&QUARTERROUND(0, 4, 8, 12, 0);
	&QUARTERROUND(1, 5, 9, 13, 1);
	&QUARTERROUND(2, 6,10, 14, 2);
	&QUARTERROUND(3, 7,11, 15, 3);
	&QUARTERROUND(0, 5,10, 15, 4);
	&QUARTERROUND(1, 6,11, 12, 5);
	&QUARTERROUND(2, 7, 8, 13, 6);
	&QUARTERROUND(3, 4, 9, 14, 7);
	&dec	($b);
	&jnz	(&label("loop"));

	&mov	($b,&wparam(2));		# load len

	&add	($a,0x61707865);		# accumulate key material
	&add	($b_,&DWP(64+4*4,"esp"));
	&add	($c, &DWP(64+4*8,"esp"));
	&add	($c_,&DWP(64+4*9,"esp"));

	&cmp	($b,64);
	&jb	(&label("tail"));

	# Full 64-byte block: finish the key-material addition, xor with the
	# input and write the output, six words at a time.
	&mov	($b,&wparam(1));		# load input pointer
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));

	&xor	($a, &DWP(4*0,$b));		# xor with input
	&xor	($b_,&DWP(4*4,$b));
	&mov	(&DWP(4*0,"esp"),$a);		# park word 0; $a is needed for the output pointer
	&mov	($a,&wparam(0));		# load output pointer
	&xor	($c, &DWP(4*8,$b));
	&xor	($c_,&DWP(4*9,$b));
	&xor	($d, &DWP(4*12,$b));
	&xor	($d_,&DWP(4*14,$b));
	&mov	(&DWP(4*4,$a),$b_);		# write output
	&mov	(&DWP(4*8,$a),$c);
	&mov	(&DWP(4*9,$a),$c_);
	&mov	(&DWP(4*12,$a),$d);
	&mov	(&DWP(4*14,$a),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&xor	($b_,&DWP(4*1,$b));
	&xor	($c, &DWP(4*2,$b));
	&xor	($c_,&DWP(4*3,$b));
	&xor	($d, &DWP(4*5,$b));
	&xor	($d_,&DWP(4*6,$b));
	&mov	(&DWP(4*1,$a),$b_);
	&mov	(&DWP(4*2,$a),$c);
	&mov	(&DWP(4*3,$a),$c_);
	&mov	(&DWP(4*5,$a),$d);
	&mov	(&DWP(4*6,$a),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&xor	($b_,&DWP(4*7,$b));
	&xor	($c, &DWP(4*10,$b));
	&xor	($c_,&DWP(4*11,$b));
	&xor	($d, &DWP(4*13,$b));
	&xor	($d_,&DWP(4*15,$b));
	&lea	($b,&DWP(4*16,$b));		# advance input pointer by 64
	&mov	(&DWP(4*7,$a),$b_);
	&mov	($b_,&DWP(4*0,"esp"));		# fetch the parked word 0
	&mov	(&DWP(4*10,$a),$c);
	&mov	($c,&wparam(2));		# len
	&mov	(&DWP(4*11,$a),$c_);
	&mov	(&DWP(4*13,$a),$d);
	&mov	(&DWP(4*15,$a),$d_);
	&mov	(&DWP(4*0,$a),$b_);
	&lea	($a,&DWP(4*16,$a));		# advance output pointer by 64
	&sub	($c,64);
	&jnz	(&label("outer_loop"));

	&jmp	(&label("done"));

&set_label("tail");
	# Fewer than 64 bytes left: materialise the complete key-stream
	# block in 0..63(%esp), then xor it into the output byte by byte.
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	(&DWP(4*4,"esp"),$b_);
	&mov	(&DWP(4*8,"esp"),$c);
	&mov	(&DWP(4*9,"esp"),$c_);
	&mov	(&DWP(4*12,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&mov	(&DWP(4*1,"esp"),$b_);
	&mov	(&DWP(4*2,"esp"),$c);
	&mov	(&DWP(4*3,"esp"),$c_);
	&mov	(&DWP(4*5,"esp"),$d);
	&mov	(&DWP(4*6,"esp"),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&mov	(&DWP(4*7,"esp"),$b_);
	&mov	($b_,&wparam(1));		# load input
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	($c,&wparam(0));		# load output
	&mov	(&DWP(4*11,"esp"),$c_);
	&xor	($c_,$c_);			# byte index = 0
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*15,"esp"),$d_);

	&xor	("eax","eax");
	&xor	("edx","edx");
&set_label("tail_loop");
	# $b still holds the remaining length loaded before the "tail" jump.
	&movb	("al",&BP(0,$c_,$b_));
	&movb	("dl",&BP(0,"esp",$c_));
	&lea	($c_,&DWP(1,$c_));
	&xor	("al","dl");
	&mov	(&BP(-1,$c,$c_),"al");
	&dec	($b);
	&jnz	(&label("tail_loop"));

&set_label("done");
	&stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

# SSSE3 code path.  Entered through the ssse3_shortcut label from
# ChaCha20_ctr32, with %eax holding the address of pic_point.
if ($xmm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# QUARTERROUND_SSSE3($ai,$bi,$ci,$di,$i)
#
# Vector counterpart of QUARTERROUND for the 4-blocks-at-a-time path.
# Row N of the (transposed) state lives at 16*N-128(%ebx); %eax points at
# ssse3_data, whose first two rows are the pshufb masks used for the
# rotations by 16 and by 8.  Arguments have the same meaning as for
# QUARTERROUND.  The leading "paddd a,b; pxor d,a" of each quarter round
# is emitted by the previous call (or by the loop prologue for $i==0),
# and all four register pairs are swapped on exit.
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	if ($i==0) {
		my $j=4;
		($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
		my $j=0;
		($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
		my $j=4;
		($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
		my $j=0;
		($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&paddd	($xa,$xb);			# see elsewhere
	#&pxor	($xd,$xa);			# see elsewhere
	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&pshufb	($xd,&QWP(0,"eax"));		# rot16
	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&paddd	($xc,$xd);
	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&pxor	($xb,$xc);
	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&movdqa	($xa_,$xb);			# borrow as temporary
	&pslld	($xb,12);
	&psrld	($xa_,20);
	&por	($xb,$xa_);
	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
	&paddd	($xa,$xb);
	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&pxor	($xd,$xa);
	 &movdqa (&QWP(16*$ai-128,"ebx"),$xa);
	&pshufb	($xd,&QWP(16,"eax"));		# rot8
	&paddd	($xc,$xd);
	 &movdqa (&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	 &movdqa ($xd_,$xd)			if ($di==$dn);
	&pxor	($xb,$xc);
	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
	&movdqa	($xa,$xb);			# borrow as temporary
	&pslld	($xb,7);
	&psrld	($xa,25);
	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
	&por	($xb,$xa);

	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}

# ChaCha20_ssse3 takes the same five arguments as ChaCha20_ctr32.
#
# Stack frame: 131 dwords are reserved and %esp is then aligned down to
# 64 bytes; the caller's %esp is kept at 512(%esp).  In the 4x path
#	  0..255(%esp)	working state, addressed as 16*N-128(%ebx)
#	256..511(%esp)	broadcast key material, addressed as 16*N-128(%ebp)
#	512+4, 512+8	saved key and counter pointers
&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
	&mov		($out,&wparam(0));
	&mov		($inp,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		("edx",&wparam(3));		# key
	&mov		("ebx",&wparam(4));		# counter and nonce

	&mov		("ebp","esp");
	&stack_push	(131);
	&and		("esp",-64);
	&mov		(&DWP(512,"esp"),"ebp");

	# %eax = address of ssse3_data (it held the address of pic_point).
	&lea		("eax",&DWP(&label("ssse3_data")."-".
				    &label("pic_point"),"eax"));
	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce

if (defined($gasver) && $gasver>=2.17) {		# even though we encode
							# pshufb manually, we
							# handle only register
							# operands, while this
							# segment uses memory
							# operand...
	&cmp		($len,64*4);
	&jb		(&label("1x"));

	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov		(&DWP(512+8,"esp"),"ebx");
	&sub		($len,64*4);			# bias len
	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization

	# Broadcast every state word into its own vector so that the four
	# lanes of each register belong to four consecutive blocks.
	&movdqu		("xmm7",&QWP(0,"edx"));		# key
	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	 &psubd		("xmm0",&QWP(16*4,"eax"));
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	 &lea		("ebx",&DWP(128,"esp"));	# size optimization

	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");

	&lea		($inp,&DWP(128,$inp));		# size optimization
	&lea		($out,&DWP(128,$out));		# size optimization
	&jmp		(&label("outer_loop"));

&set_label("outer_loop",16);
	# Copy the key material into the working area.  Rows 0, 4, 8, 9
	# and 12 are not stored here: they are loaded straight into the
	# registers the first quarter round expects.
	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&movdqa		($xa, &QWP(16*0-128,"ebp"));
	&movdqa		($xd, "xmm4");
	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
	&movdqa		($xc, &QWP(16*8-128,"ebp"));
	&movdqa		($xc_,&QWP(16*9-128,"ebp"));

	&mov		("edx",10);			# loop counter
	&nop		();

&set_label("loop",16);
	&paddd		($xa,$xb_);			# elsewhere
	&movdqa		($xb,$xb_);
	&pxor		($xd,$xa);			# elsewhere
	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
	&dec		("edx");
	&jnz		(&label("loop"));

	# Flush the rows that are still held only in registers.
	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
	&movdqa		($xa3,&QWP(16*3-128,"ebx"));

    # Four passes, one per group of four state rows ($i is the byte offset
    # of the group).  Each pass transposes the group back into per-block
    # order, xors it with the input and stores 16 bytes into each of the
    # four output blocks.
    for($i=0;$i<256;$i+=64) {
	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));

	&movdqa		($xt2,$xa0);		# "de-interlace" data
	&punpckldq	($xa0,$xa1);
	&movdqa		($xt3,$xa2);
	&punpckldq	($xa2,$xa3);
	&punpckhdq	($xt2,$xa1);
	&punpckhdq	($xt3,$xa3);
	&movdqa		($xa1,$xa0);
	&punpcklqdq	($xa0,$xa2);		# "a0"
	&movdqa		($xa3,$xt2);
	&punpcklqdq	($xt2,$xt3);		# "a2"
	&punpckhqdq	($xa1,$xa2);		# "a1"
	&punpckhqdq	($xa3,$xt3);		# "a3"

	#($xa2,$xt2)=($xt2,$xa2);

	# "a2" was left in $xt2, so the third input block is loaded into
	# $xa2 and the xor/store below use $xt2 as the destination.
	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
	&movdqu		($xt1,&QWP(64*1-128,$inp));
	&movdqu		($xa2,&QWP(64*2-128,$inp));
	&movdqu		($xt3,&QWP(64*3-128,$inp));
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&pxor		($xt0,$xa0);
	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&pxor		($xt1,$xa1);
	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&pxor		($xt2,$xa2);
	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&pxor		($xt3,$xa3);
	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
	&movdqu		(&QWP(64*1-128,$out),$xt1);
	&movdqu		(&QWP(64*2-128,$out),$xt2);
	&movdqu		(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	&sub		($len,64*4);
	&jnc		(&label("outer_loop"));

	&add		($len,64*4);			# undo the bias
	&jz		(&label("done"));

	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea		($inp,&DWP(-128,$inp));
	&mov		("edx",&DWP(512+4,"esp"));
	&lea		($out,&DWP(-128,$out));

	# Rebuild the counter/nonce vector for the 1x path: take the low
	# dword of the saved counter row, add four, and merge it with the
	# remaining three dwords of the caller's counter block.
	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&movdqu		("xmm3",&QWP(0,"ebx"));
	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
	&pand		("xmm3",&QWP(16*7,"eax"));
	&por		("xmm3","xmm2");		# counter value
}
{
# One-block-at-a-time path.  These lexicals shadow the integer register
# names for the rest of this block.
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# Emits one full round on a single block held in $a..$d, one state row per
# register, using $rot16/$rot24 as pshufb masks and $t as scratch.
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot16);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,20);
	&pslld		($t,12);
	&por		($b,$t);

	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot24);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,25);
	&pslld		($t,7);
	&por		($b,$t);
}

&set_label("1x");
	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
	&movdqu		($b,&QWP(0,"edx"));
	&movdqu		($c,&QWP(16,"edx"));
	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
	&movdqa		($rot16,&QWP(0,"eax"));
	&movdqa		($rot24,&QWP(16,"eax"));
	# NOTE(review): this dword is overwritten by the movdqa to
	# 16*3(%esp) a few lines below, so the store looks redundant -
	# confirm before removing.
	&mov		(&DWP(16*3,"esp"),"ebp");

	# Keep the input block in 0..63(%esp) for the final addition.
	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&mov		("edx",10);
	&jmp		(&label("loop1x"));

&set_label("outer1x",16);
	&movdqa		($d,&QWP(16*5,"eax"));		# one
	&movdqa		($a,&QWP(16*0,"esp"));
	&movdqa		($b,&QWP(16*1,"esp"));
	&movdqa		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));
	&mov		("edx",10);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&jmp		(&label("loop1x"));

&set_label("loop1x",16);
	# Two rounds per iteration; the pshufd shuffles rotate rows b, c
	# and d between the rounds and back again afterwards.
	&SSSE3ROUND();
	&pshufd		($c,$c,0b01001110);
	&pshufd		($b,$b,0b00111001);
	&pshufd		($d,$d,0b10010011);
	&nop		();

	&SSSE3ROUND();
	&pshufd		($c,$c,0b01001110);
	&pshufd		($b,$b,0b10010011);
	&pshufd		($d,$d,0b00111001);

	&dec		("edx");
	&jnz		(&label("loop1x"));

	&paddd		($a,&QWP(16*0,"esp"));
	&paddd		($b,&QWP(16*1,"esp"));
	&paddd		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));

	&cmp		($len,64);
	&jb		(&label("tail"));

	&movdqu		($t,&QWP(16*0,$inp));
	&movdqu		($t1,&QWP(16*1,$inp));
	&pxor		($a,$t);		# xor with input
	&movdqu		($t,&QWP(16*2,$inp));
	&pxor		($b,$t1);
	&movdqu		($t1,&QWP(16*3,$inp));
	&pxor		($c,$t);
	&pxor		($d,$t1);
	&lea		($inp,&DWP(16*4,$inp));	# inp+=64

	&movdqu		(&QWP(16*0,$out),$a);	# write output
	&movdqu		(&QWP(16*1,$out),$b);
	&movdqu		(&QWP(16*2,$out),$c);
	&movdqu		(&QWP(16*3,$out),$d);
	&lea		($out,&DWP(16*4,$out));	# out+=64

	&sub		($len,64);
	&jnz		(&label("outer1x"));

	&jmp		(&label("done"));

&set_label("tail");
	# Fewer than 64 bytes left: spill the key-stream block to the stack
	# and xor it into the output one byte at a time.
	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);

	&xor		("eax","eax");
	&xor		("edx","edx");
	&xor		("ebp","ebp");

&set_label("tail_loop");
	&movb		("al",&BP(0,"esp","ebp"));
	&movb		("dl",&BP(0,$inp,"ebp"));
	&lea		("ebp",&DWP(1,"ebp"));
	&xor		("al","dl");
	&movb		(&BP(-1,$out,"ebp"),"al");
	&dec		($len);
	&jnz		(&label("tail_loop"));
}
&set_label("done");
	&mov		("esp",&DWP(512,"esp"));	# restore caller's %esp
&function_end("ChaCha20_ssse3");

# Constant table; the code above addresses its rows as 16*N(%eax).
&align	(64);
&set_label("ssse3_data");
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);	# row 0: rot16 mask
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);	# row 1: rot8 mask
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# row 2: sigma
&data_word(0,1,2,3);		# row 3: per-lane counter offsets
&data_word(4,4,4,4);		# row 4: counter step of the 4x path
&data_word(1,0,0,0);		# row 5: one
&data_word(4,0,0,0);		# row 6: four
&data_word(0,-1,-1,-1);		# row 7: mask clearing the counter dword
&align	(64);
}
# Identification string embedded in the generated output.
&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors (disk full, broken pipe, ...) only surface when
# the handle is closed, so a failed close must fail the build step.
close STDOUT or die "error closing STDOUT: $!";