1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
22# The only thing which is cool about this module is that the very
23# same instruction sequence is used for both SHA-256 and SHA-512. In
24# the former case the instructions operate on 32-bit operands, in the
25# latter on 64-bit ones. All I had to do was get one flavor right,
26# the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to IA-64 implementation, which maintains
33# X[16] in register bank[!], tends to 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35# issue Opteron pipeline and X[16] maintained in memory. So that *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, only *if* it's
40# actually possible to noticeably improve overall ILP, instruction
41# level parallelism, on a given CPU implementation in this case.
42#
43# Special note on Intel EM64T. While Opteron CPU exhibits perfect
44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47# sha256_block:-( This is presumably because 64-bit shifts/rotates
48# apparently are not atomic instructions, but implemented in microcode.
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, alternative
53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54# unfortunately -2% SHA512 on P4 [which nobody should care about
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths, see below for improvement coefficients. SSSE3
60# code path was not attempted for SHA512, because improvement is not
61# estimated to be high enough, noticeably less than 9%, to justify
62# the effort, not on pre-AVX processors. [Obviously with exclusion
63# for VIA Nano, but it has SHA512 instruction that is faster and
64# should be used instead.] For reference, corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with specifics of their architecture [which is topic for
68# separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded to
73# 256-bit %ymm registers, with data from first block to least
74# significant 128-bit halves and data from second to most significant.
75# The data is then processed with same SIMD instruction sequence as
76# for AVX, but with %ymm as operands. Side effect is increased stack
77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
78# code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
88#
89# AMD K8 14.9 - - 9.57 -
90# P4 17.3 - - 30.8 -
91# Core 2 15.6 13.8(+13%) - 9.97 -
92# Westmere 14.8 12.3(+19%) - 9.58 -
93# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
94# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
95# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
96# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
97# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
98# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%)
99# VIA Nano 23.0 16.5(+39%) - 14.7 -
100# Atom 23.0 18.9(+22%) - 14.7 -
101# Silvermont 27.4 20.6(+33%) - 17.5 -
102# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%)
103# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
104#
105# (*) whichever best applicable, including SHAEXT;
106# (**) switch from ror to shrd stands for fair share of improvement;
107# (***) execution time is fully determined by remaining integer-only
108# part, body_00_15; reducing the amount of SIMD instructions
109# below certain limit makes no difference/sense; to conserve
110# space SHA256 XOP code path is therefore omitted;
111#
112# Modified from upstream OpenSSL to remove the XOP code.
113
114$flavour = shift;
115$output = shift;
116if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
117
118$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
119
120$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
121( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
122( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
123die "can't locate x86_64-xlate.pl";
124
125# In upstream, this is controlled by shelling out to the compiler to check
126# versions, but BoringSSL is intended to be used with pre-generated perlasm
127# output, so this isn't useful anyway.
128#
129# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
130# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
131# did not tie them together until after $shaext was added.
132$avx = 1;
133
134# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
135# been tested.
136$shaext=0; ### set to zero if compiling for 1.0.1
137$avx=1 if (!$shaext && $avx);
138
139open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
140*STDOUT=*OUT;
141
142if ($output =~ /512/) {
143 $func="sha512_block_data_order";
144 $TABLE="K512";
145 $SZ=8;
146 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
147 "%r8", "%r9", "%r10","%r11");
148 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
149 @Sigma0=(28,34,39);
150 @Sigma1=(14,18,41);
151 @sigma0=(1, 8, 7);
152 @sigma1=(19,61, 6);
153 $rounds=80;
154} else {
155 $func="sha256_block_data_order";
156 $TABLE="K256";
157 $SZ=4;
158 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
159 "%r8d","%r9d","%r10d","%r11d");
160 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
161 @Sigma0=( 2,13,22);
162 @Sigma1=( 6,11,25);
163 @sigma0=( 7,18, 3);
164 @sigma1=(17,19,10);
165 $rounds=64;
166}
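# The @Sigma*/@sigma* arrays above are the FIPS 180-4 rotation/shift
# amounts: for SHA-256, Sigma0(a)=ROTR2^ROTR13^ROTR22,
# Sigma1(e)=ROTR6^ROTR11^ROTR25, sigma0(x)=ROTR7^ROTR18^SHR3 and
# sigma1(x)=ROTR17^ROTR19^SHR10 (^ denoting XOR); the SHA-512 values are
# the 64-bit counterparts. The round code below applies rotations as
# successive differences of these amounts, which is why only the deltas
# ever appear in the emitted "ror" instructions.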
167
168$ctx="%rdi"; # 1st arg, zapped by $a3
169$inp="%rsi"; # 2nd arg
170$Tbl="%rbp";
171
172$_ctx="16*$SZ+0*8(%rsp)";
173$_inp="16*$SZ+1*8(%rsp)";
174$_end="16*$SZ+2*8(%rsp)";
175$_rsp="`16*$SZ+3*8`(%rsp)";
176$framesz="16*$SZ+4*8";
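# Stack frame layout after alignment: the first 16*$SZ bytes hold the
# 16-entry message schedule X[0..15], followed by four 8-byte slots that
# save the ctx pointer, the input pointer, the end-of-input pointer and a
# copy of the caller's %rsp, in that order ($framesz bytes total).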
177
178
179sub ROUND_00_15()
180{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
181 my $STRIDE=$SZ;
182 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
183
184$code.=<<___;
185 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
186 mov $f,$a2
187
188 xor $e,$a0
189 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
190 xor $g,$a2 # f^g
191
192 mov $T1,`$SZ*($i&0xf)`(%rsp)
193 xor $a,$a1
194 and $e,$a2 # (f^g)&e
195
196 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
197 add $h,$T1 # T1+=h
198 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
199
200 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
201 xor $e,$a0
202 add $a2,$T1 # T1+=Ch(e,f,g)
203
204 mov $a,$a2
205 add ($Tbl),$T1 # T1+=K[round]
206 xor $a,$a1
207
208 xor $b,$a2 # a^b, b^c in next round
209 ror \$$Sigma1[0],$a0 # Sigma1(e)
210 mov $b,$h
211
212 and $a2,$a3
213 ror \$$Sigma0[0],$a1 # Sigma0(a)
214 add $a0,$T1 # T1+=Sigma1(e)
215
216 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
217 add $T1,$d # d+=T1
218 add $T1,$h # h+=T1
219
220 lea $STRIDE($Tbl),$Tbl # round++
221___
222$code.=<<___ if ($i<15);
223 add $a1,$h # h+=Sigma0(a)
224___
225 ($a2,$a3) = ($a3,$a2);
226}
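# A short summary of what ROUND_00_15 emits (one round of the FIPS 180-4
# compression function, with W[i] already in $T1):
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
# Maj(a,b,c) is evaluated as b ^ ((a^b) & (b^c)), which is why $a3 carries
# b^c in from the previous round and $a2 leaves a^b behind for the next
# one. The final "h += Sigma0(a)" is deferred into the following round
# (it is emitted here only for i<15; see the "modulo-scheduled" adds).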
227
228sub ROUND_16_XX()
229{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
230
231$code.=<<___;
232 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
233 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
234
235 mov $a0,$T1
236 ror \$`$sigma0[1]-$sigma0[0]`,$a0
237 add $a1,$a # modulo-scheduled h+=Sigma0(a)
238 mov $a2,$a1
239 ror \$`$sigma1[1]-$sigma1[0]`,$a2
240
241 xor $T1,$a0
242 shr \$$sigma0[2],$T1
243 ror \$$sigma0[0],$a0
244 xor $a1,$a2
245 shr \$$sigma1[2],$a1
246
247 ror \$$sigma1[0],$a2
248 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
249 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
250 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
251
252 add `$SZ*($i&0xf)`(%rsp),$T1
253 mov $e,$a0
254 add $a2,$T1
255 mov $a,$a1
256___
257 &ROUND_00_15(@_);
258}
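# ROUND_16_XX extends the message schedule in the 16-word circular buffer
# kept on the stack:
#	X[i&15] += sigma0(X[(i+1)&15]) + X[(i+9)&15] + sigma1(X[(i+14)&15])
# i.e. the standard W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] +
# sigma1(W[i-2]) recurrence, and then falls through into ROUND_00_15.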
259
260$code=<<___;
261.text
262
263.extern OPENSSL_ia32cap_P
264.globl $func
265.type $func,\@function,3
266.align 16
267$func:
268.cfi_startproc
269___
270$code.=<<___ if ($SZ==4 || $avx);
271 leaq OPENSSL_ia32cap_P(%rip),%r11
272 mov 0(%r11),%r9d
273 mov 4(%r11),%r10d
274 mov 8(%r11),%r11d
275___
276$code.=<<___ if ($SZ==4 && $shaext);
277 test \$`1<<29`,%r11d # check for SHA
278 jnz _shaext_shortcut
279___
280 # XOP codepath removed.
281___
282$code.=<<___ if ($avx>1);
283 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
284 cmp \$`1<<8|1<<5|1<<3`,%r11d
285 je .Lavx2_shortcut
286___
287$code.=<<___ if ($avx);
288 and \$`1<<30`,%r9d # mask "Intel CPU" bit
289 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
290 or %r9d,%r10d
291 cmp \$`1<<28|1<<9|1<<30`,%r10d
292 je .Lavx_shortcut
293___
294$code.=<<___ if ($SZ==4);
295 test \$`1<<9`,%r10d
296 jnz .Lssse3_shortcut
297___
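# Dispatch above (for the code paths that are compiled in): the SHA
# Extensions shortcut is tried first (SHA-256 only), then AVX2 (which also
# requires BMI1 and BMI2), then AVX (only with the synthetic "Intel CPU"
# bit plus AVX and SSSE3), then SSSE3 (SHA-256 only), and finally the
# integer-only code that follows.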
298$code.=<<___;
299 mov %rsp,%rax # copy %rsp
300.cfi_def_cfa_register %rax
301 push %rbx
302.cfi_push %rbx
303 push %rbp
304.cfi_push %rbp
305 push %r12
306.cfi_push %r12
307 push %r13
308.cfi_push %r13
309 push %r14
310.cfi_push %r14
311 push %r15
312.cfi_push %r15
313 shl \$4,%rdx # num*16
314 sub \$$framesz,%rsp
315 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
316 and \$-64,%rsp # align stack frame
317 mov $ctx,$_ctx # save ctx, 1st arg
318 mov $inp,$_inp # save inp, 2nd arg
319 mov %rdx,$_end # save end pointer, "3rd" arg
320 mov %rax,$_rsp # save copy of %rsp
321.cfi_cfa_expression $_rsp,deref,+8
322.Lprologue:
323
324 mov $SZ*0($ctx),$A
325 mov $SZ*1($ctx),$B
326 mov $SZ*2($ctx),$C
327 mov $SZ*3($ctx),$D
328 mov $SZ*4($ctx),$E
329 mov $SZ*5($ctx),$F
330 mov $SZ*6($ctx),$G
331 mov $SZ*7($ctx),$H
332 jmp .Lloop
333
334.align 16
335.Lloop:
336 mov $B,$a3
337 lea $TABLE(%rip),$Tbl
338 xor $C,$a3 # magic
339___
340 for($i=0;$i<16;$i++) {
341 $code.=" mov $SZ*$i($inp),$T1\n";
342 $code.=" mov @ROT[4],$a0\n";
343 $code.=" mov @ROT[0],$a1\n";
344 $code.=" bswap $T1\n";
345 &ROUND_00_15($i,@ROT);
346 unshift(@ROT,pop(@ROT));
347 }
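# Instead of shuffling data between registers at the end of each round,
# the generator rotates the register-name list: unshift(@ROT,pop(@ROT))
# makes the register that held "h" become next round's "a", so the a..h
# renaming costs no instructions at run time.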
348$code.=<<___;
349 jmp .Lrounds_16_xx
350.align 16
351.Lrounds_16_xx:
352___
353 for(;$i<32;$i++) {
354 &ROUND_16_XX($i,@ROT);
355 unshift(@ROT,pop(@ROT));
356 }
357
358$code.=<<___;
359 cmpb \$0,`$SZ-1`($Tbl)
360 jnz .Lrounds_16_xx
361
362 mov $_ctx,$ctx
363 add $a1,$A # modulo-scheduled h+=Sigma0(a)
364 lea 16*$SZ($inp),$inp
365
366 add $SZ*0($ctx),$A
367 add $SZ*1($ctx),$B
368 add $SZ*2($ctx),$C
369 add $SZ*3($ctx),$D
370 add $SZ*4($ctx),$E
371 add $SZ*5($ctx),$F
372 add $SZ*6($ctx),$G
373 add $SZ*7($ctx),$H
374
375 cmp $_end,$inp
376
377 mov $A,$SZ*0($ctx)
378 mov $B,$SZ*1($ctx)
379 mov $C,$SZ*2($ctx)
380 mov $D,$SZ*3($ctx)
381 mov $E,$SZ*4($ctx)
382 mov $F,$SZ*5($ctx)
383 mov $G,$SZ*6($ctx)
384 mov $H,$SZ*7($ctx)
385 jb .Lloop
386
387 mov $_rsp,%rsi
388.cfi_def_cfa %rsi,8
389 mov -48(%rsi),%r15
390.cfi_restore %r15
391 mov -40(%rsi),%r14
392.cfi_restore %r14
393 mov -32(%rsi),%r13
394.cfi_restore %r13
395 mov -24(%rsi),%r12
396.cfi_restore %r12
397 mov -16(%rsi),%rbp
398.cfi_restore %rbp
399 mov -8(%rsi),%rbx
400.cfi_restore %rbx
401 lea (%rsi),%rsp
402.cfi_def_cfa_register %rsp
403.Lepilogue:
404 ret
405.cfi_endproc
406.size $func,.-$func
407___
408
409if ($SZ==4) {
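# Every 16-byte group of round constants below is stored twice so the
# AVX2 path can load 32-byte broadcasts covering both 128-bit lanes; the
# rows after the constants are not K values but the byte-swap mask and the
# two shuffle masks used by the SSSE3/AVX message-schedule code.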
410$code.=<<___;
411.align 64
412.type $TABLE,\@object
413$TABLE:
414 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
415 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
416 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
417 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
418 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
419 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
420 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
421 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
422 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
423 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
424 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
425 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
426 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
427 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
428 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
429 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
430 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
431 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
432 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
433 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
434 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
435 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
436 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
437 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
438 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
439 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
440 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
441 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
442 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
443 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
444 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
445 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
446
447 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
448 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
449 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
450 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
451 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
452 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
453 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
454___
455} else {
456$code.=<<___;
457.align 64
458.type $TABLE,\@object
459$TABLE:
460 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
461 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
462 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
463 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
464 .quad 0x3956c25bf348b538,0x59f111f1b605d019
465 .quad 0x3956c25bf348b538,0x59f111f1b605d019
466 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
467 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
468 .quad 0xd807aa98a3030242,0x12835b0145706fbe
469 .quad 0xd807aa98a3030242,0x12835b0145706fbe
470 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
471 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
472 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
473 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
474 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
475 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
476 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
477 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
478 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
479 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
480 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
481 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
482 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
484 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
485 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
486 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
487 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
488 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
489 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
490 .quad 0x06ca6351e003826f,0x142929670a0e6e70
491 .quad 0x06ca6351e003826f,0x142929670a0e6e70
492 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
493 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
494 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
495 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
496 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
497 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
498 .quad 0x81c2c92e47edaee6,0x92722c851482353b
499 .quad 0x81c2c92e47edaee6,0x92722c851482353b
500 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
501 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
502 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
503 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
504 .quad 0xd192e819d6ef5218,0xd69906245565a910
505 .quad 0xd192e819d6ef5218,0xd69906245565a910
506 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
507 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
508 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
509 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
510 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
511 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
512 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
513 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
514 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
515 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
516 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
517 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
518 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
519 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
520 .quad 0x90befffa23631e28,0xa4506cebde82bde9
521 .quad 0x90befffa23631e28,0xa4506cebde82bde9
522 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
523 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
524 .quad 0xca273eceea26619c,0xd186b8c721c0c207
525 .quad 0xca273eceea26619c,0xd186b8c721c0c207
526 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
527 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
528 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
529 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
530 .quad 0x113f9804bef90dae,0x1b710b35131c471b
531 .quad 0x113f9804bef90dae,0x1b710b35131c471b
532 .quad 0x28db77f523047d84,0x32caab7b40c72493
533 .quad 0x28db77f523047d84,0x32caab7b40c72493
534 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
535 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
536 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
537 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
538 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
539 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
540
541 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
542 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
543 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
544___
545}
546
547######################################################################
548# SIMD code paths
549#
550if ($SZ==4 && $shaext) {{{
551######################################################################
552# Intel SHA Extensions implementation of SHA256 update function.
553#
554my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
555
556my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
557my @MSG=map("%xmm$_",(3..6));
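# Note on the SHA-NI code below: the hash state is kept in the ABEF/CDGH
# packing that sha256rnds2 expects, hence the pshufd/palignr/punpcklqdq
# shuffles on entry and exit. Each sha256rnds2 performs two rounds using
# the two W+K dwords in the low half of $Wi (%xmm0), "pshufd \$0x0e" then
# moves the next two dwords down for the second sha256rnds2 of the group,
# and sha256msg1/sha256msg2 carry out the message-schedule updates.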
558
559$code.=<<___;
560.type sha256_block_data_order_shaext,\@function,3
561.align 64
562sha256_block_data_order_shaext:
563_shaext_shortcut:
564___
565$code.=<<___ if ($win64);
566 lea `-8-5*16`(%rsp),%rsp
567 movaps %xmm6,-8-5*16(%rax)
568 movaps %xmm7,-8-4*16(%rax)
569 movaps %xmm8,-8-3*16(%rax)
570 movaps %xmm9,-8-2*16(%rax)
571 movaps %xmm10,-8-1*16(%rax)
572.Lprologue_shaext:
573___
574$code.=<<___;
575 lea K256+0x80(%rip),$Tbl
576 movdqu ($ctx),$ABEF # DCBA
577 movdqu 16($ctx),$CDGH # HGFE
578 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
579
580 pshufd \$0x1b,$ABEF,$Wi # ABCD
581 pshufd \$0xb1,$ABEF,$ABEF # CDAB
582 pshufd \$0x1b,$CDGH,$CDGH # EFGH
583 movdqa $TMP,$BSWAP # offload
584 palignr \$8,$CDGH,$ABEF # ABEF
585 punpcklqdq $Wi,$CDGH # CDGH
586 jmp .Loop_shaext
587
588.align 16
589.Loop_shaext:
590 movdqu ($inp),@MSG[0]
591 movdqu 0x10($inp),@MSG[1]
592 movdqu 0x20($inp),@MSG[2]
593 pshufb $TMP,@MSG[0]
594 movdqu 0x30($inp),@MSG[3]
595
596 movdqa 0*32-0x80($Tbl),$Wi
597 paddd @MSG[0],$Wi
598 pshufb $TMP,@MSG[1]
599 movdqa $CDGH,$CDGH_SAVE # offload
600 sha256rnds2 $ABEF,$CDGH # 0-3
601 pshufd \$0x0e,$Wi,$Wi
602 nop
603 movdqa $ABEF,$ABEF_SAVE # offload
604 sha256rnds2 $CDGH,$ABEF
605
606 movdqa 1*32-0x80($Tbl),$Wi
607 paddd @MSG[1],$Wi
608 pshufb $TMP,@MSG[2]
609 sha256rnds2 $ABEF,$CDGH # 4-7
610 pshufd \$0x0e,$Wi,$Wi
611 lea 0x40($inp),$inp
612 sha256msg1 @MSG[1],@MSG[0]
613 sha256rnds2 $CDGH,$ABEF
614
615 movdqa 2*32-0x80($Tbl),$Wi
616 paddd @MSG[2],$Wi
617 pshufb $TMP,@MSG[3]
618 sha256rnds2 $ABEF,$CDGH # 8-11
619 pshufd \$0x0e,$Wi,$Wi
620 movdqa @MSG[3],$TMP
621 palignr \$4,@MSG[2],$TMP
622 nop
623 paddd $TMP,@MSG[0]
624 sha256msg1 @MSG[2],@MSG[1]
625 sha256rnds2 $CDGH,$ABEF
626
627 movdqa 3*32-0x80($Tbl),$Wi
628 paddd @MSG[3],$Wi
629 sha256msg2 @MSG[3],@MSG[0]
630 sha256rnds2 $ABEF,$CDGH # 12-15
631 pshufd \$0x0e,$Wi,$Wi
632 movdqa @MSG[0],$TMP
633 palignr \$4,@MSG[3],$TMP
634 nop
635 paddd $TMP,@MSG[1]
636 sha256msg1 @MSG[3],@MSG[2]
637 sha256rnds2 $CDGH,$ABEF
638___
639for($i=4;$i<16-3;$i++) {
640$code.=<<___;
641 movdqa $i*32-0x80($Tbl),$Wi
642 paddd @MSG[0],$Wi
643 sha256msg2 @MSG[0],@MSG[1]
644 sha256rnds2 $ABEF,$CDGH # 16-19...
645 pshufd \$0x0e,$Wi,$Wi
646 movdqa @MSG[1],$TMP
647 palignr \$4,@MSG[0],$TMP
648 nop
649 paddd $TMP,@MSG[2]
650 sha256msg1 @MSG[0],@MSG[3]
651 sha256rnds2 $CDGH,$ABEF
652___
653 push(@MSG,shift(@MSG));
654}
655$code.=<<___;
656 movdqa 13*32-0x80($Tbl),$Wi
657 paddd @MSG[0],$Wi
658 sha256msg2 @MSG[0],@MSG[1]
659 sha256rnds2 $ABEF,$CDGH # 52-55
660 pshufd \$0x0e,$Wi,$Wi
661 movdqa @MSG[1],$TMP
662 palignr \$4,@MSG[0],$TMP
663 sha256rnds2 $CDGH,$ABEF
664 paddd $TMP,@MSG[2]
665
666 movdqa 14*32-0x80($Tbl),$Wi
667 paddd @MSG[1],$Wi
668 sha256rnds2 $ABEF,$CDGH # 56-59
669 pshufd \$0x0e,$Wi,$Wi
670 sha256msg2 @MSG[1],@MSG[2]
671 movdqa $BSWAP,$TMP
672 sha256rnds2 $CDGH,$ABEF
673
674 movdqa 15*32-0x80($Tbl),$Wi
675 paddd @MSG[2],$Wi
676 nop
677 sha256rnds2 $ABEF,$CDGH # 60-63
678 pshufd \$0x0e,$Wi,$Wi
679 dec $num
680 nop
681 sha256rnds2 $CDGH,$ABEF
682
683 paddd $CDGH_SAVE,$CDGH
684 paddd $ABEF_SAVE,$ABEF
685 jnz .Loop_shaext
686
687 pshufd \$0xb1,$CDGH,$CDGH # DCHG
688 pshufd \$0x1b,$ABEF,$TMP # FEBA
689 pshufd \$0xb1,$ABEF,$ABEF # BAFE
690 punpckhqdq $CDGH,$ABEF # DCBA
691 palignr \$8,$TMP,$CDGH # HGFE
692
693 movdqu $ABEF,($ctx)
694 movdqu $CDGH,16($ctx)
695___
696$code.=<<___ if ($win64);
697 movaps -8-5*16(%rax),%xmm6
698 movaps -8-4*16(%rax),%xmm7
699 movaps -8-3*16(%rax),%xmm8
700 movaps -8-2*16(%rax),%xmm9
701 movaps -8-1*16(%rax),%xmm10
702 mov %rax,%rsp
703.Lepilogue_shaext:
704___
705$code.=<<___;
706 ret
707.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
708___
709}}}
710{{{
711
712my $a4=$T1;
713my ($a,$b,$c,$d,$e,$f,$g,$h);
714
715sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
716{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
717 my $arg = pop;
718 $arg = "\$$arg" if ($arg*1 eq $arg);
719 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
720}
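# Any call to an undefined &instruction(dst,src,...) below is caught by
# AUTOLOAD, which simply appends the instruction to $code as text: a
# numeric last argument becomes a "$" immediate and the operand list is
# reversed into AT&T order, mimicking the 32-bit perlasm calling style.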
721
722sub body_00_15 () {
723 (
724 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
725
726 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
727 '&mov ($a,$a1)',
728 '&mov ($a4,$f)',
729
730 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
731 '&xor ($a0,$e)',
732 '&xor ($a4,$g)', # f^g
733
734 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
735 '&xor ($a1,$a)',
736 '&and ($a4,$e)', # (f^g)&e
737
738 '&xor ($a0,$e)',
739 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
740 '&mov ($a2,$a)',
741
742 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
743 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
744 '&xor ($a2,$b)', # a^b, b^c in next round
745
746 '&add ($h,$a4)', # h+=Ch(e,f,g)
747 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
748 '&and ($a3,$a2)', # (b^c)&(a^b)
749
750 '&xor ($a1,$a)',
751 '&add ($h,$a0)', # h+=Sigma1(e)
752 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
753
754 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
755 '&add ($d,$h)', # d+=h
756 '&add ($h,$a3)', # h+=Maj(a,b,c)
757
758 '&mov ($a0,$d)',
759 '&add ($a1,$h);'. # h+=Sigma0(a)
760 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
761 );
762}
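# body_00_15 returns one scalar round as a list of instruction strings
# instead of emitting it directly; the SIMD code paths below eval() these
# strings a few at a time in between vector instructions, interleaving
# integer and SIMD work (see the ILP discussion at the top of the file).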
763
764######################################################################
765# SSSE3 code path
766#
767if ($SZ==4) { # SHA256 only
768my @X = map("%xmm$_",(0..3));
769my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
770
771$code.=<<___;
772.type ${func}_ssse3,\@function,3
773.align 64
774${func}_ssse3:
775.cfi_startproc
776.Lssse3_shortcut:
777 mov %rsp,%rax # copy %rsp
778.cfi_def_cfa_register %rax
779 push %rbx
780.cfi_push %rbx
781 push %rbp
782.cfi_push %rbp
783 push %r12
784.cfi_push %r12
785 push %r13
786.cfi_push %r13
787 push %r14
788.cfi_push %r14
789 push %r15
790.cfi_push %r15
791 shl \$4,%rdx # num*16
792 sub \$`$framesz+$win64*16*4`,%rsp
793 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
794 and \$-64,%rsp # align stack frame
795 mov $ctx,$_ctx # save ctx, 1st arg
796 mov $inp,$_inp # save inp, 2nd arg
797 mov %rdx,$_end # save end pointer, "3rd" arg
798 mov %rax,$_rsp # save copy of %rsp
799.cfi_cfa_expression $_rsp,deref,+8
800___
801$code.=<<___ if ($win64);
802 movaps %xmm6,16*$SZ+32(%rsp)
803 movaps %xmm7,16*$SZ+48(%rsp)
804 movaps %xmm8,16*$SZ+64(%rsp)
805 movaps %xmm9,16*$SZ+80(%rsp)
806___
807$code.=<<___;
808.Lprologue_ssse3:
809
810 mov $SZ*0($ctx),$A
811 mov $SZ*1($ctx),$B
812 mov $SZ*2($ctx),$C
813 mov $SZ*3($ctx),$D
814 mov $SZ*4($ctx),$E
815 mov $SZ*5($ctx),$F
816 mov $SZ*6($ctx),$G
817 mov $SZ*7($ctx),$H
818___
819
820$code.=<<___;
821 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
822 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
823 jmp .Lloop_ssse3
824.align 16
825.Lloop_ssse3:
826 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
827 movdqu 0x00($inp),@X[0]
828 movdqu 0x10($inp),@X[1]
829 movdqu 0x20($inp),@X[2]
830 pshufb $t3,@X[0]
831 movdqu 0x30($inp),@X[3]
832 lea $TABLE(%rip),$Tbl
833 pshufb $t3,@X[1]
834 movdqa 0x00($Tbl),$t0
835 movdqa 0x20($Tbl),$t1
836 pshufb $t3,@X[2]
837 paddd @X[0],$t0
838 movdqa 0x40($Tbl),$t2
839 pshufb $t3,@X[3]
840 movdqa 0x60($Tbl),$t3
841 paddd @X[1],$t1
842 paddd @X[2],$t2
843 paddd @X[3],$t3
844 movdqa $t0,0x00(%rsp)
845 mov $A,$a1
846 movdqa $t1,0x10(%rsp)
847 mov $B,$a3
848 movdqa $t2,0x20(%rsp)
849 xor $C,$a3 # magic
850 movdqa $t3,0x30(%rsp)
851 mov $E,$a0
852 jmp .Lssse3_00_47
853
854.align 16
855.Lssse3_00_47:
856 sub \$`-16*2*$SZ`,$Tbl # size optimization
857___
858sub Xupdate_256_SSSE3 () {
859 (
860 '&movdqa ($t0,@X[1]);',
861 '&movdqa ($t3,@X[3])',
862 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
863 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
864 '&movdqa ($t1,$t0)',
865 '&movdqa ($t2,$t0);',
866 '&psrld ($t0,$sigma0[2])',
867 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
868 '&psrld ($t2,$sigma0[0])',
869 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
870 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
871 '&pxor ($t0,$t2)',
872 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
873 '&pxor ($t0,$t1)',
874 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
875 '&pxor ($t0,$t2);',
876 '&movdqa ($t2,$t3)',
877 '&pxor ($t0,$t1);', # sigma0(X[1..4])
878 '&psrld ($t3,$sigma1[2])',
879 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
880 '&psrlq ($t2,$sigma1[0])',
881 '&pxor ($t3,$t2);',
882 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
883 '&pxor ($t3,$t2)',
884 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
885 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
886 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
887 '&movdqa ($t2,$t3);',
888 '&psrld ($t3,$sigma1[2])',
889 '&psrlq ($t2,$sigma1[0])',
890 '&pxor ($t3,$t2);',
891 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
892 '&pxor ($t3,$t2);',
893 '&movdqa ($t2,16*2*$j."($Tbl)")',
894 '&pshufb ($t3,$t5)',
895 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
896 );
897}
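# Xupdate_256_SSSE3 produces four new schedule words per call:
#	X[0..3] += X[9..12] + sigma0(X[1..4]), then X[0..1] += sigma1(X[14..15])
#	and X[2..3] += sigma1(X[16..17])
# Rotations are synthesized from shift/xor pairs since SSE has no vector
# rotate, and sigma1 is applied two words at a time because X[16..17]
# depend on the freshly computed X[14..15].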
898
899sub SSSE3_256_00_47 () {
900my $j = shift;
901my $body = shift;
902my @X = @_;
903my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
904
905 if (0) {
906 foreach (Xupdate_256_SSSE3()) { # 36 instructions
907 eval;
908 eval(shift(@insns));
909 eval(shift(@insns));
910 eval(shift(@insns));
911 }
912 } else { # squeeze extra 4% on Westmere and 19% on Atom
913 eval(shift(@insns)); #@
914 &movdqa ($t0,@X[1]);
915 eval(shift(@insns));
916 eval(shift(@insns));
917 &movdqa ($t3,@X[3]);
918 eval(shift(@insns)); #@
919 eval(shift(@insns));
920 eval(shift(@insns));
921 eval(shift(@insns)); #@
922 eval(shift(@insns));
923 &palignr ($t0,@X[0],$SZ); # X[1..4]
924 eval(shift(@insns));
925 eval(shift(@insns));
926 &palignr ($t3,@X[2],$SZ); # X[9..12]
927 eval(shift(@insns));
928 eval(shift(@insns));
929 eval(shift(@insns));
930 eval(shift(@insns)); #@
931 &movdqa ($t1,$t0);
932 eval(shift(@insns));
933 eval(shift(@insns));
934 &movdqa ($t2,$t0);
935 eval(shift(@insns)); #@
936 eval(shift(@insns));
937 &psrld ($t0,$sigma0[2]);
938 eval(shift(@insns));
939 eval(shift(@insns));
940 eval(shift(@insns));
941 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
942 eval(shift(@insns)); #@
943 eval(shift(@insns));
944 &psrld ($t2,$sigma0[0]);
945 eval(shift(@insns));
946 eval(shift(@insns));
947 &pshufd ($t3,@X[3],0b11111010); # X[14..15]
948 eval(shift(@insns));
949 eval(shift(@insns)); #@
950 &pslld ($t1,8*$SZ-$sigma0[1]);
951 eval(shift(@insns));
952 eval(shift(@insns));
953 &pxor ($t0,$t2);
954 eval(shift(@insns)); #@
955 eval(shift(@insns));
956 eval(shift(@insns));
957 eval(shift(@insns)); #@
958 &psrld ($t2,$sigma0[1]-$sigma0[0]);
959 eval(shift(@insns));
960 &pxor ($t0,$t1);
961 eval(shift(@insns));
962 eval(shift(@insns));
963 &pslld ($t1,$sigma0[1]-$sigma0[0]);
964 eval(shift(@insns));
965 eval(shift(@insns));
966 &pxor ($t0,$t2);
967 eval(shift(@insns));
968 eval(shift(@insns)); #@
969 &movdqa ($t2,$t3);
970 eval(shift(@insns));
971 eval(shift(@insns));
972 &pxor ($t0,$t1); # sigma0(X[1..4])
973 eval(shift(@insns)); #@
974 eval(shift(@insns));
975 eval(shift(@insns));
976 &psrld ($t3,$sigma1[2]);
977 eval(shift(@insns));
978 eval(shift(@insns));
979 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
980 eval(shift(@insns)); #@
981 eval(shift(@insns));
982 &psrlq ($t2,$sigma1[0]);
983 eval(shift(@insns));
984 eval(shift(@insns));
985 eval(shift(@insns));
986 &pxor ($t3,$t2);
987 eval(shift(@insns)); #@
988 eval(shift(@insns));
989 eval(shift(@insns));
990 eval(shift(@insns)); #@
991 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
992 eval(shift(@insns));
993 eval(shift(@insns));
994 &pxor ($t3,$t2);
995 eval(shift(@insns)); #@
996 eval(shift(@insns));
997 eval(shift(@insns));
998 #&pshufb ($t3,$t4); # sigma1(X[14..15])
999 &pshufd ($t3,$t3,0b10000000);
1000 eval(shift(@insns));
1001 eval(shift(@insns));
1002 eval(shift(@insns));
1003 &psrldq ($t3,8);
1004 eval(shift(@insns));
1005 eval(shift(@insns)); #@
1006 eval(shift(@insns));
1007 eval(shift(@insns));
1008 eval(shift(@insns)); #@
1009 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1010 eval(shift(@insns));
1011 eval(shift(@insns));
1012 eval(shift(@insns));
1013 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
1014 eval(shift(@insns));
1015 eval(shift(@insns)); #@
1016 eval(shift(@insns));
1017 &movdqa ($t2,$t3);
1018 eval(shift(@insns));
1019 eval(shift(@insns));
1020 &psrld ($t3,$sigma1[2]);
1021 eval(shift(@insns));
1022 eval(shift(@insns)); #@
1023 &psrlq ($t2,$sigma1[0]);
1024 eval(shift(@insns));
1025 eval(shift(@insns));
1026 &pxor ($t3,$t2);
1027 eval(shift(@insns)); #@
1028 eval(shift(@insns));
1029 eval(shift(@insns));
1030 eval(shift(@insns)); #@
1031 eval(shift(@insns));
1032 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1033 eval(shift(@insns));
1034 eval(shift(@insns));
1035 eval(shift(@insns));
1036 &pxor ($t3,$t2);
1037 eval(shift(@insns));
1038 eval(shift(@insns));
1039 eval(shift(@insns)); #@
1040 #&pshufb ($t3,$t5);
1041 &pshufd ($t3,$t3,0b00001000);
1042 eval(shift(@insns));
1043 eval(shift(@insns));
1044 &movdqa ($t2,16*2*$j."($Tbl)");
1045 eval(shift(@insns)); #@
1046 eval(shift(@insns));
1047 &pslldq ($t3,8);
1048 eval(shift(@insns));
1049 eval(shift(@insns));
1050 eval(shift(@insns));
1051 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1052 eval(shift(@insns)); #@
1053 eval(shift(@insns));
1054 eval(shift(@insns));
1055 }
1056 &paddd ($t2,@X[0]);
1057 foreach (@insns) { eval; } # remaining instructions
1058 &movdqa (16*$j."(%rsp)",$t2);
1059}
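# Each SSSE3_256_00_47 call schedules four message words, interleaving the
# vector instructions with four rounds' worth of scalar code pulled off
# the @insns queue, then adds the round constants and parks W+K on the
# stack for the scalar rounds to pick up.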
1060
1061 for ($i=0,$j=0; $j<4; $j++) {
1062 &SSSE3_256_00_47($j,\&body_00_15,@X);
1063 push(@X,shift(@X)); # rotate(@X)
1064 }
1065 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1066 &jne (".Lssse3_00_47");
1067
1068 for ($i=0; $i<16; ) {
1069 foreach(body_00_15()) { eval; }
1070 }
1071$code.=<<___;
1072 mov $_ctx,$ctx
1073 mov $a1,$A
1074
1075 add $SZ*0($ctx),$A
1076 lea 16*$SZ($inp),$inp
1077 add $SZ*1($ctx),$B
1078 add $SZ*2($ctx),$C
1079 add $SZ*3($ctx),$D
1080 add $SZ*4($ctx),$E
1081 add $SZ*5($ctx),$F
1082 add $SZ*6($ctx),$G
1083 add $SZ*7($ctx),$H
1084
1085 cmp $_end,$inp
1086
1087 mov $A,$SZ*0($ctx)
1088 mov $B,$SZ*1($ctx)
1089 mov $C,$SZ*2($ctx)
1090 mov $D,$SZ*3($ctx)
1091 mov $E,$SZ*4($ctx)
1092 mov $F,$SZ*5($ctx)
1093 mov $G,$SZ*6($ctx)
1094 mov $H,$SZ*7($ctx)
1095 jb .Lloop_ssse3
1096
1097 mov $_rsp,%rsi
1098.cfi_def_cfa %rsi,8
1099___
1100$code.=<<___ if ($win64);
1101 movaps 16*$SZ+32(%rsp),%xmm6
1102 movaps 16*$SZ+48(%rsp),%xmm7
1103 movaps 16*$SZ+64(%rsp),%xmm8
1104 movaps 16*$SZ+80(%rsp),%xmm9
1105___
1106$code.=<<___;
1107 mov -48(%rsi),%r15
1108.cfi_restore %r15
1109 mov -40(%rsi),%r14
1110.cfi_restore %r14
1111 mov -32(%rsi),%r13
1112.cfi_restore %r13
1113 mov -24(%rsi),%r12
1114.cfi_restore %r12
1115 mov -16(%rsi),%rbp
1116.cfi_restore %rbp
1117 mov -8(%rsi),%rbx
1118.cfi_restore %rbx
1119 lea (%rsi),%rsp
1120.cfi_def_cfa_register %rsp
1121.Lepilogue_ssse3:
1122 ret
1123.cfi_endproc
1124.size ${func}_ssse3,.-${func}_ssse3
1125___
1126}
1127
1128if ($avx) {{
1129######################################################################
1130# AVX+shrd code path
1131#
1132local *ror = sub { &shrd(@_[0],@_) };
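# "shrd \$n,%reg,%reg" with both operands equal is a rotate right by n;
# the AVX path overrides &ror with it because shrd was measured faster
# than ror on some cores (see the (**) note in the table above).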
1133
1134$code.=<<___;
1135.type ${func}_avx,\@function,3
1136.align 64
1137${func}_avx:
1138.cfi_startproc
1139.Lavx_shortcut:
1140 mov %rsp,%rax # copy %rsp
1141.cfi_def_cfa_register %rax
1142 push %rbx
1143.cfi_push %rbx
1144 push %rbp
1145.cfi_push %rbp
1146 push %r12
1147.cfi_push %r12
1148 push %r13
1149.cfi_push %r13
1150 push %r14
1151.cfi_push %r14
1152 push %r15
1153.cfi_push %r15
1154 shl \$4,%rdx # num*16
1155 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1156 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1157 and \$-64,%rsp # align stack frame
1158 mov $ctx,$_ctx # save ctx, 1st arg
1159 mov $inp,$_inp # save inp, 2nd arg
1160 mov %rdx,$_end # save end pointer, "3rd" arg
1161 mov %rax,$_rsp # save copy of %rsp
1162.cfi_cfa_expression $_rsp,deref,+8
1163___
1164$code.=<<___ if ($win64);
1165 movaps %xmm6,16*$SZ+32(%rsp)
1166 movaps %xmm7,16*$SZ+48(%rsp)
1167 movaps %xmm8,16*$SZ+64(%rsp)
1168 movaps %xmm9,16*$SZ+80(%rsp)
1169___
1170$code.=<<___ if ($win64 && $SZ>4);
1171 movaps %xmm10,16*$SZ+96(%rsp)
1172 movaps %xmm11,16*$SZ+112(%rsp)
1173___
1174$code.=<<___;
1175.Lprologue_avx:
1176
1177 vzeroupper
1178 mov $SZ*0($ctx),$A
1179 mov $SZ*1($ctx),$B
1180 mov $SZ*2($ctx),$C
1181 mov $SZ*3($ctx),$D
1182 mov $SZ*4($ctx),$E
1183 mov $SZ*5($ctx),$F
1184 mov $SZ*6($ctx),$G
1185 mov $SZ*7($ctx),$H
1186___
1187 if ($SZ==4) { # SHA256
1188 my @X = map("%xmm$_",(0..3));
1189 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1190
1191$code.=<<___;
1192 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1193 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1194 jmp .Lloop_avx
1195.align 16
1196.Lloop_avx:
1197 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1198 vmovdqu 0x00($inp),@X[0]
1199 vmovdqu 0x10($inp),@X[1]
1200 vmovdqu 0x20($inp),@X[2]
1201 vmovdqu 0x30($inp),@X[3]
1202 vpshufb $t3,@X[0],@X[0]
1203 lea $TABLE(%rip),$Tbl
1204 vpshufb $t3,@X[1],@X[1]
1205 vpshufb $t3,@X[2],@X[2]
1206 vpaddd 0x00($Tbl),@X[0],$t0
1207 vpshufb $t3,@X[3],@X[3]
1208 vpaddd 0x20($Tbl),@X[1],$t1
1209 vpaddd 0x40($Tbl),@X[2],$t2
1210 vpaddd 0x60($Tbl),@X[3],$t3
1211 vmovdqa $t0,0x00(%rsp)
1212 mov $A,$a1
1213 vmovdqa $t1,0x10(%rsp)
1214 mov $B,$a3
1215 vmovdqa $t2,0x20(%rsp)
1216 xor $C,$a3 # magic
1217 vmovdqa $t3,0x30(%rsp)
1218 mov $E,$a0
1219 jmp .Lavx_00_47
1220
1221.align 16
1222.Lavx_00_47:
1223 sub \$`-16*2*$SZ`,$Tbl # size optimization
1224___
1225sub Xupdate_256_AVX () {
1226 (
1227 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1228 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1229 '&vpsrld ($t2,$t0,$sigma0[0]);',
1230 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1231 '&vpsrld ($t3,$t0,$sigma0[2])',
1232 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1233 '&vpxor ($t0,$t3,$t2)',
1234 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1235 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1236 '&vpxor ($t0,$t0,$t1)',
1237 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1238 '&vpxor ($t0,$t0,$t2)',
1239 '&vpsrld ($t2,$t3,$sigma1[2]);',
1240 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1241 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1242 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1243 '&vpxor ($t2,$t2,$t3);',
1244 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1245 '&vpxor ($t2,$t2,$t3)',
1246 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1247 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1248 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1249 '&vpsrld ($t2,$t3,$sigma1[2])',
1250 '&vpsrlq ($t3,$t3,$sigma1[0])',
1251 '&vpxor ($t2,$t2,$t3);',
1252 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1253 '&vpxor ($t2,$t2,$t3)',
1254 '&vpshufb ($t2,$t2,$t5)',
1255 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1256 );
1257}
1258
1259sub AVX_256_00_47 () {
1260my $j = shift;
1261my $body = shift;
1262my @X = @_;
1263my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1264
1265 foreach (Xupdate_256_AVX()) { # 29 instructions
1266 eval;
1267 eval(shift(@insns));
1268 eval(shift(@insns));
1269 eval(shift(@insns));
1270 }
1271 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1272 foreach (@insns) { eval; } # remaining instructions
1273 &vmovdqa (16*$j."(%rsp)",$t2);
1274}
1275
1276 for ($i=0,$j=0; $j<4; $j++) {
1277 &AVX_256_00_47($j,\&body_00_15,@X);
1278 push(@X,shift(@X)); # rotate(@X)
1279 }
1280 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1281 &jne (".Lavx_00_47");
1282
1283 for ($i=0; $i<16; ) {
1284 foreach(body_00_15()) { eval; }
1285 }
1286
1287 } else { # SHA512
1288 my @X = map("%xmm$_",(0..7));
1289 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1290
1291$code.=<<___;
1292 jmp .Lloop_avx
1293.align 16
1294.Lloop_avx:
1295 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1296 vmovdqu 0x00($inp),@X[0]
1297 lea $TABLE+0x80(%rip),$Tbl # size optimization
1298 vmovdqu 0x10($inp),@X[1]
1299 vmovdqu 0x20($inp),@X[2]
1300 vpshufb $t3,@X[0],@X[0]
1301 vmovdqu 0x30($inp),@X[3]
1302 vpshufb $t3,@X[1],@X[1]
1303 vmovdqu 0x40($inp),@X[4]
1304 vpshufb $t3,@X[2],@X[2]
1305 vmovdqu 0x50($inp),@X[5]
1306 vpshufb $t3,@X[3],@X[3]
1307 vmovdqu 0x60($inp),@X[6]
1308 vpshufb $t3,@X[4],@X[4]
1309 vmovdqu 0x70($inp),@X[7]
1310 vpshufb $t3,@X[5],@X[5]
1311 vpaddq -0x80($Tbl),@X[0],$t0
1312 vpshufb $t3,@X[6],@X[6]
1313 vpaddq -0x60($Tbl),@X[1],$t1
1314 vpshufb $t3,@X[7],@X[7]
1315 vpaddq -0x40($Tbl),@X[2],$t2
1316 vpaddq -0x20($Tbl),@X[3],$t3
1317 vmovdqa $t0,0x00(%rsp)
1318 vpaddq 0x00($Tbl),@X[4],$t0
1319 vmovdqa $t1,0x10(%rsp)
1320 vpaddq 0x20($Tbl),@X[5],$t1
1321 vmovdqa $t2,0x20(%rsp)
1322 vpaddq 0x40($Tbl),@X[6],$t2
1323 vmovdqa $t3,0x30(%rsp)
1324 vpaddq 0x60($Tbl),@X[7],$t3
1325 vmovdqa $t0,0x40(%rsp)
1326 mov $A,$a1
1327 vmovdqa $t1,0x50(%rsp)
1328 mov $B,$a3
1329 vmovdqa $t2,0x60(%rsp)
1330 xor $C,$a3 # magic
1331 vmovdqa $t3,0x70(%rsp)
1332 mov $E,$a0
1333 jmp .Lavx_00_47
1334
1335.align 16
1336.Lavx_00_47:
1337 add \$`16*2*$SZ`,$Tbl
1338___
1339sub Xupdate_512_AVX () {
1340 (
1341 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1342 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1343 '&vpsrlq ($t2,$t0,$sigma0[0])',
1344 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1345 '&vpsrlq ($t3,$t0,$sigma0[2])',
1346 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1347 '&vpxor ($t0,$t3,$t2)',
1348 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1349 '&vpxor ($t0,$t0,$t1)',
1350 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1351 '&vpxor ($t0,$t0,$t2)',
1352 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1353 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1354 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1355 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1356 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1357 '&vpxor ($t3,$t3,$t2)',
1358 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1359 '&vpxor ($t3,$t3,$t1)',
1360 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1361 '&vpxor ($t3,$t3,$t2)',
1362 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1363 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1364 );
1365}
1366
1367sub AVX_512_00_47 () {
1368my $j = shift;
1369my $body = shift;
1370my @X = @_;
1371my @insns = (&$body,&$body); # 52 instructions
1372
1373 foreach (Xupdate_512_AVX()) { # 23 instructions
1374 eval;
1375 eval(shift(@insns));
1376 eval(shift(@insns));
1377 }
1378 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1379 foreach (@insns) { eval; } # remaining instructions
1380 &vmovdqa (16*$j."(%rsp)",$t2);
1381}
1382
1383 for ($i=0,$j=0; $j<8; $j++) {
1384 &AVX_512_00_47($j,\&body_00_15,@X);
1385 push(@X,shift(@X)); # rotate(@X)
1386 }
1387 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1388 &jne (".Lavx_00_47");
1389
1390 for ($i=0; $i<16; ) {
1391 foreach(body_00_15()) { eval; }
1392 }
1393}
1394$code.=<<___;
1395 mov $_ctx,$ctx
1396 mov $a1,$A
1397
1398 add $SZ*0($ctx),$A
1399 lea 16*$SZ($inp),$inp
1400 add $SZ*1($ctx),$B
1401 add $SZ*2($ctx),$C
1402 add $SZ*3($ctx),$D
1403 add $SZ*4($ctx),$E
1404 add $SZ*5($ctx),$F
1405 add $SZ*6($ctx),$G
1406 add $SZ*7($ctx),$H
1407
1408 cmp $_end,$inp
1409
1410 mov $A,$SZ*0($ctx)
1411 mov $B,$SZ*1($ctx)
1412 mov $C,$SZ*2($ctx)
1413 mov $D,$SZ*3($ctx)
1414 mov $E,$SZ*4($ctx)
1415 mov $F,$SZ*5($ctx)
1416 mov $G,$SZ*6($ctx)
1417 mov $H,$SZ*7($ctx)
1418 jb .Lloop_avx
1419
1420 mov $_rsp,%rsi
1421.cfi_def_cfa %rsi,8
1422 vzeroupper
1423___
1424$code.=<<___ if ($win64);
1425 movaps 16*$SZ+32(%rsp),%xmm6
1426 movaps 16*$SZ+48(%rsp),%xmm7
1427 movaps 16*$SZ+64(%rsp),%xmm8
1428 movaps 16*$SZ+80(%rsp),%xmm9
1429___
1430$code.=<<___ if ($win64 && $SZ>4);
1431 movaps 16*$SZ+96(%rsp),%xmm10
1432 movaps 16*$SZ+112(%rsp),%xmm11
1433___
1434$code.=<<___;
1435 mov -48(%rsi),%r15
1436.cfi_restore %r15
1437 mov -40(%rsi),%r14
1438.cfi_restore %r14
1439 mov -32(%rsi),%r13
1440.cfi_restore %r13
1441 mov -24(%rsi),%r12
1442.cfi_restore %r12
1443 mov -16(%rsi),%rbp
1444.cfi_restore %rbp
1445 mov -8(%rsi),%rbx
1446.cfi_restore %rbx
1447 lea (%rsi),%rsp
1448.cfi_def_cfa_register %rsp
1449.Lepilogue_avx:
1450 ret
1451.cfi_endproc
1452.size ${func}_avx,.-${func}_avx
1453___
1454
1455if ($avx>1) {{
1456######################################################################
1457# AVX2+BMI code path
1458#
1459my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1460my $PUSH8=8*2*$SZ;
1461use integer;
1462
1463sub bodyx_00_15 () {
1464 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
1465 (
1466 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1467
1468 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1469 '&and ($a4,$e)', # f&e
1470 '&rorx ($a0,$e,$Sigma1[2])',
1471 '&rorx ($a2,$e,$Sigma1[1])',
1472
1473 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1474 '&lea ($h,"($h,$a4)")',
1475 '&andn ($a4,$e,$g)', # ~e&g
1476 '&xor ($a0,$a2)',
1477
1478 '&rorx ($a1,$e,$Sigma1[0])',
1479 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1480 '&xor ($a0,$a1)', # Sigma1(e)
1481 '&mov ($a2,$a)',
1482
1483 '&rorx ($a4,$a,$Sigma0[2])',
1484 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1485 '&xor ($a2,$b)', # a^b, b^c in next round
1486 '&rorx ($a1,$a,$Sigma0[1])',
1487
1488 '&rorx ($a0,$a,$Sigma0[0])',
1489 '&lea ($d,"($d,$h)")', # d+=h
1490 '&and ($a3,$a2)', # (b^c)&(a^b)
1491 '&xor ($a1,$a4)',
1492
1493 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1494 '&xor ($a1,$a0)', # Sigma0(a)
1495 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1496 '&mov ($a4,$e)', # copy of f in future
1497
1498 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1499 );
1500 # and at the finish one has to $a+=$a1
1501}
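# bodyx_00_15 is the BMI1/BMI2 flavour of the round: rorx gives
# non-destructive rotates that leave the flags alone, andn computes ~e&g
# so that Ch(e,f,g) is formed as (e&f)+(~e&g) (the two terms are bitwise
# disjoint), and lea folds the additions. As the comment above notes, the
# previous round's h+=Sigma0(a) is carried in via $a1 and folded into the
# current round.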
1502
1503$code.=<<___;
1504.type ${func}_avx2,\@function,3
1505.align 64
1506${func}_avx2:
1507.cfi_startproc
1508.Lavx2_shortcut:
1509 mov %rsp,%rax # copy %rsp
1510.cfi_def_cfa_register %rax
1511 push %rbx
1512.cfi_push %rbx
1513 push %rbp
1514.cfi_push %rbp
1515 push %r12
1516.cfi_push %r12
1517 push %r13
1518.cfi_push %r13
1519 push %r14
1520.cfi_push %r14
1521 push %r15
1522.cfi_push %r15
1523 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1524 shl \$4,%rdx # num*16
1525 and \$-256*$SZ,%rsp # align stack frame
1526 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1527 add \$`2*$SZ*($rounds-8)`,%rsp
1528 mov $ctx,$_ctx # save ctx, 1st arg
1529 mov $inp,$_inp # save inp, 2nd arg
1530 mov %rdx,$_end # save end pointer, "3rd" arg
1531 mov %rax,$_rsp # save copy of %rsp
1532.cfi_cfa_expression $_rsp,deref,+8
1533___
1534$code.=<<___ if ($win64);
1535 movaps %xmm6,16*$SZ+32(%rsp)
1536 movaps %xmm7,16*$SZ+48(%rsp)
1537 movaps %xmm8,16*$SZ+64(%rsp)
1538 movaps %xmm9,16*$SZ+80(%rsp)
1539___
1540$code.=<<___ if ($win64 && $SZ>4);
1541 movaps %xmm10,16*$SZ+96(%rsp)
1542 movaps %xmm11,16*$SZ+112(%rsp)
1543___
1544$code.=<<___;
1545.Lprologue_avx2:
1546
1547 vzeroupper
1548 sub \$-16*$SZ,$inp # inp++, size optimization
1549 mov $SZ*0($ctx),$A
1550 mov $inp,%r12 # borrow $T1
1551 mov $SZ*1($ctx),$B
1552 cmp %rdx,$inp # $_end
1553 mov $SZ*2($ctx),$C
1554 cmove %rsp,%r12 # next block or random data
1555 mov $SZ*3($ctx),$D
1556 mov $SZ*4($ctx),$E
1557 mov $SZ*5($ctx),$F
1558 mov $SZ*6($ctx),$G
1559 mov $SZ*7($ctx),$H
1560___
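# Per the header notes, the AVX2 path works on two input blocks at once:
# each 128-bit load below is combined via vinserti128 with data at %r12
# (the following block, or %rsp as harmless filler when there is no second
# block, per the "next block or random data" comment above), and the
# second block is processed later by the .Lower_avx2 loop from the W+K
# values saved on the stack.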
1561 if ($SZ==4) { # SHA256
1562 my @X = map("%ymm$_",(0..3));
1563 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1564
1565$code.=<<___;
1566 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1567 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1568 jmp .Loop_avx2
1569.align 16
1570.Loop_avx2:
1571 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1572 vmovdqu -16*$SZ+0($inp),%xmm0
1573 vmovdqu -16*$SZ+16($inp),%xmm1
1574 vmovdqu -16*$SZ+32($inp),%xmm2
1575 vmovdqu -16*$SZ+48($inp),%xmm3
1576 #mov $inp,$_inp # offload $inp
1577 vinserti128 \$1,(%r12),@X[0],@X[0]
1578 vinserti128 \$1,16(%r12),@X[1],@X[1]
1579 vpshufb $t3,@X[0],@X[0]
1580 vinserti128 \$1,32(%r12),@X[2],@X[2]
1581 vpshufb $t3,@X[1],@X[1]
1582 vinserti128 \$1,48(%r12),@X[3],@X[3]
1583
1584 lea $TABLE(%rip),$Tbl
1585 vpshufb $t3,@X[2],@X[2]
1586 vpaddd 0x00($Tbl),@X[0],$t0
1587 vpshufb $t3,@X[3],@X[3]
1588 vpaddd 0x20($Tbl),@X[1],$t1
1589 vpaddd 0x40($Tbl),@X[2],$t2
1590 vpaddd 0x60($Tbl),@X[3],$t3
1591 vmovdqa $t0,0x00(%rsp)
1592 xor $a1,$a1
1593 vmovdqa $t1,0x20(%rsp)
1594 lea -$PUSH8(%rsp),%rsp
1595 mov $B,$a3
1596 vmovdqa $t2,0x00(%rsp)
1597 xor $C,$a3 # magic
1598 vmovdqa $t3,0x20(%rsp)
1599 mov $F,$a4
1600 sub \$-16*2*$SZ,$Tbl # size optimization
1601 jmp .Lavx2_00_47
1602
1603.align 16
1604.Lavx2_00_47:
1605___
1606
1607sub AVX2_256_00_47 () {
1608my $j = shift;
1609my $body = shift;
1610my @X = @_;
1611my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1612my $base = "+2*$PUSH8(%rsp)";
1613
1614 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1615 foreach (Xupdate_256_AVX()) { # 29 instructions
1616 eval;
1617 eval(shift(@insns));
1618 eval(shift(@insns));
1619 eval(shift(@insns));
1620 }
1621 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1622 foreach (@insns) { eval; } # remaining instructions
1623 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1624}

	for ($i=0,$j=0; $j<4; $j++) {
	    &AVX2_256_00_47($j,\&bodyx_00_15,@X);
	    push(@X,shift(@X));			# rotate(@X)
	}
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	for ($i=0; $i<16; ) {
	    my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	    foreach(bodyx_00_15()) { eval; }
	}
	} else {	# SHA512
	    my @X = map("%ymm$_",(0..7));
	    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
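# The SHA-512 flavour interleaves only two round bodies (48 instructions) per
# Xupdate_512_AVX() pass and, per the regex above, skips the interleave after
# schedule entries that end in ';'; whatever remains in @insns is emitted by
# the trailing foreach.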

	for ($i=0,$j=0; $j<8; $j++) {
	    &AVX2_512_00_47($j,\&bodyx_00_15,@X);
	    push(@X,shift(@X));			# rotate(@X)
	}
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	for ($i=0; $i<16; ) {
	    my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	    foreach(bodyx_00_15()) { eval; }
	}
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
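# .Lower_avx2 replays the rounds for the second interleaved block: its
# pre-computed schedule lives in the upper 16 bytes of each 32-byte stack
# slot, hence $base of "+16($Tbl)", while $Tbl walks back down towards %rsp
# in $PUSH8 steps.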
	for ($i=0; $i<8; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	}
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

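	# ${func}_avx2 aligns its frame to a 256*$SZ boundary and keeps
	# lowering %rsp inside the loop, so recover a stable frame pointer by
	# re-applying that alignment and stepping back over the stacked
	# message schedule before pulling $_rsp below.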
	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# xmm save area (xmm6..xmm9, or xmm6..xmm11 for SHA512)
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

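# Windows SEH tables: every .pdata entry below is a RUNTIME_FUNCTION triplet
# of RVAs (function start, function end, unwind info), and the matching
# .xdata records register se_handler/shaext_handler as the language-specific
# handler, with the prologue/epilogue labels as their HandlerData[].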
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1" => 0xcc,
		"sha256msg2" => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}

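# Illustration only (guarded so it never runs): how a SHA-NI mnemonic is
# turned into raw opcode bytes for assemblers that predate the extension.
if (0) {
	my $bytes = sha256op38("sha256rnds2","%xmm0,%xmm1");
	# $bytes is ".byte\t15,56,203,200", i.e. 0f 38 cb c8, whose ModR/M
	# byte 0xc8 encodes mod=11, reg=%xmm1, rm=%xmm0.
}
# The output loop below folds `...` expressions at generation time and
# rewrites sha256* mnemonics through sha256op38() on the way to stdout.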
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;