blob: 944248febd7ac01fc0afe53189b2a606467354b1 [file] [log] [blame]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block procedure for ARMv4.
11#
12# January 2007.
13
14# Size/performance trade-off
15# ====================================================================
16# impl size in bytes comp cycles[*] measured performance
17# ====================================================================
18# thumb 304 3212 4420
19# armv4-small 392/+29% 1958/+64% 2250/+96%
20# armv4-compact 740/+89% 1552/+26% 1840/+22%
21# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
22# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
23# ====================================================================
24# thumb = same as 'small' but in Thumb instructions[**] and
25# with recurring code in two private functions;
26# small = detached Xload/update, loops are folded;
27# compact = detached Xload/update, 5x unroll;
28# large = interleaved Xload/update, 5x unroll;
29# full unroll = interleaved Xload/update, full unroll, estimated[!];
30#
31# [*] Manually counted instructions in "grand" loop body. Measured
32# performance is affected by prologue and epilogue overhead,
33# i-cache availability, branch penalties, etc.
34# [**] While each Thumb instruction is half the size, they are not as
35# diverse as ARM ones: e.g., there are only two arithmetic
36# instructions with 3 arguments, no [fixed] rotate, addressing
37# modes are limited. As a result it takes more instructions to do
38# the same job in Thumb, therefore the code is never twice as
39# small and always slower.
40# [***] which is also ~35% better than compiler generated code. Dual-
41# issue Cortex A8 core was measured to process input block in
42# ~990 cycles.
43
44# August 2010.
45#
46# Rescheduling for dual-issue pipeline resulted in 13% improvement on
47# Cortex A8 core and in absolute terms ~870 cycles per input block
48# [or 13.6 cycles per byte].
49
50# February 2011.
51#
52# Profiler-assisted and platform-specific optimization resulted in 10%
53# improvement on Cortex A8 core and 12.2 cycles per byte.
54
55# September 2013.
56#
57# Add NEON implementation (see sha1-586.pl for background info). On
58# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
59# faster than integer-only code. Because [fully unrolled] NEON code
60# is ~2.5x larger and there are some redundant instructions executed
61# when processing last block, improvement is not as big for smallest
62# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
Adam Langleye9ada862015-05-11 17:20:37 -070063# byte, which is also >80% faster than integer-only code. Cortex-A15
64# is even faster spending 5.6 cycles per byte outperforming integer-
65# only code by factor of 2.
Adam Langleyd9e397b2015-01-22 14:27:53 -080066
67# May 2014.
68#
69# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
70
# Command line: [flavour] [output-file]
#
# The first argument is taken as the perlasm flavour unless it already
# looks like a file name (something with an extension), in which case it
# is the output file and there is no flavour.  Otherwise arguments are
# consumed until one that looks like a file name is found.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script first, then in the
    # shared perlasm directory.  $dir is empty when $0 has no directory
    # component.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Pipe everything printed to STDOUT through the translator.  Paths
    # are quoted so that directories containing spaces survive the
    # shell; the output argument is only passed when one was given.
    my $cmd = "\"$^X\" \"$xlate\" $flavour";
    $cmd .= " \"$output\"" if (defined($output) && length($output));
    open STDOUT, '|-', $cmd or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # No translation requested: write the generated code straight to the
    # output file.  Without an output file STDOUT is left untouched.
    open STDOUT, '>', $output or die "can't open $output: $!";
}
Adam Langleyd9e397b2015-01-22 14:27:53 -080085
# Register allocation for the integer-only code path.  These are package
# globals because the round generators below interpolate them into the
# emitted assembly.
($ctx,$inp,$len)    = map("r$_",(0..2));	# context pointer, input pointer, length
($a,$b,$c,$d,$e)    = map("r$_",(3..7));	# SHA-1 working variables
$K                  = "r8";			# current round constant
($t0,$t1,$t2,$t3)   = map("r$_",(9..12));	# temporaries
$Xi                 = "r14";			# pointer into the on-stack schedule
@V                  = ($a,$b,$c,$d,$e);		# rotated after every round
101
# Xupdate(a,b,c,d,e[,opt1[,opt2]])
#
# Appends to $code the part of a round that is shared by BODY_16_19,
# BODY_20_39 and BODY_40_59: it loads four earlier schedule words
# relative to $Xi, XORs them together, rotates the result (ror#31, i.e.
# a left rotate by one), stores it with pre-decrement of $Xi and adds it
# to E.  K and ROR(A,27) are added to E along the way, and C^D is left
# in $t1 for the caller's round function.
#
# a..e        register names in their current rotation
# opt1, opt2  optional instruction strings spliced verbatim into the
#             stream right after the store (start of the round function)
102sub Xupdate {
103my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
104$code.=<<___;
105 ldr $t0,[$Xi,#15*4]
106 ldr $t1,[$Xi,#13*4]
107 ldr $t2,[$Xi,#7*4]
108 add $e,$K,$e,ror#2 @ E+=K_xx_xx
109 ldr $t3,[$Xi,#2*4]
110 eor $t0,$t0,$t1
111 eor $t2,$t2,$t3 @ 1 cycle stall
112 eor $t1,$c,$d @ F_xx_xx
113 mov $t0,$t0,ror#31
114 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
115 eor $t0,$t0,$t2,ror#31
116 str $t0,[$Xi,#-4]!
117 $opt1 @ F_xx_xx
118 $opt2 @ F_xx_xx
119 add $e,$e,$t0 @ E+=X[i]
120___
121}
122
# BODY_00_15(a,b,c,d,e)
#
# Appends to $code one round that takes its message word directly from
# the input.  When __ARM_ARCH__<7 the word is assembled from four byte
# loads; otherwise a single word load is used, followed by a byte swap
# on little-endian builds (__ARMEL__).  The word is pushed onto the
# schedule with a pre-decrement store through $Xi, and E accumulates K,
# ROR(A,27), the word and F_00_19(B,C,D).
123sub BODY_00_15 {
124my ($a,$b,$c,$d,$e)=@_;
125$code.=<<___;
126#if __ARM_ARCH__<7
127 ldrb $t1,[$inp,#2]
128 ldrb $t0,[$inp,#3]
129 ldrb $t2,[$inp,#1]
130 add $e,$K,$e,ror#2 @ E+=K_00_19
131 ldrb $t3,[$inp],#4
132 orr $t0,$t0,$t1,lsl#8
133 eor $t1,$c,$d @ F_xx_xx
134 orr $t0,$t0,$t2,lsl#16
135 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
136 orr $t0,$t0,$t3,lsl#24
137#else
138 ldr $t0,[$inp],#4 @ handles unaligned
139 add $e,$K,$e,ror#2 @ E+=K_00_19
140 eor $t1,$c,$d @ F_xx_xx
141 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
142#ifdef __ARMEL__
143 rev $t0,$t0 @ byte swap
144#endif
145#endif
146 and $t1,$b,$t1,ror#2
147 add $e,$e,$t0 @ E+=X[i]
148 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
149 str $t0,[$Xi,#-4]!
150 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
151___
152}
153
# BODY_16_19(a,b,c,d,e)
#
# One round whose message word comes from the schedule update: Xupdate
# emits the common part with the "and" that starts F_00_19 spliced in,
# then the remaining two instructions of F_00_19 are appended here.
154sub BODY_16_19 {
155my ($a,$b,$c,$d,$e)=@_;
156 &Xupdate(@_,"and $t1,$b,$t1,ror#2");
157$code.=<<___;
158 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
159 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
160___
161}
162
# BODY_20_39(a,b,c,d,e)
#
# One round using the XOR round function: Xupdate emits the common part
# with "eor $t1,$b,..." spliced in (completing B^C^D in $t1), and the
# result is added to E here.  The emitted loop is also reused for the
# last twenty rounds (see the .L_20_39_or_60_79 label).
163sub BODY_20_39 {
164my ($a,$b,$c,$d,$e)=@_;
165 &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
166$code.=<<___;
167 add $e,$e,$t1 @ E+=F_20_39(B,C,D)
168___
169}
170
# BODY_40_59(a,b,c,d,e)
#
# One round of F_40_59.  Two instructions are spliced into Xupdate:
# B&(C^D) into $t1 and C&D into $t2; both partial results are then
# added to E here.
171sub BODY_40_59 {
172my ($a,$b,$c,$d,$e)=@_;
173 &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
174$code.=<<___;
175 add $e,$e,$t1 @ E+=F_40_59(B,C,D)
176 add $e,$e,$t2,ror#2
177___
178}
179
# Integer-only implementation of sha1_block_data_order.  When built with
# __ARM_MAX_ARCH__>=7 the entry point first reads OPENSSL_armcap_P and
# branches to .LARMv8 or .LNEON if the corresponding capability bit is
# set; otherwise it falls through to the code emitted below.
180$code=<<___;
Kenny Rootb8494592015-09-25 02:29:14 +0000181#include <openssl/arm_arch.h>
Adam Langleyd9e397b2015-01-22 14:27:53 -0800182
183.text
184.code 32
185
186.global sha1_block_data_order
187.type sha1_block_data_order,%function
188
189.align 5
190sha1_block_data_order:
191#if __ARM_MAX_ARCH__>=7
192 sub r3,pc,#8 @ sha1_block_data_order
193 ldr r12,.LOPENSSL_armcap
194 ldr r12,[r3,r12] @ OPENSSL_armcap_P
Adam Langleye9ada862015-05-11 17:20:37 -0700195#ifdef __APPLE__
196 ldr r12,[r12]
197#endif
Adam Langleyd9e397b2015-01-22 14:27:53 -0800198 tst r12,#ARMV8_SHA1
199 bne .LARMv8
200 tst r12,#ARMV7_NEON
201 bne .LNEON
202#endif
203 stmdb sp!,{r4-r12,lr}
204 add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
205 ldmia $ctx,{$a,$b,$c,$d,$e}
206.Lloop:
207 ldr $K,.LK_00_19
208 mov $Xi,sp
209 sub sp,sp,#15*4
210 mov $c,$c,ror#30
211 mov $d,$d,ror#30
212 mov $e,$e,ror#30 @ [6]
213.L_00_15:
214___
# Body of the .L_00_15 loop: five rounds, rotating @V after each one so
# that the register roles shift.  The emitted loop repeats until $Xi
# reaches sp, which was lowered by 15 words above.
215for($i=0;$i<5;$i++) {
216 &BODY_00_15(@V); unshift(@V,pop(@V));
217}
218$code.=<<___;
219 teq $Xi,sp
220 bne .L_00_15 @ [((11+4)*5+2)*3]
221 sub sp,sp,#25*4
222___
# Straight-line tail of the first twenty rounds: the last round that
# reads the input directly, then four rounds fed by the schedule update.
223 &BODY_00_15(@V); unshift(@V,pop(@V));
224 &BODY_16_19(@V); unshift(@V,pop(@V));
225 &BODY_16_19(@V); unshift(@V,pop(@V));
226 &BODY_16_19(@V); unshift(@V,pop(@V));
227 &BODY_16_19(@V); unshift(@V,pop(@V));
228$code.=<<___;
229
230 ldr $K,.LK_20_39 @ [+15+16*4]
231 cmn sp,#0 @ [+3], clear carry to denote 20_39
232.L_20_39_or_60_79:
233___
# Loop body shared by rounds 20..39 and 60..79; the carry flag set up
# before entering the loop tells the two uses apart at the exit.
234for($i=0;$i<5;$i++) {
235 &BODY_20_39(@V); unshift(@V,pop(@V));
236}
237$code.=<<___;
238 teq $Xi,sp @ preserve carry
239 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
240 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
241
242 ldr $K,.LK_40_59
243 sub sp,sp,#20*4 @ [+2]
244.L_40_59:
245___
# Loop body for rounds 40..59.
246for($i=0;$i<5;$i++) {
247 &BODY_40_59(@V); unshift(@V,pop(@V));
248}
249$code.=<<___;
250 teq $Xi,sp
251 bne .L_40_59 @ [+((12+5)*5+2)*4]
252
253 ldr $K,.LK_60_79
254 sub sp,sp,#20*4
255 cmp sp,#0 @ set carry to denote 60_79
256 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
257.L_done:
258 add sp,sp,#80*4 @ "deallocate" stack frame
259 ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
260 add $a,$K,$a
261 add $b,$t0,$b
262 add $c,$t1,$c,ror#2
263 add $d,$t2,$d,ror#2
264 add $e,$t3,$e,ror#2
265 stmia $ctx,{$a,$b,$c,$d,$e}
266 teq $inp,$len
267 bne .Lloop @ [+18], total 1307
268
269#if __ARM_ARCH__>=5
270 ldmia sp!,{r4-r12,pc}
271#else
272 ldmia sp!,{r4-r12,lr}
273 tst lr,#1
274 moveq pc,lr @ be binary compatible with V4, yet
275 bx lr @ interoperable with Thumb ISA:-)
276#endif
277.size sha1_block_data_order,.-sha1_block_data_order
278
279.align 5
280.LK_00_19: .word 0x5a827999
281.LK_20_39: .word 0x6ed9eba1
282.LK_40_59: .word 0x8f1bbcdc
283.LK_60_79: .word 0xca62c1d6
284#if __ARM_MAX_ARCH__>=7
285.LOPENSSL_armcap:
286.word OPENSSL_armcap_P-sha1_block_data_order
287#endif
288.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
289.align 5
290___
291#####################################################################
292# NEON stuff
293#
# Lexical register allocation for the NEON code path; these shadow the
# globals of the same name used by the integer-only path.
#   @V                  working variables, copied from the global $a..$e
#   $K_XX_XX..$saved_sp r8, r9, r10, r11, r12 and r14 respectively
#   $Xi                 generator-side counter (not a register here)
#   @X                  eight q registers holding schedule words
#   @Tx                 two q temporaries
#   $K, $zero           q14 and q15
#   $j                  round counter used by the body_* generators
294{{{
295my @V=($a,$b,$c,$d,$e);
296my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
297my $Xi=4;
298my @X=map("q$_",(8..11,0..3));
299my @Tx=("q12","q13");
300my ($K,$zero)=("q14","q15");
301my $j=0;
302
# Thunk for [simplified] x86-style perlasm: any call to an undefined sub
# such as &vadd_i32(...) appends one instruction line to $code.  The sub
# name becomes the mnemonic (first underscore turned into a dot) and the
# arguments become the comma-separated operands; a purely numeric last
# operand is emitted as an immediate with a leading '#'.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier
    $mnemonic =~ s/_/\./;

    my @operands = @_;
    my $last = pop(@operands);
    if ($last*1 eq $last) {
	$last = "#$last";
    }

    $code .= "\t$mnemonic\t" . join(',', @operands, $last) . "\n";
}
309
# Returns the scalar instructions of one F_00_19 round as a list of Perl
# source strings.  The callers eval them one at a time, interleaved with
# NEON instructions; each string emits an instruction through AUTOLOAD.
# The first string re-binds $a..$e from @V, the last one advances $j and
# rotates @V.  $Ki is reloaded from the stack slot of the next round.
310sub body_00_19 () {
311 (
312 '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
313 '&bic ($t0,$d,$b)',
314 '&add ($e,$e,$Ki)', # e+=X[i]+K
315 '&and ($t1,$c,$b)',
316 '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
317 '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
318 '&eor ($t1,$t1,$t0)', # F_00_19
319 '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
320 '&add ($e,$e,$t1);'. # e+=F_00_19
321 '$j++; unshift(@V,pop(@V));'
322 )
323}
# Returns the scalar instructions of one F_20_39 round (B^C^D) as a list
# of Perl source strings to be eval'ed by the callers.  The reload of
# $Ki for the next round is skipped once $j reaches 79.
324sub body_20_39 () {
325 (
326 '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
327 '&eor ($t0,$b,$d)',
328 '&add ($e,$e,$Ki)', # e+=X[i]+K
329 '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
330 '&eor ($t1,$t0,$c)', # F_20_39
331 '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
332 '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
333 '&add ($e,$e,$t1);'. # e+=F_20_39
334 '$j++; unshift(@V,pop(@V));'
335 )
336}
# Returns the scalar instructions of one F_40_59 round as a list of Perl
# source strings to be eval'ed by the callers.  The round function is
# accumulated into E in two parts: C&D and (C^D)&B.
337sub body_40_59 () {
338 (
339 '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
340 '&add ($e,$e,$Ki)', # e+=X[i]+K
341 '&and ($t0,$c,$d)',
342 '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
343 '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
344 '&eor ($t1,$c,$d)',
345 '&add ($e,$e,$t0)',
346 '&and ($t1,$t1,$b)',
347 '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
348 '&add ($e,$e,$t1);'. # e+=F_40_59
349 '$j++; unshift(@V,pop(@V));'
350 )
351}
352
# Xupdate_16_31(\&body)
#
# Emits the NEON instructions that compute the next four schedule words
# (the variant used for the first four calls), interleaved with the
# scalar instructions of four rounds.  $body is called four times to
# obtain the instruction strings, which are eval'ed a few at a time
# between the NEON instructions; whatever is left is flushed at the end.
# The previous vector plus K is stored through $Xfer for the scalar
# rounds to pick up, and a new K is loaded when $Xi is a multiple of 5.
# Finally $Xi is incremented and @X is rotated by one.
353sub Xupdate_16_31 ()
354{ use integer;
355 my $body = shift;
356 my @insns = (&$body,&$body,&$body,&$body);
357 my ($a,$b,$c,$d,$e);
358
359 &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
360 eval(shift(@insns));
361 eval(shift(@insns));
362 eval(shift(@insns));
363 &vadd_i32 (@Tx[1],@X[-1&7],$K);
364 eval(shift(@insns));
365 &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
366 eval(shift(@insns));
367 &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
368 eval(shift(@insns));
369 eval(shift(@insns));
370 eval(shift(@insns));
371 &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
372 eval(shift(@insns));
373 eval(shift(@insns));
374 &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
375 eval(shift(@insns));
376 eval(shift(@insns));
377 &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
378 eval(shift(@insns));
379 eval(shift(@insns));
380 &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
381 &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
382 eval(shift(@insns));
383 eval(shift(@insns));
384 &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
385 eval(shift(@insns));
386 eval(shift(@insns));
387 &vadd_i32 (@X[0],@Tx[0],@Tx[0]);
388 eval(shift(@insns));
389 eval(shift(@insns));
390 &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
391 eval(shift(@insns));
392 eval(shift(@insns));
393 eval(shift(@insns));
394 &vshr_u32 (@Tx[0],@Tx[1],30);
395 eval(shift(@insns));
396 eval(shift(@insns));
397 &vshl_u32 (@Tx[1],@Tx[1],2);
398 eval(shift(@insns));
399 eval(shift(@insns));
400 &veor (@X[0],@X[0],@Tx[0]);
401 eval(shift(@insns));
402 eval(shift(@insns));
403 &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
404
405 foreach (@insns) { eval; } # remaining instructions [if any]
406
407 $Xi++; push(@X,shift(@X)); # "rotate" X[]
408}
409
# Xupdate_32_79(\&body)
#
# Same interleaving scheme as Xupdate_16_31, but with the shorter NEON
# sequence used for the later schedule words: three XORs followed by a
# rotate-left-by-2 built from vshr/vsli.  $body is called four times and
# its instruction strings are eval'ed between the NEON instructions.
# The previous vector plus K is stored through $Xfer, a new K is loaded
# when $Xi is a multiple of 5, then $Xi is incremented and @X rotated.
410sub Xupdate_32_79 ()
411{ use integer;
412 my $body = shift;
413 my @insns = (&$body,&$body,&$body,&$body);
414 my ($a,$b,$c,$d,$e);
415
416 &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
417 eval(shift(@insns));
418 eval(shift(@insns));
419 eval(shift(@insns));
420 &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
421 eval(shift(@insns));
422 eval(shift(@insns));
423 &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
424 eval(shift(@insns));
425 eval(shift(@insns));
426 &vadd_i32 (@Tx[1],@X[-1&7],$K);
427 eval(shift(@insns));
428 &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
429 eval(shift(@insns));
430 &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
431 eval(shift(@insns));
432 eval(shift(@insns));
433 &vshr_u32 (@X[0],@Tx[0],30);
434 eval(shift(@insns));
435 eval(shift(@insns));
436 &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
437 &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
438 eval(shift(@insns));
439 eval(shift(@insns));
440 &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
441
442 foreach (@insns) { eval; } # remaining instructions [if any]
443
444 $Xi++; push(@X,shift(@X)); # "rotate" X[]
445}
446
# Xuplast_80(\&body)
#
# Emits the last X+K transfer of a block and, interleaved with four
# scalar rounds, starts on the next block: the input is loaded into the
# first four @X registers, the constant pointer $K_XX_XX is rewound by
# 16 bytes and K_00_19 is reloaded.  When $inp has reached $len the
# input pointer is stepped back by 64 so the load re-reads the current
# block instead of running past the end.  The teq result is left in the
# flags for the conditional instructions emitted after the last rounds.
# Resets $Xi to 0 for the Xloop calls that follow.
447sub Xuplast_80 ()
448{ use integer;
449 my $body = shift;
450 my @insns = (&$body,&$body,&$body,&$body);
451 my ($a,$b,$c,$d,$e);
452
453 &vadd_i32 (@Tx[1],@X[-1&7],$K);
454 eval(shift(@insns));
455 eval(shift(@insns));
456 &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
457 &sub ($Xfer,$Xfer,64);
458
459 &teq ($inp,$len);
460 &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
461 &subeq ($inp,$inp,64); # reload last block to avoid SEGV
462 &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
463 eval(shift(@insns));
464 eval(shift(@insns));
465 &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
466 eval(shift(@insns));
467 eval(shift(@insns));
468 &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
469 eval(shift(@insns));
470 eval(shift(@insns));
471 &vrev32_8 (@X[-4&7],@X[-4&7]);
472
473 foreach (@insns) { eval; } # remaining instructions
474
475 $Xi=0;
476}
477
# Xloop(\&body)
#
# Prepares one vector of the already-loaded next block while four scalar
# rounds of the current block are emitted: byte-swaps the following
# input vector, adds K to the current one and stores the sum through
# $Xfer.  Which @X registers are touched is selected by $Xi, which is
# incremented at the end.
478sub Xloop()
479{ use integer;
480 my $body = shift;
481 my @insns = (&$body,&$body,&$body,&$body);
482 my ($a,$b,$c,$d,$e);
483
484 &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
485 eval(shift(@insns));
486 eval(shift(@insns));
487 &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
488 eval(shift(@insns));
489 eval(shift(@insns));
490 &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
491
492 foreach (@insns) { eval; }
493
494 $Xi++;
495}
496
# NEON entry point (.LNEON, reached from sha1_block_data_order).  The
# prologue allocates a 16-byte-aligned 64-byte transfer area on the
# stack, loads and byte-swaps the first block and stores the first three
# X+K vectors for the scalar rounds.
497$code.=<<___;
498#if __ARM_MAX_ARCH__>=7
499.arch armv7-a
500.fpu neon
501
502.type sha1_block_data_order_neon,%function
503.align 4
504sha1_block_data_order_neon:
505.LNEON:
506 stmdb sp!,{r4-r12,lr}
507 add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
508 @ dmb @ errata #451034 on early Cortex A8
509 @ vstmdb sp!,{d8-d15} @ ABI specification says so
510 mov $saved_sp,sp
511 sub sp,sp,#64 @ alloca
512 adr $K_XX_XX,.LK_00_19
513 bic sp,sp,#15 @ align for 128-bit stores
514
515 ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
516 mov $Xfer,sp
517
518 vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
519 veor $zero,$zero,$zero
520 vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
521 vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
522 vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
523 vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
524 vrev32.8 @X[-2&7],@X[-2&7]
525 vadd.i32 @X[0],@X[-4&7],$K
526 vrev32.8 @X[-1&7],@X[-1&7]
527 vadd.i32 @X[1],@X[-3&7],$K
528 vst1.32 {@X[0]},[$Xfer,:128]!
529 vadd.i32 @X[2],@X[-2&7],$K
530 vst1.32 {@X[1]},[$Xfer,:128]!
531 vst1.32 {@X[2]},[$Xfer,:128]!
532 ldr $Ki,[sp] @ big RAW stall
533
534.Loop_neon:
535___
# Twenty generator calls, each emitting four scalar rounds: five calls
# per round function, with body_20_39 reused for the final twenty
# rounds.  The last four calls (Xuplast_80 and Xloop) also start
# processing the next input block.
536 &Xupdate_16_31(\&body_00_19);
537 &Xupdate_16_31(\&body_00_19);
538 &Xupdate_16_31(\&body_00_19);
539 &Xupdate_16_31(\&body_00_19);
540 &Xupdate_32_79(\&body_00_19);
541 &Xupdate_32_79(\&body_20_39);
542 &Xupdate_32_79(\&body_20_39);
543 &Xupdate_32_79(\&body_20_39);
544 &Xupdate_32_79(\&body_20_39);
545 &Xupdate_32_79(\&body_20_39);
546 &Xupdate_32_79(\&body_40_59);
547 &Xupdate_32_79(\&body_40_59);
548 &Xupdate_32_79(\&body_40_59);
549 &Xupdate_32_79(\&body_40_59);
550 &Xupdate_32_79(\&body_40_59);
551 &Xupdate_32_79(\&body_20_39);
552 &Xuplast_80(\&body_20_39);
553 &Xloop(\&body_20_39);
554 &Xloop(\&body_20_39);
555 &Xloop(\&body_20_39);
# Epilogue: add the working variables into the context.  The eq/ne
# conditions below use the flags left by the teq emitted in Xuplast_80.
556$code.=<<___;
557 ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
558 add $a,$a,$Ki
559 ldr $Ki,[$ctx,#16]
560 add $b,$b,$t0
561 add $c,$c,$t1
562 add $d,$d,$Xfer
563 moveq sp,$saved_sp
564 add $e,$e,$Ki
565 ldrne $Ki,[sp]
566 stmia $ctx,{$a,$b,$c,$d,$e}
567 addne $Xfer,sp,#3*16
568 bne .Loop_neon
569
570 @ vldmia sp!,{d8-d15}
571 ldmia sp!,{r4-r12,pc}
572.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
573#endif
574___
575}}}
576#####################################################################
577# ARMv8 stuff
578#
# ARMv8 code path (.LARMv8, reached from sha1_block_data_order).  The
# sha1* mnemonics written here are replaced by .byte sequences in the
# output loop at the end of the file (see unsha1).
#   $ABCD,$E,$E0,$E1       q0..q3
#   @MSG                   q4..q7, rotated as the schedule advances
#   @Kxx                   q8..q11, one register per round constant
#   $W0,$W1,$ABCD_SAVE     q12..q14
579{{{
580my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
581my @MSG=map("q$_",(4..7));
582my @Kxx=map("q$_",(8..11));
583my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
584
# NOTE(review): the "@Kxx[$j]" near the end of this heredoc is expanded
# before the loop below assigns $j, and the "my $j" of the NEON section
# is out of scope here, so it relies on the global $j being unset and
# resolves to @Kxx[0].
585$code.=<<___;
586#if __ARM_MAX_ARCH__>=7
587.type sha1_block_data_order_armv8,%function
588.align 5
589sha1_block_data_order_armv8:
590.LARMv8:
591 vstmdb sp!,{d8-d15} @ ABI specification says so
592
593 veor $E,$E,$E
594 adr r3,.LK_00_19
595 vld1.32 {$ABCD},[$ctx]!
596 vld1.32 {$E\[0]},[$ctx]
597 sub $ctx,$ctx,#16
598 vld1.32 {@Kxx[0]\[]},[r3,:32]!
599 vld1.32 {@Kxx[1]\[]},[r3,:32]!
600 vld1.32 {@Kxx[2]\[]},[r3,:32]!
601 vld1.32 {@Kxx[3]\[]},[r3,:32]
602
603.Loop_v8:
604 vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
605 vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
606 vrev32.8 @MSG[0],@MSG[0]
607 vrev32.8 @MSG[1],@MSG[1]
608
609 vadd.i32 $W0,@Kxx[0],@MSG[0]
610 vrev32.8 @MSG[2],@MSG[2]
611 vmov $ABCD_SAVE,$ABCD @ offload
612 subs $len,$len,#1
613
614 vadd.i32 $W1,@Kxx[0],@MSG[1]
615 vrev32.8 @MSG[3],@MSG[3]
616 sha1h $E1,$ABCD @ 0
617 sha1c $ABCD,$E,$W0
618 vadd.i32 $W0,@Kxx[$j],@MSG[2]
619 sha1su0 @MSG[0],@MSG[1],@MSG[2]
620___
# Steps 1..16.  $f picks sha1c/sha1p/sha1m/sha1p from $i/5, $j advances
# to the next constant register whenever ($i+3) is a multiple of 5, and
# the E, W and MSG registers swap/rotate after every step.
621for ($j=0,$i=1;$i<20-3;$i++) {
622my $f=("c","p","m","p")[$i/5];
623$code.=<<___;
624 sha1h $E0,$ABCD @ $i
625 sha1$f $ABCD,$E1,$W1
626 vadd.i32 $W1,@Kxx[$j],@MSG[3]
627 sha1su1 @MSG[0],@MSG[3]
628___
629$code.=<<___ if ($i<20-4);
630 sha1su0 @MSG[1],@MSG[2],@MSG[3]
631___
632 ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
633 push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
634}
# Final three steps (no further schedule updates), accumulation into the
# saved state, loop back while blocks remain, then store the context.
635$code.=<<___;
636 sha1h $E0,$ABCD @ $i
637 sha1p $ABCD,$E1,$W1
638 vadd.i32 $W1,@Kxx[$j],@MSG[3]
639
640 sha1h $E1,$ABCD @ 18
641 sha1p $ABCD,$E0,$W0
642
643 sha1h $E0,$ABCD @ 19
644 sha1p $ABCD,$E1,$W1
645
646 vadd.i32 $E,$E,$E0
647 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
648 bne .Loop_v8
649
650 vst1.32 {$ABCD},[$ctx]!
651 vst1.32 {$E\[0]},[$ctx]
652
653 vldmia sp!,{d8-d15}
654 ret @ bx lr
655.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
656#endif
657___
658}}}
# Declare the capability word that the entry point reads through
# .LOPENSSL_armcap; only emitted for __ARM_MAX_ARCH__>=7 builds.
659$code.=<<___;
660#if __ARM_MAX_ARCH__>=7
661.comm OPENSSL_armcap_P,4,4
Adam Langley13066f12015-02-13 14:47:35 -0800662.hidden OPENSSL_armcap_P
Adam Langleyd9e397b2015-01-22 14:27:53 -0800663#endif
664___
665
{   # Base encodings of the ARMv8 SHA-1 instructions, with all register
    # fields zero.
    my %opcode = (
	"sha1c"	=> 0xf2000c40,	"sha1p"	=> 0xf2100c40,
	"sha1m"	=> 0xf2200c40,	"sha1su0" => 0xf2300c40,
	"sha1h"	=> 0xf3b902c0,	"sha1su1" => 0xf3ba0380	);

    # unsha1(mnemonic, operands)
    #
    # Hand-assembles one SHA-1 instruction so that the output can be
    # built by assemblers that do not know these mnemonics.
    #
    # mnemonic  one of the keys of %opcode
    # operands  "qD,qN,qM" or "qD,qM" (the middle register is optional)
    #
    # Returns a ".byte" directive with the four instruction bytes in
    # little-endian order and the original instruction as a trailing
    # comment.  If the mnemonic is unknown or the operands cannot be
    # parsed, the instruction is returned as "mnemonic<TAB>operands" so
    # that the assembler gets to see (and diagnose) it, instead of the
    # caller's substitution silently dropping or mis-encoding it.
    sub unsha1 {
	my ($mnemonic,$arg)=@_;

	if (exists($opcode{$mnemonic}) &&
	    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/) {
	    my ($qd,$qn,$qm) = ($1, defined($2) ? $2 : 0, $3);

	    # The low three bits of each q number go into the register
	    # field proper, the fourth bit into its separate extension
	    # bit.
	    my $word = $opcode{$mnemonic}|(($qd&7)<<13)|(($qd&8)<<19)
					 |(($qn&7)<<17)|(($qn&8)<<4)
					 |(($qm&7)<<1) |(($qm&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    return sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}
688
# Post-process the generated code line by line and print it:
#   - "{qN[]}" and "{qN[0]}" operands are rewritten in terms of the two
#     d registers that make up qN;
#   - sha1* instructions are replaced by their hand-assembled encoding;
#   - "ret" becomes "bx lr", and a literal "bx lr" becomes its encoding
#     as a .word.
# Literal braces in the patterns are escaped; the match is unchanged.
foreach (split($/,$code)) {
	s/\{q([0-9]+)\[\]\}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
	s/\{q([0-9]+)\[0\]\}/sprintf "{d%d[0]}",2*$1/eo;

	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;

	s/\bret\b/bx lr/o or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4

	print $_,$/;
}

# Closing flushes the output and, when STDOUT is a pipe to arm-xlate.pl,
# waits for the translator.  A failure here means the output is missing
# or incomplete, so it must not go unnoticed.  $! is 0 when the only
# problem is a non-zero exit status of the child.
close STDOUT
    or die "error closing STDOUT: " . ($! ? $! : "child exit status $?");