#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
	die "can't locate arm-xlate.pl";

	open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
	open STDOUT,">$output";
}
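
# Usual perlasm calling convention: an optional target "flavour"
# (e.g. "linux32" or "ios32") followed by the output .S path. For a
# real flavour the generated code is piped through arm-xlate.pl, which
# adapts the GNU-style syntax below to the target assembler.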

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

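# The arrays above hold the FIPS 180-4 SHA-256 rotation/shift amounts:
#   Sigma0(x) = ROTR2(x)  ^ ROTR13(x) ^ ROTR22(x)
#   Sigma1(x) = ROTR6(x)  ^ ROTR11(x) ^ ROTR25(x)
#   sigma0(x) = ROTR7(x)  ^ ROTR18(x) ^ SHR3(x)
#   sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ SHR10(x)
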
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

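# One round of the compression function. For rounds 0..15 the input word
# is loaded (and byte-reversed on little-endian targets) right here;
# rounds 16..63 get X[i] from BODY_16_XX below. Maj(a,b,c) is computed
# as ((a^b)&(b^c))^b, with the a^b/b^c terms carried in $t2/$t3 and the
# final "h+=Maj(a,b,c)" folded into the next round ("from the past").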
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr $t1,[$inp],#4 @ $i
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
	rev $t1,$t1
# endif
#else
	@ ldrb $t1,[$inp,#3] @ $i
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	ldrb $t2,[$inp,#2]
	ldrb $t0,[$inp,#1]
	orr $t1,$t1,$t2,lsl#8
	ldrb $t2,[$inp],#4
	orr $t1,$t1,$t0,lsl#16
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr $t1,$t1,$t2,lsl#24
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
	ldr $t2,[$Ktbl],#4 @ *K256++
	add $h,$h,$t1 @ h+=X[i]
	str $t1,[sp,#`$i%16`*4]
	eor $t1,$f,$g
	add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
	and $t1,$t1,$e
	add $h,$h,$t2 @ h+=K256[i]
	eor $t1,$t1,$g @ Ch(e,f,g)
	eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
	and $t2,$t2,#0xff
	cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4 @ prefetch
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t2,$a,$b @ a^b, b^c in next round
#else
	ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
	eor $t2,$a,$b @ a^b, b^c in next round
	ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
	eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
	and $t3,$t3,$t2 @ (b^c)&=(a^b)
	add $d,$d,$h @ d+=h
	eor $t3,$t3,$b @ Maj(a,b,c)
	add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
	@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

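# Rounds 16..63: extend the message schedule held in the 16-word circular
# buffer on the stack,
#   X[i&15] += sigma1(X[(i+14)&15]) + X[(i+9)&15] + sigma0(X[(i+1)&15])
# then fall through to BODY_00_15 for the round itself.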
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
	@ ldr $t4,[sp,#`($i+14)%16`*4]
	mov $t0,$t1,ror#$sigma0[0]
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	mov $t2,$t4,ror#$sigma1[0]
	eor $t0,$t0,$t1,ror#$sigma0[1]
	eor $t2,$t2,$t4,ror#$sigma1[1]
	eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
	ldr $t1,[sp,#`($i+0)%16`*4]
	eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
	ldr $t4,[sp,#`($i+9)%16`*4]

	add $t2,$t2,$t0
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
	add $t1,$t1,$t2
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
	add $t1,$t1,$t4 @ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch armv7-a

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
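@ The zero word above terminates K256: at the end of every 16 NEON rounds
@ the code peeks at the next table entry and stops once it reads 0, while
@ the integer path instead tests for the 0xf2 low byte of the last real
@ constant (0xc67178f2).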
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5

.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub r3,pc,#8 @ sha256_block_data_order
#else
	adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr r12,.LOPENSSL_armcap
	ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
	ldr r12,[r12]
#endif
	tst r12,#ARMV8_SHA256
	bne .LARMv8
	tst r12,#ARMV7_NEON
	bne .LNEON
#endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub $Ktbl,r3,#256+32 @ K256
	sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t3,$B,$C @ magic
	eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
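# Only rounds 0..31 are generated: the 16-round .Lrounds_16_xx body is
# branched back to until the cmp with #0xf2 (emitted at round 31) sees
# the final K256 constant, so rounds 32..63 reuse the same code.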
$code.=<<___;
#if __ARM_ARCH__>=7
	ite eq @ Thumb2 thing, sanity check in ARM
#endif
	ldreq $t3,[sp,#16*4] @ pull ctx
	bne .Lrounds_16_xx

	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t0,[$t3,#0]
	ldr $t1,[$t3,#4]
	ldr $t2,[$t3,#8]
	add $A,$A,$t0
	ldr $t0,[$t3,#12]
	add $B,$B,$t1
	ldr $t1,[$t3,#16]
	add $C,$C,$t2
	ldr $t2,[$t3,#20]
	add $D,$D,$t0
	ldr $t0,[$t3,#24]
	add $E,$E,$t1
	ldr $t1,[$t3,#28]
	add $F,$F,$t2
	ldr $inp,[sp,#17*4] @ pull inp
	ldr $t2,[sp,#18*4] @ pull inp+len
	add $G,$G,$t0
	add $H,$H,$t1
	stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp $inp,$t2
	sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
	bne .Loop

	add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r11,pc}
#else
	ldmia sp!,{r4-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }

sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
	my $arg = pop;
	$arg = "#$arg" if ($arg*1 eq $arg);
	$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

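# Xupdate interleaves two streams: NEON instructions that advance the
# message schedule four words at a time (and add the next K256 vector),
# and the scalar round snippets supplied via $body, which are doled out
# between them to hide NEON latency. &vext_8, &vshr_u32 etc. are not
# real subs; AUTOLOAD above turns them into the corresponding
# "vext.8"/"vshr.u32" instruction text.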
sub Xupdate()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body);
	my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T2,$T0,$sigma0[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T1,$T0,$sigma0[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T2,$T0,32-$sigma0[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T3,$T0,$sigma0[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T1,$T1,$T2);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T3,$T0,32-$sigma0[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T1,$T1,$T3); # sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4); # sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4); # sigma1(X[16..17])
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 ($T0,$T0,@X[0]);
	while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32 ("{$T0}","[$Xfer,:128]!");
	eval(shift(@insns));
	eval(shift(@insns));

	push(@X,shift(@X)); # "rotate" X[]
}

sub Xpreload()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body);
	my ($a,$b,$c,$d,$e,$f,$g,$h);

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vrev32_8 (@X[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 ($T0,$T0,@X[0]);
	foreach (@insns) { eval; } # remaining instructions
	&vst1_32 ("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X)); # "rotate" X[]
}

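# body_00_15 returns one scalar round as a list of Perl snippets rather
# than emitting it directly, so Xupdate/Xpreload above can interleave
# them with the NEON code. At generation time $j runs 0..31: most rounds
# load the next X[i]+K[i] from the stack transfer area, round 15 peeks
# at the next K256 word (terminator check) and round 31 reloads the
# context pointer from [sp,#64].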
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add ($h,$h,$t1)', # h+=X[i]+K[i]
	'&eor ($t1,$f,$g)',
	'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
	'&and ($t1,$t1,$e)',
	'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
	'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor ($t1,$t1,$g)', # Ch(e,f,g)
	'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
	'&eor ($t2,$a,$b)', # a^b, b^c in next round
	'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
	'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
	'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
	'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
	'&ldr ($t1,"[sp,#64]") if ($j==31)',
	'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
	'&add ($d,$d,$h)', # d+=h
	'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
	'&eor ($t3,$t3,$b)', # Maj(a,b,c)
	'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
.LNEON:
	stmdb sp!,{r4-r12,lr}

	sub $H,sp,#16*4+16
	adr $Ktbl,K256
	bic $H,$H,#15 @ align for 128-bit stores
	mov $t2,sp
	mov sp,$H @ alloca
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp

	vld1.8 {@X[0]},[$inp]!
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	vld1.32 {$T0},[$Ktbl,:128]!
	vld1.32 {$T1},[$Ktbl,:128]!
	vld1.32 {$T2},[$Ktbl,:128]!
	vld1.32 {$T3},[$Ktbl,:128]!
	vrev32.8 @X[0],@X[0] @ yes, even on
	str $ctx,[sp,#64]
	vrev32.8 @X[1],@X[1] @ big-endian
	str $inp,[sp,#68]
	mov $Xfer,sp
	vrev32.8 @X[2],@X[2]
	str $len,[sp,#72]
	vrev32.8 @X[3],@X[3]
	str $t2,[sp,#76] @ save original sp
	vadd.i32 $T0,$T0,@X[0]
	vadd.i32 $T1,$T1,@X[1]
	vst1.32 {$T0},[$Xfer,:128]!
	vadd.i32 $T2,$T2,@X[2]
	vst1.32 {$T1},[$Xfer,:128]!
	vadd.i32 $T3,$T3,@X[3]
	vst1.32 {$T2},[$Xfer,:128]!
	vst1.32 {$T3},[$Xfer,:128]!

	ldmia $ctx,{$A-$H}
	sub $Xfer,$Xfer,#64
	ldr $t1,[sp,#0]
	eor $t2,$t2,$t2
	eor $t3,$B,$C
	b .L_00_48

.align 4
.L_00_48:
___
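# Each Xupdate call interleaves four scalar rounds with the NEON
# computation of the next four schedule words, so one pass through
# .L_00_48 covers 16 rounds; at run time the loop executes three times
# (rounds 0..47) and exits once the K256 terminator is read. The
# Xpreload calls then cover rounds 48..63 while byte-reversing the next
# input block and pre-adding its first 16 K256 constants.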
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq $t1,#0 @ check for K256 terminator
	ldr $t1,[sp,#0]
	sub $Xfer,$Xfer,#64
	bne .L_00_48

	ldr $inp,[sp,#68]
	ldr $t0,[sp,#72]
	sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
	teq $inp,$t0
	it eq
	subeq $inp,$inp,#64 @ avoid SEGV
	vld1.8 {@X[0]},[$inp]! @ load next input block
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	it ne
	strne $inp,[sp,#68]
	mov $Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr $t0,[$t1,#0]
	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t2,[$t1,#4]
	ldr $t3,[$t1,#8]
	ldr $t4,[$t1,#12]
	add $A,$A,$t0 @ accumulate
	ldr $t0,[$t1,#16]
	add $B,$B,$t2
	ldr $t2,[$t1,#20]
	add $C,$C,$t3
	ldr $t3,[$t1,#24]
	add $D,$D,$t4
	ldr $t4,[$t1,#28]
	add $E,$E,$t0
	str $A,[$t1],#4
	add $F,$F,$t2
	str $B,[$t1],#4
	add $G,$G,$t3
	str $C,[$t1],#4
	add $H,$H,$t4
	str $D,[$t1],#4
	stmia $t1,{$E-$H}

	ittte ne
	movne $Xfer,sp
	ldrne $t1,[sp,#0]
	eorne $t2,$t2,$t2
	ldreq sp,[sp,#76] @ restore original sp
	itt ne
	eorne $t3,$B,$C
	bne .L_00_48

	ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
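@ INST() emits the hand-encoded SHA-256 instructions produced by
@ unsha256() at the bottom of this file: in ARM mode the four bytes are
@ the little-endian instruction word, while in Thumb-2 mode the two
@ halfwords are swapped and the top byte 0xf3 of the ARM encoding
@ becomes 0xff (the d|0xc above).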

.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32 {$ABCD,$EFGH},[$ctx]
	sub $Ktbl,$Ktbl,#256+32
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	b .Loop_v8

.align 4
.Loop_v8:
	vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
	vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
	vld1.32 {$W0},[$Ktbl]!
	vrev32.8 @MSG[0],@MSG[0]
	vrev32.8 @MSG[1],@MSG[1]
	vrev32.8 @MSG[2],@MSG[2]
	vrev32.8 @MSG[3],@MSG[3]
	vmov $ABCD_SAVE,$ABCD @ offload
	vmov $EFGH_SAVE,$EFGH
	teq $inp,$len
___
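# The twelve iterations below each do four rounds with on-the-fly message
# schedule extension (sha256su0/sha256su1); the tail that follows does the
# final 16 rounds without schedule updates, for 64 rounds per block.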
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	sha256su0 @MSG[0],@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0
	sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vld1.32 {$W0},[$Ktbl]!
	vadd.i32 $W1,$W1,@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vld1.32 {$W1},[$Ktbl]
	vadd.i32 $W0,$W0,@MSG[2]
	sub $Ktbl,$Ktbl,#256-16 @ rewind
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vadd.i32 $W1,$W1,@MSG[3]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
	vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
	it ne
	bne .Loop_v8

	vst1.32 {$ABCD,$EFGH},[$ctx]

	ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
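# The loop above copies this script's own header comments (license and
# change log) into the generated assembly, rewriting "#" as "@".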

{ my %opcode = (
	"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
	"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );

  sub unsha256 {
  my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
		|(($2&7)<<17)|(($2&8)<<4)
		|(($3&7)<<1) |(($3&8)<<2);
	# ARMv7 instructions are always encoded little-endian, so the
	# encoding is emitted byte by byte. The correct solution would be
	# to use the .inst directive, but older assemblers don't
	# implement it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
		$word&0xff,($word>>8)&0xff,
		($word>>16)&0xff,($word>>24)&0xff,
		$mnemonic,$arg;
	}
  }
}

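# Post-process the accumulated code: evaluate `...` expressions, replace
# sha256* mnemonics with their hand-encoded forms via unsha256(), and
# rewrite "ret"/"bx lr" so the result still assembles with -march=armv4.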
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT"; # enforce flush