#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block procedure for ARMv4.
11#
12# January 2007.
13
# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
24# thumb = same as 'small' but in Thumb instructions[**] and
25# with recurring code in two private functions;
26# small = detached Xload/update, loops are folded;
27# compact = detached Xload/update, 5x unroll;
28# large = interleaved Xload/update, 5x unroll;
29# full unroll = interleaved Xload/update, full unroll, estimated[!];
30#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is half the size of an ARM one,
#	they are not as diverse as ARM ones: e.g., there are only two
#	arithmetic instructions with 3 arguments, no [fixed] rotate, and
#	addressing modes are limited. As a result it takes more
#	instructions to do the same job in Thumb, therefore the code is
#	never half the size and is always slower.
# [***]	which is also ~35% better than compiler generated code.
41
# Optional first command-line argument: name of the file that receives
# the generated assembly.  When it is absent STDOUT is left untouched,
# so the output goes wherever the caller pointed it.
$output=shift;
if (defined($output) && $output ne "") {
    # Three-argument open keeps the mode out of the file name, and a
    # failure aborts the run instead of silently producing no file.
    open(STDOUT,">",$output) or die "can't open $output: $!";
}
44
# Symbolic names for the ARM registers that appear in the generated code.
($ctx,$inp,$len)=map("r$_",0..2);	# hash state, input pointer, length
($a,$b,$c,$d,$e)=map("r$_",3..7);	# the five SHA-1 working variables
$K="r8";				# round constant
($t0,$t1,$t2,$t3)=map("r$_",9..12);	# scratch
$Xi="r14";				# pointer into the X[] words
# Working variables in the order the round generators receive them.
@V=($a,$b,$c,$d,$e);
60
61# One can optimize this for aligned access on big-endian architecture,
62# but code's endian neutrality makes it too pretty:-)
# Xload(A,B,C,D,E): append to $code the instructions that read the next
# input word and start one round.  The word is assembled from four
# single-byte loads, most significant byte first, is added to E together
# with K and ROR(A,27), and is stored through $Xi with pre-decrement.
# C^D is left in $t1 for the caller.  B is accepted but not used here.
sub Xload {
my ($A,undef,$C,$D,$E)=@_;
my $asm=<<___;
 ldrb $t0,[$inp],#4
 ldrb $t1,[$inp,#-3]
 ldrb $t2,[$inp,#-2]
 ldrb $t3,[$inp,#-1]
 add $E,$K,$E,ror#2 @ E+=K_00_19
 orr $t0,$t1,$t0,lsl#8
 add $E,$E,$A,ror#27 @ E+=ROR(A,27)
 orr $t0,$t2,$t0,lsl#8
 eor $t1,$C,$D @ F_xx_xx
 orr $t0,$t3,$t0,lsl#8
 add $E,$E,$t0 @ E+=X[i]
 str $t0,[$Xi,#-4]!
___
$code.=$asm;
}
# Xupdate(A,B,C,D,E[,flag]): append to $code the instructions that derive
# the next schedule word from four earlier ones (15, 13, 7 and 2 words
# above $Xi), rotate it with ror#31, add it to E together with K and
# ROR(A,27), and store it through $Xi with pre-decrement.
# Unless a defined sixth argument is given, C^D is also left in $t1 for
# the caller; BODY_40_59 passes the flag because it does not use it.
sub Xupdate {
my ($A,undef,$C,$D,$E,$no_f_prep)=@_;
my $schedule=<<___;
 ldr $t0,[$Xi,#15*4]
 ldr $t1,[$Xi,#13*4]
 ldr $t2,[$Xi,#7*4]
 ldr $t3,[$Xi,#2*4]
 add $E,$K,$E,ror#2 @ E+=K_xx_xx
 eor $t0,$t0,$t1
 eor $t0,$t0,$t2
 eor $t0,$t0,$t3
 add $E,$E,$A,ror#27 @ E+=ROR(A,27)
___
my $f_prep="";
if (!defined($no_f_prep)) {
    $f_prep=<<___;
 eor $t1,$C,$D @ F_xx_xx, but not in 40_59
___
}
my $commit=<<___;
 mov $t0,$t0,ror#31
 add $E,$E,$t0 @ E+=X[i]
 str $t0,[$Xi,#-4]!
___
$code.=$schedule.$f_prep.$commit;
}
102
# One round of 0..15: Xload fetches the input word and leaves C^D in
# $t1; the code appended here turns that into
# F_00_19 = (B & (C^D)) ^ D, with C and D taken ror#2, and adds it to E.
sub BODY_00_15 {
my (undef,$B,undef,$D,$E)=@_;
Xload(@_);
my $f_00_19=<<___;
 and $t1,$B,$t1,ror#2
 eor $t1,$t1,$D,ror#2 @ F_00_19(B,C,D)
 add $E,$E,$t1 @ E+=F_00_19(B,C,D)
___
$code.=$f_00_19;
}
112
# One round of 16..19: same F_00_19 as the first sixteen rounds, but the
# schedule word comes from Xupdate instead of the input buffer.
sub BODY_16_19 {
my (undef,$B,undef,$D,$E)=@_;
Xupdate(@_);
my $f_00_19=<<___;
 and $t1,$B,$t1,ror#2
 eor $t1,$t1,$D,ror#2 @ F_00_19(B,C,D)
 add $E,$E,$t1 @ E+=F_00_19(B,C,D)
___
$code.=$f_00_19;
}
122
# One round of the parity kind (the emitted loop is shared by 20..39 and
# 60..79): Xupdate leaves C^D in $t1, and a single eor with B completes
# F_20_39 = B ^ C ^ D before it is added to E.
sub BODY_20_39 {
my (undef,$B,undef,undef,$E)=@_;
Xupdate(@_);
my $f_20_39=<<___;
 eor $t1,$B,$t1,ror#2 @ F_20_39(B,C,D)
 add $E,$E,$t1 @ E+=F_20_39(B,C,D)
___
$code.=$f_20_39;
}
131
# One round of 40..59.  Xupdate is called with the extra flag so that it
# does not emit the C^D preparation; F_40_59 = (B & C) | ((B | C) & D)
# is computed here from scratch, with C and D taken ror#2.
sub BODY_40_59 {
my (undef,$B,$C,$D,$E)=@_;
Xupdate(@_,1);
my $f_40_59=<<___;
 and $t1,$B,$C,ror#2
 orr $t2,$B,$C,ror#2
 and $t2,$t2,$D,ror#2
 orr $t1,$t1,$t2 @ F_40_59(B,C,D)
 add $E,$E,$t1 @ E+=F_40_59(B,C,D)
___
$code.=$f_40_59;
}
143
# Generate the sha1_block_data_order routine into $code.
#
# Layout of the emitted code:
#  - prologue: save r4-r12,lr, turn $len into an end-of-input pointer
#    ($inp + $len*64) and load the five state words from $ctx;
#  - .Lloop: one pass per 64-byte block.  The schedule X[] lives on the
#    stack: $Xi starts at the sp seen on loop entry and is pre-decremented
#    by every round's store, while sp is lowered ahead of it and serves
#    as the bound tested by "teq $Xi,sp";
#  - .L_20_39_or_60_79 is executed twice per block; the carry flag
#    (cleared by cmn, set by cmp, preserved by teq) tells the passes
#    apart;
#  - .L_done: release the 80-word frame, add the working variables to
#    the state, and loop until $inp reaches $len.
#
# Each 5-round group is generated with @V rotated by one position per
# round, so after five rounds the register assignment is back where it
# started and the emitted loop body can simply be repeated.
$code=<<___;
.text

.global sha1_block_data_order
.type sha1_block_data_order,%function

.align 2
sha1_block_data_order:
 stmdb sp!,{r4-r12,lr}
 add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
 ldmia $ctx,{$a,$b,$c,$d,$e}
.Lloop:
 ldr $K,.LK_00_19
 mov $Xi,sp
 sub sp,sp,#15*4
 mov $c,$c,ror#30
 mov $d,$d,ror#30
 mov $e,$e,ror#30 @ [6]
.L_00_15:
___
# Rounds 0..14: five generated rounds, run three times by the loop.
for($i=0;$i<5;$i++) {
 BODY_00_15(@V); unshift(@V,pop(@V));
}
# The frame is extended by the remaining 25 words as soon as the loop
# exits.  Rounds 15..19 below store X[15..19] through $Xi; with sp already
# lowered those stores land inside the allocated frame instead of below
# the stack pointer, where an asynchronous signal/interrupt frame could
# overwrite them.  "sub" leaves the flags alone and none of the round
# code reads sp, so nothing else is affected.
$code.=<<___;
 teq $Xi,sp
 bne .L_00_15 @ [((11+4)*5+2)*3]
 sub sp,sp,#25*4
___
# Round 15 (last word taken from the input) and rounds 16..19.
 BODY_00_15(@V); unshift(@V,pop(@V));
 BODY_16_19(@V); unshift(@V,pop(@V));
 BODY_16_19(@V); unshift(@V,pop(@V));
 BODY_16_19(@V); unshift(@V,pop(@V));
 BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;

 ldr $K,.LK_20_39 @ [+15+16*4]
 cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# Five rounds of the body shared by 20..39 and 60..79; run four times
# per pass.
for($i=0;$i<5;$i++) {
 BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
 teq $Xi,sp @ preserve carry
 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes

 ldr $K,.LK_40_59
 sub sp,sp,#20*4 @ [+2]
.L_40_59:
___
# Five rounds of 40..59; run four times.
for($i=0;$i<5;$i++) {
 BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
 teq $Xi,sp
 bne .L_40_59 @ [+((12+5)*5+2)*4]

 ldr $K,.LK_60_79
 sub sp,sp,#20*4
 cmp sp,#0 @ set carry to denote 60_79
 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
 add sp,sp,#80*4 @ "deallocate" stack frame
 ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
 add $a,$K,$a
 add $b,$t0,$b
 add $c,$t1,$c,ror#2
 add $d,$t2,$d,ror#2
 add $e,$t3,$e,ror#2
 stmia $ctx,{$a,$b,$c,$d,$e}
 teq $inp,$len
 bne .Lloop @ [+18], total 1307

 ldmia sp!,{r4-r12,lr}
 tst lr,#1
 moveq pc,lr @ be binary compatible with V4, yet
 bx lr @ interoperable with Thumb ISA:-)
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.size sha1_block_data_order,.-sha1_block_data_order
.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
231
# Post-process and write out the generated assembly.
# "bx lr" is replaced by its raw encoding so that the result still
# assembles when the target is plain ARMv4.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
# Buffered write errors (full disk, closed pipe, ...) only surface when
# the handle is closed, so a failing close must abort the run rather than
# leave a truncated file behind.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush