#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let the hardware handle unaligned input(*), except on page
# boundaries (see below for details). Otherwise this is a
# straightforward implementation with the X vector in the register
# bank. The module is big-endian [which is not a big deal as there are
# no little-endian targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned loads?

#                       -m64    -m32
# ----------------------------------
# PPC970,gcc-4.0.0      +76%    +59%
# Power6,xlc-7          +68%    +33%

# Command line: <flavour> [<output>].
# The flavour must contain "64" or "32"; it selects the pointer size and
# the width-dependent mnemonics used by the templates below, and it is
# passed on unchanged to ppc-xlate.pl together with the optional second
# argument.
$flavour = shift;

if ($flavour =~ /64/) {
    $SIZE_T = 8;
    $UCMP   = "cmpld";
    $STU    = "stdu";
    $POP    = "ld";
    $PUSH   = "std";
} elsif ($flavour =~ /32/) {
    $SIZE_T = 4;
    $UCMP   = "cmplw";
    $STU    = "stwu";
    $POP    = "lwz";
    $PUSH   = "stw";
} else { die "nonsense $flavour"; }

# Locate the translator: first next to this script, then in the shared
# perlasm directory.  $dir stays empty when $0 has no directory part,
# which makes both candidates relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything printed goes through the translator.
# The check must use low-precedence "or": with "||" the die was bound
# to the (always true) command string, so a failed open was silently
# ignored.
my $output = shift;
$output = "" unless defined $output;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Base frame size: 24 pointer-sized slots.
$FRAME=24*$SIZE_T;

# Fixed-role registers r0..r5: $K receives the round constant, $sp is
# the stack pointer, $ctx/$inp/$num are the context pointer, the input
# pointer and the block count used by the templates below.
($K,$sp,$toc,$ctx,$inp,$num) = map { "r$_" } 0..5;

# Two scratch registers.
($t0,$t1) = ("r15","r6");

# The five state words plus one spare; @V lists them in the order in
# which the round generators receive them and is rotated after every
# round.
($A,$B,$C,$D,$E,$T) = map { "r$_" } 7..12;
@V = ($A,$B,$C,$D,$E,$T);

# Sixteen registers holding the X vector (message schedule window).
@X = map { "r$_" } 16..31;

# Append the instructions for round $i of the first group (0..19) to
# $code.
#
# Arguments: the round index followed by six register names in their
# current rotation.  $a..$e are the state words and $f receives
# K + e + rol(a,5) + X[i] + ((b & c) | (d & ~b)); $e is overwritten with
# rol(a,5) and $b is rotated left by 30 in place.
#
# Round 0 first loads X[0]; rounds 0..14 load the next input word from
# $inp, rounds 15 and up instead compute the next schedule word in place
# (xor of three other X registers, rotated left by one).
sub BODY_00_19 {
    my ($i,$a,$b,$c,$d,$e,$f)=@_;
    my $nxt=$i+1;
    # X registers touched by this round; indices wrap modulo 16.
    my ($Xi,$Xn,$Xn2,$Xn8,$Xn13)=
        @X[map { $_ % 16 } ($i,$nxt,$nxt+2,$nxt+8,$nxt+13)];

    if ($i==0) {
        $code.=<<___;
 lwz $Xi,`$i*4`($inp)
___
    }

    if ($i<15) {
        $code.=<<___;
 lwz $Xn,`$nxt*4`($inp)
 add $f,$K,$e
 rotlwi $e,$a,5
 add $f,$f,$Xi
 and $t0,$c,$b
 add $f,$f,$e
 andc $t1,$d,$b
 rotlwi $b,$b,30
 or $t0,$t0,$t1
 add $f,$f,$t0
___
    } else {
        $code.=<<___;
 add $f,$K,$e
 rotlwi $e,$a,5
 xor $Xn,$Xn,$Xn2
 add $f,$f,$Xi
 and $t0,$c,$b
 xor $Xn,$Xn,$Xn8
 add $f,$f,$e
 andc $t1,$d,$b
 rotlwi $b,$b,30
 or $t0,$t0,$t1
 xor $Xn,$Xn,$Xn13
 add $f,$f,$t0
 rotlwi $Xn,$Xn,1
___
    }
}

# Append the instructions for one "parity" round to $code; the driver
# uses this generator for rounds 20..39 and again for 60..79.
#
# Arguments: the round index followed by six register names in their
# current rotation.  $f receives K + e + rol(a,5) + X[i] + (b ^ c ^ d);
# $e is overwritten with rol(a,5) and $b is rotated left by 30.
#
# Every round before 79 also computes the next schedule word in place.
# Round 79 has no next word; it uses the free slots to load the five
# state words from $ctx into r16..r20 instead.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e,$f)=@_;
    my $nxt=$i+1;
    # X registers touched by this round; indices wrap modulo 16.
    my ($Xi,$Xn,$Xn2,$Xn8,$Xn13)=
        @X[map { $_ % 16 } ($i,$nxt,$nxt+2,$nxt+8,$nxt+13)];

    if ($i<79) {
        $code.=<<___;
 add $f,$K,$e
 rotlwi $e,$a,5
 xor $Xn,$Xn,$Xn2
 add $f,$f,$Xi
 xor $t0,$b,$c
 xor $Xn,$Xn,$Xn8
 add $f,$f,$e
 rotlwi $b,$b,30
 xor $t0,$t0,$d
 xor $Xn,$Xn,$Xn13
 add $f,$f,$t0
 rotlwi $Xn,$Xn,1
___
    } elsif ($i==79) {
        $code.=<<___;
 add $f,$K,$e
 rotlwi $e,$a,5
 lwz r16,0($ctx)
 add $f,$f,$Xi
 xor $t0,$b,$c
 lwz r17,4($ctx)
 add $f,$f,$e
 rotlwi $b,$b,30
 lwz r18,8($ctx)
 xor $t0,$t0,$d
 lwz r19,12($ctx)
 add $f,$f,$t0
 lwz r20,16($ctx)
___
    }
}

# Append the instructions for one round of the third group (40..59) to
# $code.
#
# Arguments: the round index followed by six register names in their
# current rotation.  $f receives
# K + e + rol(a,5) + X[i] + ((b & c) | ((b | c) & d)); $e is overwritten
# with rol(a,5) and $b is rotated left by 30.  The next schedule word is
# computed in place alongside.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e,$f)=@_;
    my $nxt=$i+1;
    # X registers touched by this round; indices wrap modulo 16.
    my ($Xi,$Xn,$Xn2,$Xn8,$Xn13)=
        @X[map { $_ % 16 } ($i,$nxt,$nxt+2,$nxt+8,$nxt+13)];

    $code.=<<___;
 add $f,$K,$e
 rotlwi $e,$a,5
 xor $Xn,$Xn,$Xn2
 add $f,$f,$Xi
 and $t0,$b,$c
 xor $Xn,$Xn,$Xn8
 add $f,$f,$e
 or $t1,$b,$c
 rotlwi $b,$b,30
 xor $Xn,$Xn,$Xn13
 and $t1,$t1,$d
 or $t0,$t0,$t1
 rotlwi $Xn,$Xn,1
 add $f,$f,$t0
___
}

# Public entry point .sha1_block_data_order.
#
# The prologue allocates $FRAME+64 bytes, stores the link register (via
# r0) and r15..r31 in the frame, and loads the five state words from
# $ctx.  Input that is 4-byte aligned is handed straight to
# Lsha1_block_private with the block count in CTR; anything else goes
# through Lunaligned.  Ldone restores what the prologue saved and
# returns.
{
    # Register => slot number; slot n is addressed as $FRAME-$SIZE_T*n
    # off the stack pointer.  r0 (the saved link register) takes slot
    # 18 and r15..r31 take slots 17 down to 1.
    my @spill = (["r0",18], map { ["r$_",32-$_] } 15..31);

    # The offset is left as a `...` expression, exactly as in the rest
    # of the template; it is evaluated when the text is post-processed.
    my $slot = sub { my ($n)=@_; "`$FRAME-$SIZE_T*$n`($sp)" };

    my $save    = join "\n", map { " $PUSH $_->[0],".$slot->($_->[1]) } @spill;
    my $restore = join "\n", map { " $POP $_->[0],".$slot->($_->[1]) } @spill;

    $code=<<___;
.machine "any"
.text

.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
 mflr r0
 $STU $sp,`-($FRAME+64)`($sp)
$save
 lwz $A,0($ctx)
 lwz $B,4($ctx)
 lwz $C,8($ctx)
 lwz $D,12($ctx)
 lwz $E,16($ctx)
 andi. r0,$inp,3
 bne Lunaligned
Laligned:
 mtctr $num
 bl Lsha1_block_private
Ldone:
$restore
 mtlr r0
 addi $sp,$sp,`$FRAME+64`
 blr
___
}

# The PowerPC specification allows an implementation to misbehave on an
# unaligned access that crosses a page boundary.  Following the "better
# safe than sorry" principle such input is treated specially.  The code
# does not look for the particular offending word but for the 64-byte
# input block that crosses the boundary: whole blocks in front of the
# boundary are hashed in place, then the crossing block is copied byte
# by byte to the 64-byte area at $FRAME off the stack pointer and hashed
# from there.  This repeats until no blocks are left.
{
    # One word per Lmemcpy iteration, moved a byte at a time through
    # r16..r19; r20 is the destination pointer.
    my @byte  = map { "r$_" } 16..19;
    my $load  = join "\n", map { " lbz $byte[$_],$_($inp)" } 0..3;
    my $store = join "\n", map { " stb $byte[$_],$_(r20)" } 0..3;

    $code.=<<___;
.align 4
Lunaligned:
 subfic $t1,$inp,4096
 andi. $t1,$t1,4095 ; distance to closest page boundary
 srwi. $t1,$t1,6 ; t1/=64
 beq Lcross_page
 $UCMP $num,$t1
 ble- Laligned ; didn't cross the page boundary
 mtctr $t1
 subfc $num,$t1,$num
 bl Lsha1_block_private
Lcross_page:
 li $t1,16
 mtctr $t1
 addi r20,$sp,$FRAME ; spot below the frame
Lmemcpy:
$load
 addi $inp,$inp,4
$store
 addi r20,r20,4
 bdnz Lmemcpy

 $PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
 li $t1,1
 addi $inp,$sp,$FRAME
 mtctr $t1
 bl Lsha1_block_private
 $POP $inp,`$FRAME-$SIZE_T*19`($sp)
 addic. $num,$num,-1
 bne- Lunaligned
 b Ldone
___
}

# Private block function with a tailored calling interface: on entry
# the SHA_CTX state is already loaded into $A..$E and the count register
# holds the number of 64-byte chunks to digest.
$code.=<<___;
.align 4
Lsha1_block_private:
___
{
    # One entry per group of twenty rounds: the round constant as its
    # high and low halves, the generator for the group, and the index of
    # the first round that no longer belongs to it.
    my @group = (
        ["0x5a82","0x7999",\&BODY_00_19,20],    # K_00_19
        ["0x6ed9","0xeba1",\&BODY_20_39,40],    # K_20_39
        ["0x8f1b","0xbcdc",\&BODY_40_59,60],    # K_40_59
        ["0xca62","0xc1d6",\&BODY_20_39,80],    # K_60_79
    );

    $i=0;
    foreach my $g (@group) {
        my ($hi,$lo,$emit,$stop)=@$g;
        $code.=" lis $K,$hi\n ori $K,$K,$lo\n";
        while ($i<$stop) {
            $emit->($i,@V);
            # Rotate the register assignment instead of moving values.
            unshift(@V,pop(@V));
            $i++;
        }
    }

    # Round 79 left the previous state in r16..r20.  After 80 rotations
    # of the six-entry @V the new a..e live in $E,$T,$A,$B,$C: add them
    # in, store the sums back to $ctx and move them to $A..$E for the
    # next chunk.
    my @sum  = map { "r$_" } 16..20;
    my @work = ($E,$T,$A,$B,$C);
    my @home = ($A,$B,$C,$D,$E);

    foreach my $n (0..4) {
        $code.=" add $sum[$n],$sum[$n],$work[$n]\n";
    }
    foreach my $n (0..4) {
        my $off=4*$n;
        $code.=" stw $sum[$n],$off($ctx)\n";
        $code.=" mr $home[$n],$sum[$n]\n";
    }

    $code.=<<___;
 addi $inp,$inp,`16*4`
 bdnz- Lsha1_block_private
 blr
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
}

# Replace every `...` expression embedded in the generated text (frame
# offsets and similar arithmetic) with the value it evaluates to, then
# send the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the translator, so buffered write errors and a
# non-zero translator exit status only show up when it is closed; an
# unchecked close would let a broken build pass silently.
close STDOUT or die "error closing STDOUT: $! (child status $?)";