#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module is not of direct interest to OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.
#
# The module targets N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. is likely to require adaptation for other OSes.

# Register allocation for the generated routine. Each scalar holds the
# symbolic register name that is interpolated into the assembly text.
# (s7 is additionally used by the assembly as a literal scratch register.)
#
# Arguments, in call order:
#
# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="a6";	# low  accumulator of the ap[j]*bp[i] column
$hi0="a7";	# high accumulator / carry of the ap[j]*bp[i] column
$lo1="v0";	# low  accumulator of the np[j]*m1 column (v0 also carries the return value)
$hi1="v1";	# high accumulator / carry of the np[j]*m1 column
$aj="t0";	# ap[j] (also used to form its address)
$bi="t1";	# bp[i]
$nj="t2";	# np[j] (also used to form its address)
$tp="t3";	# walking pointer into the temporary vector tp[] on the stack
$alo="s0";	# low  half of the latest ap[j]*bp[i] product
$ahi="s1";	# high half of the latest ap[j]*bp[i] product
$nlo="s2";	# low  half of the latest np[j]*m1 product
$nhi="s3";	# high half of the latest np[j]*m1 product
$tj="s4";	# tp[j] word being folded in; later &tp[num] as loop end marker
$i="s5";	# outer loop counter, byte offset into bp[]
$j="s6";	# inner loop counter, byte offset into ap[]/np[]
$fp="t8";	# frame pointer: base of the s0-s7 save area
$m1="t9";	# per-row multiplier, low word of lo0*n0

# NOTE(review): $FRAME is not referenced by the assembly text in this file,
# which hard-codes a 64-byte save area; kept as-is.
$FRAME=8*(2+8);

# Assembly text for bn_mul_mont. Register names ($rp, $lo0, ...) are
# interpolated by Perl from the scalars assigned earlier in this script.
#
# Outline of the emitted routine:
#   1. Return 0 immediately when num < 4.
#   2. Reserve num+2 words of scratch (tp[]) below sp, round sp down to a
#      4096-byte boundary and save s0-s7 in the 64-byte frame at $fp.
#   3. .L1st: first row, tp[] = (ap[]*bp[0] + np[]*m1) moved down one word,
#      with m1 = low word of (ap[0]*bp[0])*n0.
#   4. .Louter/.Linner: for every further bp[i],
#      tp[] = (tp[] + ap[]*bp[i] + np[]*m1) moved down one word.
#   5. .Lsub: rp[] = tp[] - np[], propagating the borrow.
#   6. .Lcopy: pick tp[] or rp[] according to the final borrow, copy it to
#      rp[] and overwrite tp[] with zeros.
#   7. Restore s0-s7, release the frame and return 1.
#
# The text begins with #include lines, so it is expected to pass through the
# C preprocessor; comments inside the heredoc are therefore kept as trailing
# "#" comments only.
$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set    noat
.set    reorder

.align  5
.globl  bn_mul_mont
.ent    bn_mul_mont
bn_mul_mont:
        .set    noreorder
        PTR_SUB sp,64                   # reserve the register save area
        move    $fp,sp
        .frame  $fp,64,ra
        slt     AT,$num,4               # AT = (num < 4)
        li      v0,0                    # preset return value 0
        beqzl   AT,.Lproceed            # num >= 4: go do the work
        nop                             # delay slot (branch-likely: skipped when not taken)
        jr      ra                      # num < 4: return 0
        PTR_ADD sp,$fp,64               # delay slot: release the save area
        .set    reorder
.align  5
.Lproceed:
        ld      $n0,0($n0)              # load the n0 word through the pointer
        ld      $bi,0($bp)              # bp[0]
        ld      $aj,0($ap)              # ap[0]
        ld      $nj,0($np)              # np[0]
        PTR_SUB sp,16                   # place for two extra words
        sll     $num,3                  # num *= 8: vector length in bytes
        li      AT,-4096
        PTR_SUB sp,$num                 # room for the num words of tp[]
        and     sp,AT                   # round sp down to a 4096-byte boundary

        sd      s0,0($fp)
        sd      s1,8($fp)
        sd      s2,16($fp)
        sd      s3,24($fp)
        sd      s4,32($fp)
        sd      s5,40($fp)
        sd      s6,48($fp)
        sd      s7,56($fp)

        dmultu  $aj,$bi                 # ap[0]*bp[0]
        ld      $alo,8($ap)             # ap[1]
        ld      $nlo,8($np)             # np[1]
        mflo    $lo0
        mfhi    $hi0
        dmultu  $lo0,$n0
        mflo    $m1                     # m1 = low word of lo0*n0

        dmultu  $alo,$bi                # ap[1]*bp[0]
        mflo    $alo
        mfhi    $ahi

        dmultu  $nj,$m1                 # np[0]*m1
        mflo    $lo1
        mfhi    $hi1
        dmultu  $nlo,$m1                # np[1]*m1
        daddu   $lo1,$lo0               # word 0 sum is not stored, only its carry is kept
        sltu    AT,$lo1,$lo0
        daddu   $hi1,AT
        mflo    $nlo
        mfhi    $nhi

        move    $tp,sp
        li      $j,16                   # byte offset of word 2
.align  4
.L1st:
        .set    noreorder
        PTR_ADD $aj,$ap,$j
        ld      $aj,($aj)               # ap[j]
        PTR_ADD $nj,$np,$j
        ld      $nj,($nj)               # np[j]

        dmultu  $aj,$bi                 # ap[j]*bp[0]
        daddu   $lo0,$alo,$hi0
        daddu   $lo1,$nlo,$hi1
        sltu    AT,$lo0,$hi0
        sltu    s7,$lo1,$hi1
        daddu   $hi0,$ahi,AT
        daddu   $hi1,$nhi,s7
        mflo    $alo
        mfhi    $ahi

        daddu   $lo1,$lo0
        sltu    AT,$lo1,$lo0
        dmultu  $nj,$m1                 # np[j]*m1
        daddu   $hi1,AT
        addu    $j,8
        sd      $lo1,($tp)              # result word of the previous column
        sltu    s7,$j,$num
        mflo    $nlo
        mfhi    $nhi

        bnez    s7,.L1st
        PTR_ADD $tp,8                   # delay slot: advance tp
        .set    reorder

        daddu   $lo0,$alo,$hi0
        sltu    AT,$lo0,$hi0
        daddu   $hi0,$ahi,AT

        daddu   $lo1,$nlo,$hi1
        sltu    s7,$lo1,$hi1
        daddu   $hi1,$nhi,s7
        daddu   $lo1,$lo0
        sltu    AT,$lo1,$lo0
        daddu   $hi1,AT

        sd      $lo1,($tp)

        daddu   $hi1,$hi0
        sltu    AT,$hi1,$hi0
        sd      $hi1,8($tp)
        sd      AT,16($tp)              # top word: carry out of hi1+hi0

        li      $i,8                    # byte offset of bp[1]
.align  4
.Louter:
        PTR_ADD $bi,$bp,$i
        ld      $bi,($bi)               # bp[i]
        ld      $aj,($ap)               # ap[0]
        ld      $alo,8($ap)             # ap[1]
        ld      $tj,(sp)                # tp[0]

        dmultu  $aj,$bi                 # ap[0]*bp[i]
        ld      $nj,($np)               # np[0]
        ld      $nlo,8($np)             # np[1]
        mflo    $lo0
        mfhi    $hi0
        daddu   $lo0,$tj
        dmultu  $lo0,$n0
        sltu    AT,$lo0,$tj
        daddu   $hi0,AT
        mflo    $m1                     # m1 = low word of lo0*n0

        dmultu  $alo,$bi                # ap[1]*bp[i]
        mflo    $alo
        mfhi    $ahi

        dmultu  $nj,$m1                 # np[0]*m1
        mflo    $lo1
        mfhi    $hi1

        dmultu  $nlo,$m1                # np[1]*m1
        daddu   $lo1,$lo0               # word 0 sum is not stored, only its carry is kept
        sltu    AT,$lo1,$lo0
        daddu   $hi1,AT
        mflo    $nlo
        mfhi    $nhi

        move    $tp,sp
        li      $j,16                   # byte offset of word 2
        ld      $tj,8($tp)              # tp[1]
.align  4
.Linner:
        .set    noreorder
        PTR_ADD $aj,$ap,$j
        ld      $aj,($aj)               # ap[j]
        PTR_ADD $nj,$np,$j
        ld      $nj,($nj)               # np[j]

        dmultu  $aj,$bi                 # ap[j]*bp[i]
        daddu   $lo0,$alo,$hi0
        daddu   $lo1,$nlo,$hi1
        sltu    AT,$lo0,$hi0
        sltu    s7,$lo1,$hi1
        daddu   $hi0,$ahi,AT
        daddu   $hi1,$nhi,s7
        mflo    $alo
        mfhi    $ahi

        daddu   $lo0,$tj                # fold in the previous row
        addu    $j,8
        dmultu  $nj,$m1                 # np[j]*m1
        sltu    AT,$lo0,$tj
        daddu   $lo1,$lo0
        daddu   $hi0,AT
        sltu    s7,$lo1,$lo0
        ld      $tj,16($tp)             # prefetch the next word of the previous row
        daddu   $hi1,s7
        sltu    AT,$j,$num
        mflo    $nlo
        mfhi    $nhi
        sd      $lo1,($tp)
        bnez    AT,.Linner
        PTR_ADD $tp,8                   # delay slot: advance tp
        .set    reorder

        daddu   $lo0,$alo,$hi0
        sltu    AT,$lo0,$hi0
        daddu   $hi0,$ahi,AT
        daddu   $lo0,$tj
        sltu    s7,$lo0,$tj
        daddu   $hi0,s7

        ld      $tj,16($tp)             # top word of the previous row
        daddu   $lo1,$nlo,$hi1
        sltu    AT,$lo1,$hi1
        daddu   $hi1,$nhi,AT
        daddu   $lo1,$lo0
        sltu    s7,$lo1,$lo0
        daddu   $hi1,s7
        sd      $lo1,($tp)

        daddu   $lo1,$hi1,$hi0
        sltu    $hi1,$lo1,$hi0
        daddu   $lo1,$tj
        sltu    AT,$lo1,$tj
        daddu   $hi1,AT
        sd      $lo1,8($tp)
        sd      $hi1,16($tp)            # new top word, also kept in the register for .Lsub

        addu    $i,8
        sltu    s7,$i,$num
        bnez    s7,.Louter

        .set    noreorder
        PTR_ADD $tj,sp,$num             # &tp[num]
        move    $tp,sp
        move    $ap,sp
        li      $hi0,0                  # clear borrow bit

.align  4
.Lsub:  ld      $lo0,($tp)
        ld      $lo1,($np)
        PTR_ADD $tp,8
        PTR_ADD $np,8
        dsubu   $lo1,$lo0,$lo1          # tp[i]-np[i]
        sgtu    AT,$lo1,$lo0            # borrow out of the subtraction
        dsubu   $lo0,$lo1,$hi0          # minus the incoming borrow
        sgtu    $hi0,$lo0,$lo1
        sd      $lo0,($rp)
        or      $hi0,AT                 # combined outgoing borrow
        sltu    AT,$tp,$tj
        bnez    AT,.Lsub
        PTR_ADD $rp,8                   # delay slot: advance rp

        dsubu   $hi0,$hi1,$hi0          # handle upmost overflow bit
        move    $tp,sp
        PTR_SUB $rp,$num                # restore rp
        not     $hi1,$hi0

        and     $ap,$hi0,sp
        and     $bp,$hi1,$rp
        or      $ap,$ap,$bp             # ap=borrow?tp:rp

.align  4
.Lcopy: ld      $aj,($ap)
        PTR_ADD $ap,8
        PTR_ADD $tp,8
        sd      zero,-8($tp)            # wipe the scratch word just passed
        sltu    AT,$tp,$tj
        sd      $aj,($rp)
        bnez    AT,.Lcopy
        PTR_ADD $rp,8                   # delay slot: advance rp

        ld      s0,0($fp)
        ld      s1,8($fp)
        ld      s2,16($fp)
        ld      s3,24($fp)
        ld      s4,32($fp)
        ld      s5,40($fp)
        ld      s6,48($fp)
        ld      s7,56($fp)
        li      v0,1                    # return 1
        jr      ra
        PTR_ADD sp,$fp,64               # delay slot: release scratch and save area
        .set    reorder
END(bn_mul_mont)
.rdata
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit the generated assembly on standard output. The close is checked so
# that a failed flush (full disk, broken pipe) aborts with a non-zero exit
# status instead of silently leaving truncated output behind.
print $code;
close STDOUT or die "error closing STDOUT: $!";