#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, RSA2048 is no faster and RSA4096
# runs 15-20% slower on Cortex-A57. The multiplication instruction
# issue rate is limited on that processor, which makes a dedicated
# squaring procedure a must. In fact, all contemporary AArch64
# processors seem to have a limited multiplication issue rate, i.e.
# they can't issue a multiplication every cycle, which explains the
# moderate improvement over compiler-generated code. Recall that the
# compiler is instructed to use umulh and therefore issues the same
# number of multiplication instructions; assembly's edge lies in
# minimizing the number of "collateral" instructions and in
# instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-60% depending on processor and key
# length. The overall improvement is now always positive relative to
# compiler-generated code. On Cortex-A57 the gain is still modest for
# the longest key lengths, while other processors exhibit e.g. 50-70%
# improvement for RSA4096 sign. RSA2048 sign is ~25% faster on
# Cortex-A57 and ~60-100% faster elsewhere.

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
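
# The script is driven through the shared perlasm pipeline: a typical
# invocation (flavour names come from the build system; "linux64" is
# just an example) would be
#
#   perl armv8-mont.pl linux64 armv8-mont.S
#
# with the generated source piped through arm-xlate.pl above.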

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
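
# x19-x28 are callee-saved under AAPCS64, hence the stp/ldp pairs that
# spill and restore them in the prologues and epilogues below.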

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);
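
# For reference, a minimal sketch of the operation implemented below:
# r = a*b*2^(-64*num) mod n, i.e. Montgomery multiplication, with
# *n0 = -n^(-1) mod 2^64. This sub is illustrative only and unused by
# the generator; it takes Math::BigInt values rather than limb arrays.
use Math::BigInt;
sub _bn_mul_mont_ref {
	my ($a,$b,$n,$n0,$num) = @_;
	my $w = Math::BigInt->new(2)->bpow(64);		# limb modulus, 2^64
	my $t = $a->copy()->bmul($b);
	for (1..$num) {					# one limb of reduction per round
		my $m = $t->copy()->bmod($w)->bmul($n0)->bmod($w);
		$t->badd($m->bmul($n))->bdiv($w);	# t+m*n is divisible by 2^64
	}
	$t->bsub($n) if ($t->bcmp($n) >= 0);		# final conditional subtraction
	return $t;
}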

$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the low half of the first addition is
	//	guaranteed to be zero, so its only computationally
	//	significant outcome is whether it carries. When does it
	//	carry? Following the operations, the condition turns out
	//	to be simply $lo0 being non-zero, so the carry can be
	//	recovered by subtracting 1 from $lo0: $lo0=0 borrows and
	//	leaves the carry clear, any other value sets it. That's
	//	what the next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if so, subtract the modulus. Since comparison
	// already implies subtraction, we subtract the modulus, check
	// whether it borrowed, and conditionally copy the original value.
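	// The copy below is constant-time: both candidate values are
	// always computed and csel picks one on the borrow ("lo")
	// condition, with no data-dependent branch.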
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);
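
# The squaring path exploits the identity
#   (sum a[i]*2^(64*i))^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j))
#                         + sum_i a[i]^2*2^(128*i),
# computing each cross product once, doubling the running sum, and
# adding the squares on the diagonal. A hedged, unused check of that
# identity (the Math::BigInt usage is illustrative only):
sub _square_identity_check {
	my @a = @_;				# little-endian 64-bit limbs
	my $cross = Math::BigInt->bzero();
	my $diag  = Math::BigInt->bzero();
	for my $i (0..$#a) {
		$diag->badd(Math::BigInt->new($a[$i])->bpow(2)->blsft(128*$i));
		for my $j ($i+1..$#a) {
			$cross->badd(Math::BigInt->new($a[$i])->bmul($a[$j])
					->blsft(64*($i+$j)));
		}
	}
	return $cross->blsft(1)->badd($diag);	# == (sum a[i]*2^(64*i))^2
}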

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)
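	// 28 distinct cross products in all for 8 limbs (8*7/2), walked
	// in the triangular order above; the diagonal terms a[i]*a[i]
	// are folded in during the shift-and-add pass below.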

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])	(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])	(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])	(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])	(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])	(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])	(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])	(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
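	// Each extr x,hi,lo,#63 below computes (hi<<1)|(lo>>63), i.e. it
	// doubles one word while pulling in the bit shifted out of the
	// word below, so the chain multiplies the accumulated
	// cross-product sum by 2 on the fly.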
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

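	// One word of Montgomery reduction per iteration:
	//	m = t[0]*n0 mod 2^64,  t = (t + m*n) >> 64,
	// where t+m*n is divisible by 2^64 by choice of
	// n0 = -n^(-1) mod 2^64; eight iterations retire 512 bits of t
	// against n[0..7].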
.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if so, subtract the modulus. Since comparison
	// already implies subtraction, we subtract the modulus, check
	// whether it borrowed, and conditionally copy the original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it differs in that it performs
# reduction 256 bits at a time.
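#
# Concretely, each iteration of the inner loops below interleaves a
# 4-limb multiply-accumulate with one word of Montgomery reduction
# against n[0..3] (or n[4..7]); four iterations therefore retire a
# 256-bit window, versus the 512-bit windows of the squaring path
# above.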

my ($a0,$a1,$a2,$a3,
 $t0,$t1,$t2,$t3,
 $m0,$m1,$m2,$m3,
 $acc0,$acc1,$acc2,$acc3,$acc4,
 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if so, subtract the modulus. Since comparison
	// already implies subtraction, we subtract the modulus, check
	// whether it borrowed, and conditionally copy the original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT;