// Origin: repository blob 74702db6aa9c12910def36097b06715804a162e9
// AArch64 (GNU as, preprocessed .S) Montgomery multiplication.
#if defined(__aarch64__)
.text
3
//----------------------------------------------------------------------
// bn_mul_mont -- word-serial Montgomery multiplication, generic path.
//
// Register roles on entry (as used by the code below):
//   x0 = rp   result vector (num words)
//   x1 = ap   first operand
//   x2 = bp   second operand
//   x3 = np   modulus
//   x4 = pointer to n0 (a single word is loaded from it)
//   x5 = num  length in 64-bit words
// Presumably the C prototype is
//   int bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the header.
//
// Dispatch: num%8==0 goes to __bn_sqr8x_mont (which itself falls through
// to __bn_mul4x_mont when ap!=bp), num%4==0 goes to __bn_mul4x_mont,
// everything else is handled here.
//
// Returns x0 = 1.  Saves/restores x19-x24 and x29; x30 is not modified on
// this path.  A scratch vector tp[] is carved from the stack below the
// frame and is overwritten with zeros before returning.
//----------------------------------------------------------------------
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// from here on x5 = num*8 (bytes)
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

	// First pass: tp = (ap*bp[0] + np*m1) >> 64, one word per iteration.
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Remaining passes: tp = (tp + ap*bp[i] + np*m1) >> 64.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	// The carry produced by the subs above is consumed by the first adc
	// in .Linner / .Linner_skip; nothing in between writes the flags.
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Branch-free selection: the "lo" condition (carry clear, i.e. the
	// subtraction borrowed) keeps tp[], otherwise the difference already
	// stored in rp[] is kept.  tp[] is zeroed as it is consumed.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release tp[] and return to the frame
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64		// x30 was never modified on this path
	ret
.size	bn_mul_mont,.-bn_mul_mont
//----------------------------------------------------------------------
// __bn_sqr8x_mont -- Montgomery squaring, 8 words per step.
//
// Reached from bn_mul_mont when num%8==0, with the same register
// arguments (x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num).  When ap!=bp
// it is not a squaring and control is passed on to __bn_mul4x_mont.
//
// Phases:
//   1. .Lsqr8x_outer_loop / .Lsqr8x_mul : all cross products a[i]*a[j], i<j
//   2. .Lsqr4x_shift_n_add              : double them and add a[i]*a[i]
//   3. .Lsqr8x_reduction / .Lsqr8x_tail : Montgomery reduction, 512 bits
//                                         per iteration
//   4. .Lsqr8x_sub / .Lsqr4x_cond_copy  : final conditional subtraction
//
// Frame (128 bytes at x29): saved x29/x30, x19-x28, rp at #96, np at
// #104, n0 at #112.  x30 is reused as the "top-most carry" during
// reduction and is reloaded from the frame before returning.
// A 2*num-word scratch vector t[] is carved from the stack and is
// overwritten with zeros before returning.  Returns x0 = 1.
//----------------------------------------------------------------------
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// room for 2*num words
	lsl	x5,x5,#3		// from here on x5 = num*8 (bytes)
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

	// Clear the scratch vector.  The loop is entered in the middle, so
	// the lowest 8 words are not cleared here.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = &a[num]
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6			// x4 = first multiplier word for .Lsqr8x_mul
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]		// next multiplier word, negative index from x0
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63	// extr ..,#63 forms the 1-bit left shift across words
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5		// x3 = &n[num]
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	// Same shortcut as explained in bn_mul_mont: the low word of the
	// sum is known to be zero, only its carry matters.
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]		// next saved t[0]*n0, negative index from x0
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Branch-free selection on "lo" (the subtraction borrowed): keep the
	// unreduced value from t[], otherwise keep the difference already in
	// rp[].  Both halves of t[] are zeroed along the way.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

	// num==8 special case: the whole result is still in registers.
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19..x26,x28 hold result, x6..x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6..x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release t[] and return to the frame
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128		// x30 was already reloaded above
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
//----------------------------------------------------------------------
// __bn_mul4x_mont -- Montgomery multiplication, 4 words per step.
//
// Reached from bn_mul_mont (num%4==0) or from __bn_sqr8x_mont (ap!=bp),
// with the same register arguments (x0=rp, x1=ap, x2=bp, x3=np,
// x4=&n0, x5=num).
//
// Multiplication and reduction are interleaved: the "*_reduction" loops
// handle a[0..3]/n[0..3] and put aside the four t[0]*n0 factors at the
// bottom of the scratch area; the "*_tail" loops apply the same four
// b[i] words and saved factors to the remaining words of a[] and n[].
// x28 cycles through 8,16,24,0 (add #8 / and #31) and serves both as the
// 4-iteration loop counter and as byte offset into b[] and the saved
// factors.
//
// Frame (128 bytes at x29): saved x29/x30, x19-x28, rp at #96,
// &b[num] at #104.  x30 is reused as a carry word and is reloaded from
// the frame before returning.  Scratch is num+4 words carved from the
// stack; it is overwritten with zeros before returning (see the review
// note in the final copy).  Returns x0 = 1.
//----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3		// from here on x5 = num*8 (bytes)
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	// Same shortcut as explained in bn_mul_mont: the low word of the
	// sum is known to be zero, only its carry matters.
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 = top-most carry word
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Branch-free selection on "lo" (the subtraction borrowed): keep the
	// unreduced value from the scratch area, otherwise keep the
	// difference already in rp[].  Scratch is zeroed along the way.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	// NOTE(review): the stores at #8*3 and #8*4 below overlap the
	// preceding ones (words 3..4 and 4..5); confirm that this covers
	// the whole num+4-word scratch area as intended.
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

	// num==4 special case: the whole result is still in registers.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19..x22,x0 hold result, x14..x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6..x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release scratch and return to the frame
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128		// x30 was already reloaded above
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// Identification string, NUL-terminated ASCII:
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif