blob: 3dca443976a0a8f524ff980b772b68bbda518c28 [file] [log] [blame]
Robert Sloanc9abfe42018-11-26 12:19:07 -08001// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
Robert Sloan726e9d12018-09-11 11:45:04 -07004#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if !defined(OPENSSL_NO_ASM)
Adam Langleyfad63272015-11-12 12:15:39 -080011#if defined(__aarch64__)
Robert Sloan726e9d12018-09-11 11:45:04 -070012#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
Adam Langleyfad63272015-11-12 12:15:39 -080015.text
16
// ---------------------------------------------------------------------------
// bn_mul_mont — Montgomery multiplication entry point.
//
// NOTE(review): this is GENERATED output (see header: produced by a Perl
// script in the BoringSSL tree; do not edit by hand).  The stray blame/line
// number fragments fused into some lines below come from the code-viewer
// dump this copy was taken from, not from the assembler source — the
// pristine generated file should be restored from the generator.
//
// Register contract (as used below):
//   x0 = rp   result pointer (copied to x1 for the final store loop)
//   x1 = ap   multiplicand limbs
//   x2 = bp   multiplier limbs
//   x3 = np   modulus limbs
//   x4 = &n0  pointer to the Montgomery constant (dereferenced below)
//   x5 = num  limb count
// Returns 1 in x0.  Callee-saved x19-x24 are spilled in a 64-byte frame.
//
// Dispatch: num % 8 == 0 goes to the 8-limb squaring/mul path,
// num % 4 == 0 to the 4-limb path; otherwise the generic one-limb-at-a-
// time loop below runs.  Temporary tp[num] lives on the stack (alloca).
// ---------------------------------------------------------------------------
17.globl bn_mul_mont
David Benjamin4969cc92016-04-22 15:02:23 -040018.hidden bn_mul_mont
Adam Langleyfad63272015-11-12 12:15:39 -080019.type bn_mul_mont,%function
20.align 5
21bn_mul_mont:
22 tst x5,#7
23 b.eq __bn_sqr8x_mont
24 tst x5,#3
25 b.eq __bn_mul4x_mont
26.Lmul_mont:
27 stp x29,x30,[sp,#-64]!
28 add x29,sp,#0
29 stp x19,x20,[sp,#16]
30 stp x21,x22,[sp,#32]
31 stp x23,x24,[sp,#48]
32
33 ldr x9,[x2],#8 // bp[0]
34 sub x22,sp,x5,lsl#3
35 ldp x7,x8,[x1],#16 // ap[0..1]
36 lsl x5,x5,#3
37 ldr x4,[x4] // *n0
38 and x22,x22,#-16 // ABI says so
39 ldp x13,x14,[x3],#16 // np[0..1]
40
41 mul x6,x7,x9 // ap[0]*bp[0]
42 sub x21,x5,#16 // j=num-2
43 umulh x7,x7,x9
44 mul x10,x8,x9 // ap[1]*bp[0]
45 umulh x11,x8,x9
46
47 mul x15,x6,x4 // "tp[0]"*n0
48 mov sp,x22 // alloca
49
50 // (*) mul x12,x13,x15 // np[0]*m1
51 umulh x13,x13,x15
52 mul x16,x14,x15 // np[1]*m1
53 // (*) adds x12,x12,x6 // discarded
54 // (*) As for removal of first multiplication and addition
55 // instructions. The outcome of first addition is
56 // guaranteed to be zero, which leaves two computationally
57 // significant outcomes: it either carries or not. Then
58 // question is when does it carry? Is there alternative
59 // way to deduce it? If you follow operations, you can
60 // observe that condition for carry is quite simple:
61 // x6 being non-zero. So that carry can be calculated
62 // by adding -1 to x6. That's what next instruction does.
63 subs xzr,x6,#1 // (*)
64 umulh x17,x14,x15
65 adc x13,x13,xzr
66 cbz x21,.L1st_skip
67
// First outer iteration: tp[] = ap[]*bp[0] interleaved with the
// reduction pass by m1 = tp[0]*n0 (see the (*) commentary above).
68.L1st:
69 ldr x8,[x1],#8
70 adds x6,x10,x7
71 sub x21,x21,#8 // j--
72 adc x7,x11,xzr
73
74 ldr x14,[x3],#8
75 adds x12,x16,x13
76 mul x10,x8,x9 // ap[j]*bp[0]
77 adc x13,x17,xzr
78 umulh x11,x8,x9
79
80 adds x12,x12,x6
81 mul x16,x14,x15 // np[j]*m1
82 adc x13,x13,xzr
83 umulh x17,x14,x15
84 str x12,[x22],#8 // tp[j-1]
85 cbnz x21,.L1st
86
87.L1st_skip:
88 adds x6,x10,x7
89 sub x1,x1,x5 // rewind x1
90 adc x7,x11,xzr
91
92 adds x12,x16,x13
93 sub x3,x3,x5 // rewind x3
94 adc x13,x17,xzr
95
96 adds x12,x12,x6
97 sub x20,x5,#8 // i=num-1
98 adcs x13,x13,x7
99
100 adc x19,xzr,xzr // upmost overflow bit
101 stp x12,x13,[x22]
102
// Remaining outer iterations: tp += ap[]*bp[i], again interleaved with
// the Montgomery reduction step; x19 accumulates the top carry.
103.Louter:
104 ldr x9,[x2],#8 // bp[i]
105 ldp x7,x8,[x1],#16
106 ldr x23,[sp] // tp[0]
107 add x22,sp,#8
108
109 mul x6,x7,x9 // ap[0]*bp[i]
110 sub x21,x5,#16 // j=num-2
111 umulh x7,x7,x9
112 ldp x13,x14,[x3],#16
113 mul x10,x8,x9 // ap[1]*bp[i]
114 adds x6,x6,x23
115 umulh x11,x8,x9
116 adc x7,x7,xzr
117
118 mul x15,x6,x4
119 sub x20,x20,#8 // i--
120
121 // (*) mul x12,x13,x15 // np[0]*m1
122 umulh x13,x13,x15
123 mul x16,x14,x15 // np[1]*m1
124 // (*) adds x12,x12,x6
125 subs xzr,x6,#1 // (*)
126 umulh x17,x14,x15
127 cbz x21,.Linner_skip
128
129.Linner:
130 ldr x8,[x1],#8
131 adc x13,x13,xzr
132 ldr x23,[x22],#8 // tp[j]
133 adds x6,x10,x7
134 sub x21,x21,#8 // j--
135 adc x7,x11,xzr
136
137 adds x12,x16,x13
138 ldr x14,[x3],#8
139 adc x13,x17,xzr
140
141 mul x10,x8,x9 // ap[j]*bp[i]
142 adds x6,x6,x23
143 umulh x11,x8,x9
144 adc x7,x7,xzr
145
146 mul x16,x14,x15 // np[j]*m1
147 adds x12,x12,x6
148 umulh x17,x14,x15
149 str x12,[x22,#-16] // tp[j-1]
150 cbnz x21,.Linner
151
152.Linner_skip:
153 ldr x23,[x22],#8 // tp[j]
154 adc x13,x13,xzr
155 adds x6,x10,x7
156 sub x1,x1,x5 // rewind x1
157 adc x7,x11,xzr
158
159 adds x12,x16,x13
160 sub x3,x3,x5 // rewind x3
161 adcs x13,x17,x19
162 adc x19,xzr,xzr
163
164 adds x6,x6,x23
165 adc x7,x7,xzr
166
167 adds x12,x12,x6
168 adcs x13,x13,x7
169 adc x19,x19,xzr // upmost overflow bit
170 stp x12,x13,[x22,#-16]
171
172 cbnz x20,.Louter
173
174 // Final step. We see if result is larger than modulus, and
175 // if it is, subtract the modulus. But comparison implies
176 // subtraction. So we subtract modulus, see if it borrowed,
177 // and conditionally copy original value.
178 ldr x23,[sp] // tp[0]
179 add x22,sp,#8
180 ldr x14,[x3],#8 // np[0]
181 subs x21,x5,#8 // j=num-1 and clear borrow
182 mov x1,x0
183.Lsub:
184 sbcs x8,x23,x14 // tp[j]-np[j]
185 ldr x23,[x22],#8
186 sub x21,x21,#8 // j--
187 ldr x14,[x3],#8
188 str x8,[x1],#8 // rp[j]=tp[j]-np[j]
189 cbnz x21,.Lsub
190
191 sbcs x8,x23,x14
192 sbcs x19,x19,xzr // did it borrow?
193 str x8,[x1],#8 // rp[num-1]
194
// Constant-time select between tp and tp-np based on the borrow flag,
// wiping the stack temporary as it goes.
195 ldr x23,[sp] // tp[0]
196 add x22,sp,#8
197 ldr x8,[x0],#8 // rp[0]
198 sub x5,x5,#8 // num--
199 nop
200.Lcond_copy:
201 sub x5,x5,#8 // num--
202 csel x14,x23,x8,lo // did it borrow?
203 ldr x23,[x22],#8
204 ldr x8,[x0],#8
205 str xzr,[x22,#-16] // wipe tp
206 str x14,[x0,#-16]
207 cbnz x5,.Lcond_copy
208
209 csel x14,x23,x8,lo
210 str xzr,[x22,#-8] // wipe tp
211 str x14,[x0,#-8]
212
// Epilogue: restore callee-saved x19-x24, release the alloca'd
// temporary by restoring sp from the frame pointer, return 1.
213 ldp x19,x20,[x29,#16]
214 mov sp,x29
215 ldp x21,x22,[x29,#32]
216 mov x0,#1
217 ldp x23,x24,[x29,#48]
218 ldr x29,[sp],#64
219 ret
220.size bn_mul_mont,.-bn_mul_mont
// ---------------------------------------------------------------------------
// __bn_sqr8x_mont — Montgomery squaring path for num % 8 == 0.
//
// Taken from bn_mul_mont when num is a multiple of 8; if ap != bp this is
// not a squaring and control falls through to __bn_mul4x_mont immediately.
// Strategy (per the inline commentary below): compute each cross product
// a[i]*a[j] (i != j) once, double the accumulated result by shifting,
// add the a[i]^2 diagonal terms, then reduce 512 bits per iteration
// using n0.
//
// Frame: 128 bytes; callee-saved x19-x28 spilled; rp and np are offloaded
// at [x29,#96], n0 at [x29,#112]; a 2*num*8-byte temporary is alloca'd
// below the frame.  x30 carries the running top-most carry between
// reduction windows.  Returns 1 in x0.
//
// NOTE(review): generated output — do not hand-edit; the fused blame/line
// fragments are artifacts of the code-viewer dump this copy came from.
// ---------------------------------------------------------------------------
221.type __bn_sqr8x_mont,%function
222.align 5
223__bn_sqr8x_mont:
224 cmp x1,x2
225 b.ne __bn_mul4x_mont
226.Lsqr8x_mont:
227 stp x29,x30,[sp,#-128]!
228 add x29,sp,#0
229 stp x19,x20,[sp,#16]
230 stp x21,x22,[sp,#32]
231 stp x23,x24,[sp,#48]
232 stp x25,x26,[sp,#64]
233 stp x27,x28,[sp,#80]
234 stp x0,x3,[sp,#96] // offload rp and np
235
236 ldp x6,x7,[x1,#8*0]
237 ldp x8,x9,[x1,#8*2]
238 ldp x10,x11,[x1,#8*4]
239 ldp x12,x13,[x1,#8*6]
240
241 sub x2,sp,x5,lsl#4
242 lsl x5,x5,#3
243 ldr x4,[x4] // *n0
244 mov sp,x2 // alloca
245 sub x27,x5,#8*8
246 b .Lsqr8x_zero_start
247
// Zero the 2*num-limb temporary, 16 limbs per pass.
248.Lsqr8x_zero:
249 sub x27,x27,#8*8
250 stp xzr,xzr,[x2,#8*0]
251 stp xzr,xzr,[x2,#8*2]
252 stp xzr,xzr,[x2,#8*4]
253 stp xzr,xzr,[x2,#8*6]
254.Lsqr8x_zero_start:
255 stp xzr,xzr,[x2,#8*8]
256 stp xzr,xzr,[x2,#8*10]
257 stp xzr,xzr,[x2,#8*12]
258 stp xzr,xzr,[x2,#8*14]
259 add x2,x2,#8*16
260 cbnz x27,.Lsqr8x_zero
261
262 add x3,x1,x5
263 add x1,x1,#8*8
264 mov x19,xzr
265 mov x20,xzr
266 mov x21,xzr
267 mov x22,xzr
268 mov x23,xzr
269 mov x24,xzr
270 mov x25,xzr
271 mov x26,xzr
272 mov x2,sp
273 str x4,[x29,#112] // offload n0
274
275 // Multiply everything but a[i]*a[i]
276.align 4
277.Lsqr8x_outer_loop:
278 // a[1]a[0] (i)
279 // a[2]a[0]
280 // a[3]a[0]
281 // a[4]a[0]
282 // a[5]a[0]
283 // a[6]a[0]
284 // a[7]a[0]
285 // a[2]a[1] (ii)
286 // a[3]a[1]
287 // a[4]a[1]
288 // a[5]a[1]
289 // a[6]a[1]
290 // a[7]a[1]
291 // a[3]a[2] (iii)
292 // a[4]a[2]
293 // a[5]a[2]
294 // a[6]a[2]
295 // a[7]a[2]
296 // a[4]a[3] (iv)
297 // a[5]a[3]
298 // a[6]a[3]
299 // a[7]a[3]
300 // a[5]a[4] (v)
301 // a[6]a[4]
302 // a[7]a[4]
303 // a[6]a[5] (vi)
304 // a[7]a[5]
305 // a[7]a[6] (vii)
306
307 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
308 mul x15,x8,x6
309 mul x16,x9,x6
310 mul x17,x10,x6
311 adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
312 mul x14,x11,x6
313 adcs x21,x21,x15
314 mul x15,x12,x6
315 adcs x22,x22,x16
316 mul x16,x13,x6
317 adcs x23,x23,x17
318 umulh x17,x7,x6 // hi(a[1..7]*a[0])
319 adcs x24,x24,x14
320 umulh x14,x8,x6
321 adcs x25,x25,x15
322 umulh x15,x9,x6
323 adcs x26,x26,x16
324 umulh x16,x10,x6
325 stp x19,x20,[x2],#8*2 // t[0..1]
326 adc x19,xzr,xzr // t[8]
327 adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
328 umulh x17,x11,x6
329 adcs x22,x22,x14
330 umulh x14,x12,x6
331 adcs x23,x23,x15
332 umulh x15,x13,x6
333 adcs x24,x24,x16
334 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
335 adcs x25,x25,x17
336 mul x17,x9,x7
337 adcs x26,x26,x14
338 mul x14,x10,x7
339 adc x19,x19,x15
340
341 mul x15,x11,x7
342 adds x22,x22,x16
343 mul x16,x12,x7
344 adcs x23,x23,x17
345 mul x17,x13,x7
346 adcs x24,x24,x14
347 umulh x14,x8,x7 // hi(a[2..7]*a[1])
348 adcs x25,x25,x15
349 umulh x15,x9,x7
350 adcs x26,x26,x16
351 umulh x16,x10,x7
352 adcs x19,x19,x17
353 umulh x17,x11,x7
354 stp x21,x22,[x2],#8*2 // t[2..3]
355 adc x20,xzr,xzr // t[9]
356 adds x23,x23,x14
357 umulh x14,x12,x7
358 adcs x24,x24,x15
359 umulh x15,x13,x7
360 adcs x25,x25,x16
361 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
362 adcs x26,x26,x17
363 mul x17,x10,x8
364 adcs x19,x19,x14
365 mul x14,x11,x8
366 adc x20,x20,x15
367
368 mul x15,x12,x8
369 adds x24,x24,x16
370 mul x16,x13,x8
371 adcs x25,x25,x17
372 umulh x17,x9,x8 // hi(a[3..7]*a[2])
373 adcs x26,x26,x14
374 umulh x14,x10,x8
375 adcs x19,x19,x15
376 umulh x15,x11,x8
377 adcs x20,x20,x16
378 umulh x16,x12,x8
379 stp x23,x24,[x2],#8*2 // t[4..5]
380 adc x21,xzr,xzr // t[10]
381 adds x25,x25,x17
382 umulh x17,x13,x8
383 adcs x26,x26,x14
384 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
385 adcs x19,x19,x15
386 mul x15,x11,x9
387 adcs x20,x20,x16
388 mul x16,x12,x9
389 adc x21,x21,x17
390
391 mul x17,x13,x9
392 adds x26,x26,x14
393 umulh x14,x10,x9 // hi(a[4..7]*a[3])
394 adcs x19,x19,x15
395 umulh x15,x11,x9
396 adcs x20,x20,x16
397 umulh x16,x12,x9
398 adcs x21,x21,x17
399 umulh x17,x13,x9
400 stp x25,x26,[x2],#8*2 // t[6..7]
401 adc x22,xzr,xzr // t[11]
402 adds x19,x19,x14
403 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
404 adcs x20,x20,x15
405 mul x15,x12,x10
406 adcs x21,x21,x16
407 mul x16,x13,x10
408 adc x22,x22,x17
409
410 umulh x17,x11,x10 // hi(a[5..7]*a[4])
411 adds x20,x20,x14
412 umulh x14,x12,x10
413 adcs x21,x21,x15
414 umulh x15,x13,x10
415 adcs x22,x22,x16
416 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
417 adc x23,xzr,xzr // t[12]
418 adds x21,x21,x17
419 mul x17,x13,x11
420 adcs x22,x22,x14
421 umulh x14,x12,x11 // hi(a[6..7]*a[5])
422 adc x23,x23,x15
423
424 umulh x15,x13,x11
425 adds x22,x22,x16
426 mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
427 adcs x23,x23,x17
428 umulh x17,x13,x12 // hi(a[7]*a[6])
429 adc x24,xzr,xzr // t[13]
430 adds x23,x23,x14
431 sub x27,x3,x1 // done yet?
432 adc x24,x24,x15
433
434 adds x24,x24,x16
435 sub x14,x3,x5 // rewinded ap
436 adc x25,xzr,xzr // t[14]
437 add x25,x25,x17
438
439 cbz x27,.Lsqr8x_outer_break
440
441 mov x4,x6
442 ldp x6,x7,[x2,#8*0]
443 ldp x8,x9,[x2,#8*2]
444 ldp x10,x11,[x2,#8*4]
445 ldp x12,x13,[x2,#8*6]
446 adds x19,x19,x6
447 adcs x20,x20,x7
448 ldp x6,x7,[x1,#8*0]
449 adcs x21,x21,x8
450 adcs x22,x22,x9
451 ldp x8,x9,[x1,#8*2]
452 adcs x23,x23,x10
453 adcs x24,x24,x11
454 ldp x10,x11,[x1,#8*4]
455 adcs x25,x25,x12
456 mov x0,x1
457 adcs x26,xzr,x13
458 ldp x12,x13,[x1,#8*6]
459 add x1,x1,#8*8
460 //adc x28,xzr,xzr // moved below
461 mov x27,#-8*8
462
463 // a[8]a[0]
464 // a[9]a[0]
465 // a[a]a[0]
466 // a[b]a[0]
467 // a[c]a[0]
468 // a[d]a[0]
469 // a[e]a[0]
470 // a[f]a[0]
471 // a[8]a[1]
472 // a[f]a[1]........................
473 // a[8]a[2]
474 // a[f]a[2]........................
475 // a[8]a[3]
476 // a[f]a[3]........................
477 // a[8]a[4]
478 // a[f]a[4]........................
479 // a[8]a[5]
480 // a[f]a[5]........................
481 // a[8]a[6]
482 // a[f]a[6]........................
483 // a[8]a[7]
484 // a[f]a[7]........................
485.Lsqr8x_mul:
486 mul x14,x6,x4
487 adc x28,xzr,xzr // carry bit, modulo-scheduled
488 mul x15,x7,x4
489 add x27,x27,#8
490 mul x16,x8,x4
491 mul x17,x9,x4
492 adds x19,x19,x14
493 mul x14,x10,x4
494 adcs x20,x20,x15
495 mul x15,x11,x4
496 adcs x21,x21,x16
497 mul x16,x12,x4
498 adcs x22,x22,x17
499 mul x17,x13,x4
500 adcs x23,x23,x14
501 umulh x14,x6,x4
502 adcs x24,x24,x15
503 umulh x15,x7,x4
504 adcs x25,x25,x16
505 umulh x16,x8,x4
506 adcs x26,x26,x17
507 umulh x17,x9,x4
508 adc x28,x28,xzr
509 str x19,[x2],#8
510 adds x19,x20,x14
511 umulh x14,x10,x4
512 adcs x20,x21,x15
513 umulh x15,x11,x4
514 adcs x21,x22,x16
515 umulh x16,x12,x4
516 adcs x22,x23,x17
517 umulh x17,x13,x4
518 ldr x4,[x0,x27]
519 adcs x23,x24,x14
520 adcs x24,x25,x15
521 adcs x25,x26,x16
522 adcs x26,x28,x17
523 //adc x28,xzr,xzr // moved above
524 cbnz x27,.Lsqr8x_mul
525 // note that carry flag is guaranteed
526 // to be zero at this point
527 cmp x1,x3 // done yet?
528 b.eq .Lsqr8x_break
529
530 ldp x6,x7,[x2,#8*0]
531 ldp x8,x9,[x2,#8*2]
532 ldp x10,x11,[x2,#8*4]
533 ldp x12,x13,[x2,#8*6]
534 adds x19,x19,x6
535 ldr x4,[x0,#-8*8]
536 adcs x20,x20,x7
537 ldp x6,x7,[x1,#8*0]
538 adcs x21,x21,x8
539 adcs x22,x22,x9
540 ldp x8,x9,[x1,#8*2]
541 adcs x23,x23,x10
542 adcs x24,x24,x11
543 ldp x10,x11,[x1,#8*4]
544 adcs x25,x25,x12
545 mov x27,#-8*8
546 adcs x26,x26,x13
547 ldp x12,x13,[x1,#8*6]
548 add x1,x1,#8*8
549 //adc x28,xzr,xzr // moved above
550 b .Lsqr8x_mul
551
552.align 4
553.Lsqr8x_break:
554 ldp x6,x7,[x0,#8*0]
555 add x1,x0,#8*8
556 ldp x8,x9,[x0,#8*2]
557 sub x14,x3,x1 // is it last iteration?
558 ldp x10,x11,[x0,#8*4]
559 sub x15,x2,x14
560 ldp x12,x13,[x0,#8*6]
561 cbz x14,.Lsqr8x_outer_loop
562
563 stp x19,x20,[x2,#8*0]
564 ldp x19,x20,[x15,#8*0]
565 stp x21,x22,[x2,#8*2]
566 ldp x21,x22,[x15,#8*2]
567 stp x23,x24,[x2,#8*4]
568 ldp x23,x24,[x15,#8*4]
569 stp x25,x26,[x2,#8*6]
570 mov x2,x15
571 ldp x25,x26,[x15,#8*6]
572 b .Lsqr8x_outer_loop
573
574.align 4
575.Lsqr8x_outer_break:
576 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
577 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
578 ldp x15,x16,[sp,#8*1]
579 ldp x11,x13,[x14,#8*2]
580 add x1,x14,#8*4
581 ldp x17,x14,[sp,#8*3]
582
583 stp x19,x20,[x2,#8*0]
584 mul x19,x7,x7
585 stp x21,x22,[x2,#8*2]
586 umulh x7,x7,x7
587 stp x23,x24,[x2,#8*4]
588 mul x8,x9,x9
589 stp x25,x26,[x2,#8*6]
590 mov x2,sp
591 umulh x9,x9,x9
592 adds x20,x7,x15,lsl#1
593 extr x15,x16,x15,#63
594 sub x27,x5,#8*4
595
// Double the cross products (shift-by-1 via extr) and fold in the
// a[i]^2 diagonal terms, four limbs per pass.
596.Lsqr4x_shift_n_add:
597 adcs x21,x8,x15
598 extr x16,x17,x16,#63
599 sub x27,x27,#8*4
600 adcs x22,x9,x16
601 ldp x15,x16,[x2,#8*5]
602 mul x10,x11,x11
603 ldp x7,x9,[x1],#8*2
604 umulh x11,x11,x11
605 mul x12,x13,x13
606 umulh x13,x13,x13
607 extr x17,x14,x17,#63
608 stp x19,x20,[x2,#8*0]
609 adcs x23,x10,x17
610 extr x14,x15,x14,#63
611 stp x21,x22,[x2,#8*2]
612 adcs x24,x11,x14
613 ldp x17,x14,[x2,#8*7]
614 extr x15,x16,x15,#63
615 adcs x25,x12,x15
616 extr x16,x17,x16,#63
617 adcs x26,x13,x16
618 ldp x15,x16,[x2,#8*9]
619 mul x6,x7,x7
620 ldp x11,x13,[x1],#8*2
621 umulh x7,x7,x7
622 mul x8,x9,x9
623 umulh x9,x9,x9
624 stp x23,x24,[x2,#8*4]
625 extr x17,x14,x17,#63
626 stp x25,x26,[x2,#8*6]
627 add x2,x2,#8*8
628 adcs x19,x6,x17
629 extr x14,x15,x14,#63
630 adcs x20,x7,x14
631 ldp x17,x14,[x2,#8*3]
632 extr x15,x16,x15,#63
633 cbnz x27,.Lsqr4x_shift_n_add
634 ldp x1,x4,[x29,#104] // pull np and n0
635
636 adcs x21,x8,x15
637 extr x16,x17,x16,#63
638 adcs x22,x9,x16
639 ldp x15,x16,[x2,#8*5]
640 mul x10,x11,x11
641 umulh x11,x11,x11
642 stp x19,x20,[x2,#8*0]
643 mul x12,x13,x13
644 umulh x13,x13,x13
645 stp x21,x22,[x2,#8*2]
646 extr x17,x14,x17,#63
647 adcs x23,x10,x17
648 extr x14,x15,x14,#63
649 ldp x19,x20,[sp,#8*0]
650 adcs x24,x11,x14
651 extr x15,x16,x15,#63
652 ldp x6,x7,[x1,#8*0]
653 adcs x25,x12,x15
654 extr x16,xzr,x16,#63
655 ldp x8,x9,[x1,#8*2]
656 adc x26,x13,x16
657 ldp x10,x11,[x1,#8*4]
658
659 // Reduce by 512 bits per iteration
660 mul x28,x4,x19 // t[0]*n0
661 ldp x12,x13,[x1,#8*6]
662 add x3,x1,x5
663 ldp x21,x22,[sp,#8*2]
664 stp x23,x24,[x2,#8*4]
665 ldp x23,x24,[sp,#8*4]
666 stp x25,x26,[x2,#8*6]
667 ldp x25,x26,[sp,#8*6]
668 add x1,x1,#8*8
669 mov x30,xzr // initial top-most carry
670 mov x2,sp
671 mov x27,#8
672
673.Lsqr8x_reduction:
674 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
675 mul x15,x7,x28
676 sub x27,x27,#1
677 mul x16,x8,x28
678 str x28,[x2],#8 // put aside t[0]*n0 for tail processing
679 mul x17,x9,x28
680 // (*) adds xzr,x19,x14
681 subs xzr,x19,#1 // (*)
682 mul x14,x10,x28
683 adcs x19,x20,x15
684 mul x15,x11,x28
685 adcs x20,x21,x16
686 mul x16,x12,x28
687 adcs x21,x22,x17
688 mul x17,x13,x28
689 adcs x22,x23,x14
690 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
691 adcs x23,x24,x15
692 umulh x15,x7,x28
693 adcs x24,x25,x16
694 umulh x16,x8,x28
695 adcs x25,x26,x17
696 umulh x17,x9,x28
697 adc x26,xzr,xzr
698 adds x19,x19,x14
699 umulh x14,x10,x28
700 adcs x20,x20,x15
701 umulh x15,x11,x28
702 adcs x21,x21,x16
703 umulh x16,x12,x28
704 adcs x22,x22,x17
705 umulh x17,x13,x28
706 mul x28,x4,x19 // next t[0]*n0
707 adcs x23,x23,x14
708 adcs x24,x24,x15
709 adcs x25,x25,x16
710 adc x26,x26,x17
711 cbnz x27,.Lsqr8x_reduction
712
713 ldp x14,x15,[x2,#8*0]
714 ldp x16,x17,[x2,#8*2]
715 mov x0,x2
716 sub x27,x3,x1 // done yet?
717 adds x19,x19,x14
718 adcs x20,x20,x15
719 ldp x14,x15,[x2,#8*4]
720 adcs x21,x21,x16
721 adcs x22,x22,x17
722 ldp x16,x17,[x2,#8*6]
723 adcs x23,x23,x14
724 adcs x24,x24,x15
725 adcs x25,x25,x16
726 adcs x26,x26,x17
727 //adc x28,xzr,xzr // moved below
728 cbz x27,.Lsqr8x8_post_condition
729
730 ldr x4,[x2,#-8*8]
731 ldp x6,x7,[x1,#8*0]
732 ldp x8,x9,[x1,#8*2]
733 ldp x10,x11,[x1,#8*4]
734 mov x27,#-8*8
735 ldp x12,x13,[x1,#8*6]
736 add x1,x1,#8*8
737
738.Lsqr8x_tail:
739 mul x14,x6,x4
740 adc x28,xzr,xzr // carry bit, modulo-scheduled
741 mul x15,x7,x4
742 add x27,x27,#8
743 mul x16,x8,x4
744 mul x17,x9,x4
745 adds x19,x19,x14
746 mul x14,x10,x4
747 adcs x20,x20,x15
748 mul x15,x11,x4
749 adcs x21,x21,x16
750 mul x16,x12,x4
751 adcs x22,x22,x17
752 mul x17,x13,x4
753 adcs x23,x23,x14
754 umulh x14,x6,x4
755 adcs x24,x24,x15
756 umulh x15,x7,x4
757 adcs x25,x25,x16
758 umulh x16,x8,x4
759 adcs x26,x26,x17
760 umulh x17,x9,x4
761 adc x28,x28,xzr
762 str x19,[x2],#8
763 adds x19,x20,x14
764 umulh x14,x10,x4
765 adcs x20,x21,x15
766 umulh x15,x11,x4
767 adcs x21,x22,x16
768 umulh x16,x12,x4
769 adcs x22,x23,x17
770 umulh x17,x13,x4
771 ldr x4,[x0,x27]
772 adcs x23,x24,x14
773 adcs x24,x25,x15
774 adcs x25,x26,x16
775 adcs x26,x28,x17
776 //adc x28,xzr,xzr // moved above
777 cbnz x27,.Lsqr8x_tail
778 // note that carry flag is guaranteed
779 // to be zero at this point
780 ldp x6,x7,[x2,#8*0]
781 sub x27,x3,x1 // done yet?
782 sub x16,x3,x5 // rewinded np
783 ldp x8,x9,[x2,#8*2]
784 ldp x10,x11,[x2,#8*4]
785 ldp x12,x13,[x2,#8*6]
786 cbz x27,.Lsqr8x_tail_break
787
788 ldr x4,[x0,#-8*8]
789 adds x19,x19,x6
790 adcs x20,x20,x7
791 ldp x6,x7,[x1,#8*0]
792 adcs x21,x21,x8
793 adcs x22,x22,x9
794 ldp x8,x9,[x1,#8*2]
795 adcs x23,x23,x10
796 adcs x24,x24,x11
797 ldp x10,x11,[x1,#8*4]
798 adcs x25,x25,x12
799 mov x27,#-8*8
800 adcs x26,x26,x13
801 ldp x12,x13,[x1,#8*6]
802 add x1,x1,#8*8
803 //adc x28,xzr,xzr // moved above
804 b .Lsqr8x_tail
805
806.align 4
807.Lsqr8x_tail_break:
808 ldr x4,[x29,#112] // pull n0
809 add x27,x2,#8*8 // end of current t[num] window
810
811 subs xzr,x30,#1 // "move" top-most carry to carry bit
812 adcs x14,x19,x6
813 adcs x15,x20,x7
814 ldp x19,x20,[x0,#8*0]
815 adcs x21,x21,x8
816 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
817 adcs x22,x22,x9
818 ldp x8,x9,[x16,#8*2]
819 adcs x23,x23,x10
820 adcs x24,x24,x11
821 ldp x10,x11,[x16,#8*4]
822 adcs x25,x25,x12
823 adcs x26,x26,x13
824 ldp x12,x13,[x16,#8*6]
825 add x1,x16,#8*8
826 adc x30,xzr,xzr // top-most carry
827 mul x28,x4,x19
828 stp x14,x15,[x2,#8*0]
829 stp x21,x22,[x2,#8*2]
830 ldp x21,x22,[x0,#8*2]
831 stp x23,x24,[x2,#8*4]
832 ldp x23,x24,[x0,#8*4]
833 cmp x27,x29 // did we hit the bottom?
834 stp x25,x26,[x2,#8*6]
835 mov x2,x0 // slide the window
836 ldp x25,x26,[x0,#8*6]
837 mov x27,#8
838 b.ne .Lsqr8x_reduction
839
840 // Final step. We see if result is larger than modulus, and
841 // if it is, subtract the modulus. But comparison implies
842 // subtraction. So we subtract modulus, see if it borrowed,
843 // and conditionally copy original value.
844 ldr x0,[x29,#96] // pull rp
845 add x2,x2,#8*8
846 subs x14,x19,x6
847 sbcs x15,x20,x7
848 sub x27,x5,#8*8
849 mov x3,x0 // x0 copy
850
851.Lsqr8x_sub:
852 sbcs x16,x21,x8
853 ldp x6,x7,[x1,#8*0]
854 sbcs x17,x22,x9
855 stp x14,x15,[x0,#8*0]
856 sbcs x14,x23,x10
857 ldp x8,x9,[x1,#8*2]
858 sbcs x15,x24,x11
859 stp x16,x17,[x0,#8*2]
860 sbcs x16,x25,x12
861 ldp x10,x11,[x1,#8*4]
862 sbcs x17,x26,x13
863 ldp x12,x13,[x1,#8*6]
864 add x1,x1,#8*8
865 ldp x19,x20,[x2,#8*0]
866 sub x27,x27,#8*8
867 ldp x21,x22,[x2,#8*2]
868 ldp x23,x24,[x2,#8*4]
869 ldp x25,x26,[x2,#8*6]
870 add x2,x2,#8*8
871 stp x14,x15,[x0,#8*4]
872 sbcs x14,x19,x6
873 stp x16,x17,[x0,#8*6]
874 add x0,x0,#8*8
875 sbcs x15,x20,x7
876 cbnz x27,.Lsqr8x_sub
877
878 sbcs x16,x21,x8
879 mov x2,sp
880 add x1,sp,x5
881 ldp x6,x7,[x3,#8*0]
882 sbcs x17,x22,x9
883 stp x14,x15,[x0,#8*0]
884 sbcs x14,x23,x10
885 ldp x8,x9,[x3,#8*2]
886 sbcs x15,x24,x11
887 stp x16,x17,[x0,#8*2]
888 sbcs x16,x25,x12
889 ldp x19,x20,[x1,#8*0]
890 sbcs x17,x26,x13
891 ldp x21,x22,[x1,#8*2]
892 sbcs xzr,x30,xzr // did it borrow?
893 ldr x30,[x29,#8] // pull return address
894 stp x14,x15,[x0,#8*4]
895 stp x16,x17,[x0,#8*6]
896
// Constant-time select between result and result-modulus, wiping the
// stack temporary as it goes (4 limbs per pass).
897 sub x27,x5,#8*4
898.Lsqr4x_cond_copy:
899 sub x27,x27,#8*4
900 csel x14,x19,x6,lo
901 stp xzr,xzr,[x2,#8*0]
902 csel x15,x20,x7,lo
903 ldp x6,x7,[x3,#8*4]
904 ldp x19,x20,[x1,#8*4]
905 csel x16,x21,x8,lo
906 stp xzr,xzr,[x2,#8*2]
907 add x2,x2,#8*4
908 csel x17,x22,x9,lo
909 ldp x8,x9,[x3,#8*6]
910 ldp x21,x22,[x1,#8*6]
911 add x1,x1,#8*4
912 stp x14,x15,[x3,#8*0]
913 stp x16,x17,[x3,#8*2]
914 add x3,x3,#8*4
915 stp xzr,xzr,[x1,#8*0]
916 stp xzr,xzr,[x1,#8*2]
917 cbnz x27,.Lsqr4x_cond_copy
918
919 csel x14,x19,x6,lo
920 stp xzr,xzr,[x2,#8*0]
921 csel x15,x20,x7,lo
922 stp xzr,xzr,[x2,#8*2]
923 csel x16,x21,x8,lo
924 csel x17,x22,x9,lo
925 stp x14,x15,[x3,#8*0]
926 stp x16,x17,[x3,#8*2]
927
928 b .Lsqr8x_done
929
930.align 4
931.Lsqr8x8_post_condition:
932 adc x28,xzr,xzr
933 ldr x30,[x29,#8] // pull return address
934 // x19-7,x28 hold result, x6-7 hold modulus
935 subs x6,x19,x6
936 ldr x1,[x29,#96] // pull rp
937 sbcs x7,x20,x7
938 stp xzr,xzr,[sp,#8*0]
939 sbcs x8,x21,x8
940 stp xzr,xzr,[sp,#8*2]
941 sbcs x9,x22,x9
942 stp xzr,xzr,[sp,#8*4]
943 sbcs x10,x23,x10
944 stp xzr,xzr,[sp,#8*6]
945 sbcs x11,x24,x11
946 stp xzr,xzr,[sp,#8*8]
947 sbcs x12,x25,x12
948 stp xzr,xzr,[sp,#8*10]
949 sbcs x13,x26,x13
950 stp xzr,xzr,[sp,#8*12]
951 sbcs x28,x28,xzr // did it borrow?
952 stp xzr,xzr,[sp,#8*14]
953
954 // x6-7 hold result-modulus
955 csel x6,x19,x6,lo
956 csel x7,x20,x7,lo
957 csel x8,x21,x8,lo
958 csel x9,x22,x9,lo
959 stp x6,x7,[x1,#8*0]
960 csel x10,x23,x10,lo
961 csel x11,x24,x11,lo
962 stp x8,x9,[x1,#8*2]
963 csel x12,x25,x12,lo
964 csel x13,x26,x13,lo
965 stp x10,x11,[x1,#8*4]
966 stp x12,x13,[x1,#8*6]
967
// Epilogue: restore callee-saved x19-x28, restore sp from the frame
// pointer, return 1.
968.Lsqr8x_done:
969 ldp x19,x20,[x29,#16]
970 mov sp,x29
971 ldp x21,x22,[x29,#32]
972 mov x0,#1
973 ldp x23,x24,[x29,#48]
974 ldp x25,x26,[x29,#64]
975 ldp x27,x28,[x29,#80]
976 ldr x29,[sp],#128
977 ret
978.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
// ---------------------------------------------------------------------------
// __bn_mul4x_mont — 4-limb-wide Montgomery multiplication.
//
// Taken from bn_mul_mont when num % 4 == 0 (and from __bn_sqr8x_mont when
// ap != bp).  Processes b[] four limbs at a time, interleaving the
// multiply-accumulate with reduction by t[0]*n0; the per-group t[0]*n0
// values are put aside on the stack for the tail pass.
//
// Frame: 128 bytes; callee-saved x19-x28 spilled; rp and &b[num] are
// offloaded at [x29,#96]; num*8 + 8*4 bytes of temporary storage are
// alloca'd below.  x30 carries the running top-most carry between outer
// iterations.  Returns 1 in x0.
//
// NOTE(review): generated output — do not hand-edit; the fused blame/line
// fragments are artifacts of the code-viewer dump this copy came from.
// ---------------------------------------------------------------------------
979.type __bn_mul4x_mont,%function
980.align 5
981__bn_mul4x_mont:
982 stp x29,x30,[sp,#-128]!
983 add x29,sp,#0
984 stp x19,x20,[sp,#16]
985 stp x21,x22,[sp,#32]
986 stp x23,x24,[sp,#48]
987 stp x25,x26,[sp,#64]
988 stp x27,x28,[sp,#80]
989
990 sub x26,sp,x5,lsl#3
991 lsl x5,x5,#3
992 ldr x4,[x4] // *n0
993 sub sp,x26,#8*4 // alloca
994
995 add x10,x2,x5
996 add x27,x1,x5
997 stp x0,x10,[x29,#96] // offload rp and &b[num]
998
999 ldr x24,[x2,#8*0] // b[0]
1000 ldp x6,x7,[x1,#8*0] // a[0..3]
1001 ldp x8,x9,[x1,#8*2]
1002 add x1,x1,#8*4
1003 mov x19,xzr
1004 mov x20,xzr
1005 mov x21,xzr
1006 mov x22,xzr
1007 ldp x14,x15,[x3,#8*0] // n[0..3]
1008 ldp x16,x17,[x3,#8*2]
1009 adds x3,x3,#8*4 // clear carry bit
1010 mov x0,xzr
1011 mov x28,#0
1012 mov x26,sp
1013
// First pass over b[0..3]: accumulate a[0..3]*b[i] and reduce by
// t[0]*n0 (x28 cycles 0,8,16,24 as the b[]/aside-t[] offset).
1014.Loop_mul4x_1st_reduction:
1015 mul x10,x6,x24 // lo(a[0..3]*b[0])
1016 adc x0,x0,xzr // modulo-scheduled
1017 mul x11,x7,x24
1018 add x28,x28,#8
1019 mul x12,x8,x24
1020 and x28,x28,#31
1021 mul x13,x9,x24
1022 adds x19,x19,x10
1023 umulh x10,x6,x24 // hi(a[0..3]*b[0])
1024 adcs x20,x20,x11
1025 mul x25,x19,x4 // t[0]*n0
1026 adcs x21,x21,x12
1027 umulh x11,x7,x24
1028 adcs x22,x22,x13
1029 umulh x12,x8,x24
1030 adc x23,xzr,xzr
1031 umulh x13,x9,x24
1032 ldr x24,[x2,x28] // next b[i] (or b[0])
1033 adds x20,x20,x10
1034 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
1035 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1036 adcs x21,x21,x11
1037 mul x11,x15,x25
1038 adcs x22,x22,x12
1039 mul x12,x16,x25
1040 adc x23,x23,x13 // can't overflow
1041 mul x13,x17,x25
1042 // (*) adds xzr,x19,x10
1043 subs xzr,x19,#1 // (*)
1044 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1045 adcs x19,x20,x11
1046 umulh x11,x15,x25
1047 adcs x20,x21,x12
1048 umulh x12,x16,x25
1049 adcs x21,x22,x13
1050 umulh x13,x17,x25
1051 adcs x22,x23,x0
1052 adc x0,xzr,xzr
1053 adds x19,x19,x10
1054 sub x10,x27,x1
1055 adcs x20,x20,x11
1056 adcs x21,x21,x12
1057 adcs x22,x22,x13
1058 //adc x0,x0,xzr
1059 cbnz x28,.Loop_mul4x_1st_reduction
1060
1061 cbz x10,.Lmul4x4_post_condition
1062
1063 ldp x6,x7,[x1,#8*0] // a[4..7]
1064 ldp x8,x9,[x1,#8*2]
1065 add x1,x1,#8*4
1066 ldr x25,[sp] // a[0]*n0
1067 ldp x14,x15,[x3,#8*0] // n[4..7]
1068 ldp x16,x17,[x3,#8*2]
1069 add x3,x3,#8*4
1070
1071.Loop_mul4x_1st_tail:
1072 mul x10,x6,x24 // lo(a[4..7]*b[i])
1073 adc x0,x0,xzr // modulo-scheduled
1074 mul x11,x7,x24
1075 add x28,x28,#8
1076 mul x12,x8,x24
1077 and x28,x28,#31
1078 mul x13,x9,x24
1079 adds x19,x19,x10
1080 umulh x10,x6,x24 // hi(a[4..7]*b[i])
1081 adcs x20,x20,x11
1082 umulh x11,x7,x24
1083 adcs x21,x21,x12
1084 umulh x12,x8,x24
1085 adcs x22,x22,x13
1086 umulh x13,x9,x24
1087 adc x23,xzr,xzr
1088 ldr x24,[x2,x28] // next b[i] (or b[0])
1089 adds x20,x20,x10
1090 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
1091 adcs x21,x21,x11
1092 mul x11,x15,x25
1093 adcs x22,x22,x12
1094 mul x12,x16,x25
1095 adc x23,x23,x13 // can't overflow
1096 mul x13,x17,x25
1097 adds x19,x19,x10
1098 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
1099 adcs x20,x20,x11
1100 umulh x11,x15,x25
1101 adcs x21,x21,x12
1102 umulh x12,x16,x25
1103 adcs x22,x22,x13
1104 adcs x23,x23,x0
1105 umulh x13,x17,x25
1106 adc x0,xzr,xzr
1107 ldr x25,[sp,x28] // next t[0]*n0
1108 str x19,[x26],#8 // result!!!
1109 adds x19,x20,x10
1110 sub x10,x27,x1 // done yet?
1111 adcs x20,x21,x11
1112 adcs x21,x22,x12
1113 adcs x22,x23,x13
1114 //adc x0,x0,xzr
1115 cbnz x28,.Loop_mul4x_1st_tail
1116
1117 sub x11,x27,x5 // rewinded x1
1118 cbz x10,.Lmul4x_proceed
1119
1120 ldp x6,x7,[x1,#8*0]
1121 ldp x8,x9,[x1,#8*2]
1122 add x1,x1,#8*4
1123 ldp x14,x15,[x3,#8*0]
1124 ldp x16,x17,[x3,#8*2]
1125 add x3,x3,#8*4
1126 b .Loop_mul4x_1st_tail
1127
1128.align 5
1129.Lmul4x_proceed:
1130 ldr x24,[x2,#8*4]! // *++b
1131 adc x30,x0,xzr
1132 ldp x6,x7,[x11,#8*0] // a[0..3]
1133 sub x3,x3,x5 // rewind np
1134 ldp x8,x9,[x11,#8*2]
1135 add x1,x11,#8*4
1136
1137 stp x19,x20,[x26,#8*0] // result!!!
1138 ldp x19,x20,[sp,#8*4] // t[0..3]
1139 stp x21,x22,[x26,#8*2] // result!!!
1140 ldp x21,x22,[sp,#8*6]
1141
1142 ldp x14,x15,[x3,#8*0] // n[0..3]
1143 mov x26,sp
1144 ldp x16,x17,[x3,#8*2]
1145 adds x3,x3,#8*4 // clear carry bit
1146 mov x0,xzr
1147
1148.align 4
1149.Loop_mul4x_reduction:
1150 mul x10,x6,x24 // lo(a[0..3]*b[4])
1151 adc x0,x0,xzr // modulo-scheduled
1152 mul x11,x7,x24
1153 add x28,x28,#8
1154 mul x12,x8,x24
1155 and x28,x28,#31
1156 mul x13,x9,x24
1157 adds x19,x19,x10
1158 umulh x10,x6,x24 // hi(a[0..3]*b[4])
1159 adcs x20,x20,x11
1160 mul x25,x19,x4 // t[0]*n0
1161 adcs x21,x21,x12
1162 umulh x11,x7,x24
1163 adcs x22,x22,x13
1164 umulh x12,x8,x24
1165 adc x23,xzr,xzr
1166 umulh x13,x9,x24
1167 ldr x24,[x2,x28] // next b[i]
1168 adds x20,x20,x10
1169 // (*) mul x10,x14,x25
1170 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1171 adcs x21,x21,x11
1172 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
1173 adcs x22,x22,x12
1174 mul x12,x16,x25
1175 adc x23,x23,x13 // can't overflow
1176 mul x13,x17,x25
1177 // (*) adds xzr,x19,x10
1178 subs xzr,x19,#1 // (*)
1179 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
1180 adcs x19,x20,x11
1181 umulh x11,x15,x25
1182 adcs x20,x21,x12
1183 umulh x12,x16,x25
1184 adcs x21,x22,x13
1185 umulh x13,x17,x25
1186 adcs x22,x23,x0
1187 adc x0,xzr,xzr
1188 adds x19,x19,x10
1189 adcs x20,x20,x11
1190 adcs x21,x21,x12
1191 adcs x22,x22,x13
1192 //adc x0,x0,xzr
1193 cbnz x28,.Loop_mul4x_reduction
1194
1195 adc x0,x0,xzr
1196 ldp x10,x11,[x26,#8*4] // t[4..7]
1197 ldp x12,x13,[x26,#8*6]
1198 ldp x6,x7,[x1,#8*0] // a[4..7]
1199 ldp x8,x9,[x1,#8*2]
1200 add x1,x1,#8*4
1201 adds x19,x19,x10
1202 adcs x20,x20,x11
1203 adcs x21,x21,x12
1204 adcs x22,x22,x13
1205 //adc x0,x0,xzr
1206
1207 ldr x25,[sp] // t[0]*n0
1208 ldp x14,x15,[x3,#8*0] // n[4..7]
1209 ldp x16,x17,[x3,#8*2]
1210 add x3,x3,#8*4
1211
1212.align 4
1213.Loop_mul4x_tail:
1214 mul x10,x6,x24 // lo(a[4..7]*b[4])
1215 adc x0,x0,xzr // modulo-scheduled
1216 mul x11,x7,x24
1217 add x28,x28,#8
1218 mul x12,x8,x24
1219 and x28,x28,#31
1220 mul x13,x9,x24
1221 adds x19,x19,x10
1222 umulh x10,x6,x24 // hi(a[4..7]*b[4])
1223 adcs x20,x20,x11
1224 umulh x11,x7,x24
1225 adcs x21,x21,x12
1226 umulh x12,x8,x24
1227 adcs x22,x22,x13
1228 umulh x13,x9,x24
1229 adc x23,xzr,xzr
1230 ldr x24,[x2,x28] // next b[i]
1231 adds x20,x20,x10
1232 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
1233 adcs x21,x21,x11
1234 mul x11,x15,x25
1235 adcs x22,x22,x12
1236 mul x12,x16,x25
1237 adc x23,x23,x13 // can't overflow
1238 mul x13,x17,x25
1239 adds x19,x19,x10
1240 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
1241 adcs x20,x20,x11
1242 umulh x11,x15,x25
1243 adcs x21,x21,x12
1244 umulh x12,x16,x25
1245 adcs x22,x22,x13
1246 umulh x13,x17,x25
1247 adcs x23,x23,x0
1248 ldr x25,[sp,x28] // next a[0]*n0
1249 adc x0,xzr,xzr
1250 str x19,[x26],#8 // result!!!
1251 adds x19,x20,x10
1252 sub x10,x27,x1 // done yet?
1253 adcs x20,x21,x11
1254 adcs x21,x22,x12
1255 adcs x22,x23,x13
1256 //adc x0,x0,xzr
1257 cbnz x28,.Loop_mul4x_tail
1258
1259 sub x11,x3,x5 // rewinded np?
1260 adc x0,x0,xzr
1261 cbz x10,.Loop_mul4x_break
1262
1263 ldp x10,x11,[x26,#8*4]
1264 ldp x12,x13,[x26,#8*6]
1265 ldp x6,x7,[x1,#8*0]
1266 ldp x8,x9,[x1,#8*2]
1267 add x1,x1,#8*4
1268 adds x19,x19,x10
1269 adcs x20,x20,x11
1270 adcs x21,x21,x12
1271 adcs x22,x22,x13
1272 //adc x0,x0,xzr
1273 ldp x14,x15,[x3,#8*0]
1274 ldp x16,x17,[x3,#8*2]
1275 add x3,x3,#8*4
1276 b .Loop_mul4x_tail
1277
1278.align 4
1279.Loop_mul4x_break:
1280 ldp x12,x13,[x29,#96] // pull rp and &b[num]
1281 adds x19,x19,x30
1282 add x2,x2,#8*4 // bp++
1283 adcs x20,x20,xzr
1284 sub x1,x1,x5 // rewind ap
1285 adcs x21,x21,xzr
1286 stp x19,x20,[x26,#8*0] // result!!!
1287 adcs x22,x22,xzr
1288 ldp x19,x20,[sp,#8*4] // t[0..3]
1289 adc x30,x0,xzr
1290 stp x21,x22,[x26,#8*2] // result!!!
1291 cmp x2,x13 // done yet?
1292 ldp x21,x22,[sp,#8*6]
1293 ldp x14,x15,[x11,#8*0] // n[0..3]
1294 ldp x16,x17,[x11,#8*2]
1295 add x3,x11,#8*4
1296 b.eq .Lmul4x_post
1297
1298 ldr x24,[x2]
1299 ldp x6,x7,[x1,#8*0] // a[0..3]
1300 ldp x8,x9,[x1,#8*2]
1301 adds x1,x1,#8*4 // clear carry bit
1302 mov x0,xzr
1303 mov x26,sp
1304 b .Loop_mul4x_reduction
1305
1306.align 4
1307.Lmul4x_post:
1308 // Final step. We see if result is larger than modulus, and
1309 // if it is, subtract the modulus. But comparison implies
1310 // subtraction. So we subtract modulus, see if it borrowed,
1311 // and conditionally copy original value.
1312 mov x0,x12
1313 mov x27,x12 // x0 copy
1314 subs x10,x19,x14
1315 add x26,sp,#8*8
1316 sbcs x11,x20,x15
1317 sub x28,x5,#8*4
1318
1319.Lmul4x_sub:
1320 sbcs x12,x21,x16
1321 ldp x14,x15,[x3,#8*0]
1322 sub x28,x28,#8*4
1323 ldp x19,x20,[x26,#8*0]
1324 sbcs x13,x22,x17
1325 ldp x16,x17,[x3,#8*2]
1326 add x3,x3,#8*4
1327 ldp x21,x22,[x26,#8*2]
1328 add x26,x26,#8*4
1329 stp x10,x11,[x0,#8*0]
1330 sbcs x10,x19,x14
1331 stp x12,x13,[x0,#8*2]
1332 add x0,x0,#8*4
1333 sbcs x11,x20,x15
1334 cbnz x28,.Lmul4x_sub
1335
1336 sbcs x12,x21,x16
1337 mov x26,sp
1338 add x1,sp,#8*4
1339 ldp x6,x7,[x27,#8*0]
1340 sbcs x13,x22,x17
1341 stp x10,x11,[x0,#8*0]
1342 ldp x8,x9,[x27,#8*2]
1343 stp x12,x13,[x0,#8*2]
1344 ldp x19,x20,[x1,#8*0]
1345 ldp x21,x22,[x1,#8*2]
1346 sbcs xzr,x30,xzr // did it borrow?
1347 ldr x30,[x29,#8] // pull return address
1348
// Constant-time select between result and result-modulus based on the
// borrow flag, wiping the stack temporary as it goes.
1349 sub x28,x5,#8*4
1350.Lmul4x_cond_copy:
1351 sub x28,x28,#8*4
1352 csel x10,x19,x6,lo
1353 stp xzr,xzr,[x26,#8*0]
1354 csel x11,x20,x7,lo
1355 ldp x6,x7,[x27,#8*4]
1356 ldp x19,x20,[x1,#8*4]
1357 csel x12,x21,x8,lo
1358 stp xzr,xzr,[x26,#8*2]
1359 add x26,x26,#8*4
1360 csel x13,x22,x9,lo
1361 ldp x8,x9,[x27,#8*6]
1362 ldp x21,x22,[x1,#8*6]
1363 add x1,x1,#8*4
1364 stp x10,x11,[x27,#8*0]
1365 stp x12,x13,[x27,#8*2]
1366 add x27,x27,#8*4
1367 cbnz x28,.Lmul4x_cond_copy
1368
1369 csel x10,x19,x6,lo
1370 stp xzr,xzr,[x26,#8*0]
1371 csel x11,x20,x7,lo
1372 stp xzr,xzr,[x26,#8*2]
1373 csel x12,x21,x8,lo
1374 stp xzr,xzr,[x26,#8*3]
1375 csel x13,x22,x9,lo
1376 stp xzr,xzr,[x26,#8*4]
1377 stp x10,x11,[x27,#8*0]
1378 stp x12,x13,[x27,#8*2]
1379
1380 b .Lmul4x_done
1381
1382.align 4
1383.Lmul4x4_post_condition:
1384 adc x0,x0,xzr
1385 ldr x1,[x29,#96] // pull rp
1386 // x19-3,x0 hold result, x14-7 hold modulus
1387 subs x6,x19,x14
1388 ldr x30,[x29,#8] // pull return address
1389 sbcs x7,x20,x15
1390 stp xzr,xzr,[sp,#8*0]
1391 sbcs x8,x21,x16
1392 stp xzr,xzr,[sp,#8*2]
1393 sbcs x9,x22,x17
1394 stp xzr,xzr,[sp,#8*4]
1395 sbcs xzr,x0,xzr // did it borrow?
1396 stp xzr,xzr,[sp,#8*6]
1397
1398 // x6-3 hold result-modulus
1399 csel x6,x19,x6,lo
1400 csel x7,x20,x7,lo
1401 csel x8,x21,x8,lo
1402 csel x9,x22,x9,lo
1403 stp x6,x7,[x1,#8*0]
1404 stp x8,x9,[x1,#8*2]
1405
// Epilogue: restore callee-saved x19-x28, restore sp from the frame
// pointer, return 1.
1406.Lmul4x_done:
1407 ldp x19,x20,[x29,#16]
1408 mov sp,x29
1409 ldp x21,x22,[x29,#32]
1410 mov x0,#1
1411 ldp x23,x24,[x29,#48]
1412 ldp x25,x26,[x29,#64]
1413 ldp x27,x28,[x29,#80]
1414 ldr x29,[sp],#128
1415 ret
1416.size __bn_mul4x_mont,.-__bn_mul4x_mont
1417.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1418.align 2
1419.align 4
David Benjamin4969cc92016-04-22 15:02:23 -04001420#endif
Robert Sloan726e9d12018-09-11 11:45:04 -07001421#endif // !OPENSSL_NO_ASM