// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
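// Memory sanitizer cannot instrument assembly, so it would flag reads of
// values produced by this code as uninitialized; building with MSan
// therefore falls back to the C implementations instead.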
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl bn_mul_mont
.hidden bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
	tst x5,#7
	b.eq __bn_sqr8x_mont
	tst x5,#3
	b.eq __bn_mul4x_mont
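	// What follows is an illustrative C sketch of the word-by-word
	// Montgomery multiplication implemented below (an editorial aid,
	// not produced by the generating script; num, ap, bp, np and n0
	// correspond to the x5, x1, x2, x3 and x4 arguments):
	//
	//	for (i = 0; i < num; i++) {
	//		uint64_t m = (tp[0] + ap[0]*bp[i]) * n0; // mod 2^64
	//		// tp += ap*bp[i] + np*m, word by word; adding np*m
	//		// forces the low word to zero, so tp is then shifted
	//		// down one word without losing information.
	//	}
	//	// tp == a*b/2^(64*num) mod n, up to one final conditional
	//	// subtraction of n (performed at .Lsub/.Lcond_copy below).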
.Lmul_mont:
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	ldr x9,[x2],#8 // bp[0]
	sub x22,sp,x5,lsl#3
	ldp x7,x8,[x1],#16 // ap[0..1]
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	and x22,x22,#-16 // ABI says so
	ldp x13,x14,[x3],#16 // np[0..1]

	mul x6,x7,x9 // ap[0]*bp[0]
	sub x21,x5,#16 // j=num-2
	umulh x7,x7,x9
	mul x10,x8,x9 // ap[1]*bp[0]
	umulh x11,x8,x9

	mul x15,x6,x4 // "tp[0]"*n0
	mov sp,x22 // alloca

	// (*) mul x12,x13,x15 // np[0]*m1
	umulh x13,x13,x15
	mul x16,x14,x15 // np[1]*m1
	// (*) adds x12,x12,x6 // discarded
	// (*) On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves only one computationally
	//	significant outcome: whether or not it carries. So when
	//	does it carry? Following the operations, the condition
	//	for a carry turns out to be simple: x6 being non-zero.
	//	The carry can therefore be computed by adding -1 to x6,
	//	which is what the next instruction does.
	subs xzr,x6,#1 // (*)
	umulh x17,x14,x15
	adc x13,x13,xzr
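	// An equivalent C sketch of the (*) trick above (editorial
	// illustration only): by choice of m1, lo(np[0]*m1) == -x6
	// mod 2^64, so the discarded sum x6 + lo(np[0]*m1) is zero and
	// only its carry matters, and it carries exactly when x6 != 0:
	//
	//	uint64_t hi = mulhi(np[0], m1);	// mulhi: hypothetical helper
	//	hi += (x6 != 0);	// carry of the omitted addition
	//
	// "subs xzr,x6,#1" sets the carry flag precisely when x6 != 0,
	// which the adc above then folds into x13.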
	cbz x21,.L1st_skip

.L1st:
	ldr x8,[x1],#8
	adds x6,x10,x7
	sub x21,x21,#8 // j--
	adc x7,x11,xzr

	ldr x14,[x3],#8
	adds x12,x16,x13
	mul x10,x8,x9 // ap[j]*bp[0]
	adc x13,x17,xzr
	umulh x11,x8,x9

	adds x12,x12,x6
	mul x16,x14,x15 // np[j]*m1
	adc x13,x13,xzr
	umulh x17,x14,x15
	str x12,[x22],#8 // tp[j-1]
	cbnz x21,.L1st

.L1st_skip:
	adds x6,x10,x7
	sub x1,x1,x5 // rewind x1
	adc x7,x11,xzr

	adds x12,x16,x13
	sub x3,x3,x5 // rewind x3
	adc x13,x17,xzr

	adds x12,x12,x6
	sub x20,x5,#8 // i=num-1
	adcs x13,x13,x7

	adc x19,xzr,xzr // topmost overflow bit
	stp x12,x13,[x22]

.Louter:
	ldr x9,[x2],#8 // bp[i]
	ldp x7,x8,[x1],#16
	ldr x23,[sp] // tp[0]
	add x22,sp,#8

	mul x6,x7,x9 // ap[0]*bp[i]
	sub x21,x5,#16 // j=num-2
	umulh x7,x7,x9
	ldp x13,x14,[x3],#16
	mul x10,x8,x9 // ap[1]*bp[i]
	adds x6,x6,x23
	umulh x11,x8,x9
	adc x7,x7,xzr

	mul x15,x6,x4
	sub x20,x20,#8 // i--

	// (*) mul x12,x13,x15 // np[0]*m1
	umulh x13,x13,x15
	mul x16,x14,x15 // np[1]*m1
	// (*) adds x12,x12,x6
	subs xzr,x6,#1 // (*)
	umulh x17,x14,x15
	cbz x21,.Linner_skip

.Linner:
	ldr x8,[x1],#8
	adc x13,x13,xzr
	ldr x23,[x22],#8 // tp[j]
	adds x6,x10,x7
	sub x21,x21,#8 // j--
	adc x7,x11,xzr

	adds x12,x16,x13
	ldr x14,[x3],#8
	adc x13,x17,xzr

	mul x10,x8,x9 // ap[j]*bp[i]
	adds x6,x6,x23
	umulh x11,x8,x9
	adc x7,x7,xzr

	mul x16,x14,x15 // np[j]*m1
	adds x12,x12,x6
	umulh x17,x14,x15
	str x12,[x22,#-16] // tp[j-1]
	cbnz x21,.Linner

.Linner_skip:
	ldr x23,[x22],#8 // tp[j]
	adc x13,x13,xzr
	adds x6,x10,x7
	sub x1,x1,x5 // rewind x1
	adc x7,x11,xzr

	adds x12,x16,x13
	sub x3,x3,x5 // rewind x3
	adcs x13,x17,x19
	adc x19,xzr,xzr

	adds x6,x6,x23
	adc x7,x7,xzr

	adds x12,x12,x6
	adcs x13,x13,x7
	adc x19,x19,xzr // topmost overflow bit
	stp x12,x13,[x22,#-16]

	cbnz x20,.Louter

	// Final step. If the result is not smaller than the modulus,
	// subtract the modulus. Since a comparison is itself a
	// subtraction, we subtract the modulus unconditionally, check
	// whether it borrowed, and conditionally copy the original
	// value back.
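	// A C sketch of this final step (editorial illustration only;
	// subb() stands for a hypothetical subtract-with-borrow helper):
	//
	//	uint64_t borrow = 0;
	//	for (j = 0; j < num; j++)
	//		rp[j] = subb(tp[j], np[j], &borrow);	// rp = tp - np
	//	subb(x19, 0, &borrow);	// fold the overflow bit into borrow
	//	for (j = 0; j < num; j++) {
	//		rp[j] = borrow ? tp[j] : rp[j];	// csel, branch-free
	//		tp[j] = 0;	// wipe the scratch area as it is read
	//	}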
	ldr x23,[sp] // tp[0]
	add x22,sp,#8
	ldr x14,[x3],#8 // np[0]
	subs x21,x5,#8 // j=num-1 and clear borrow
	mov x1,x0
.Lsub:
	sbcs x8,x23,x14 // tp[j]-np[j]
	ldr x23,[x22],#8
	sub x21,x21,#8 // j--
	ldr x14,[x3],#8
	str x8,[x1],#8 // rp[j]=tp[j]-np[j]
	cbnz x21,.Lsub

	sbcs x8,x23,x14
	sbcs x19,x19,xzr // did it borrow?
	str x8,[x1],#8 // rp[num-1]

	ldr x23,[sp] // tp[0]
	add x22,sp,#8
	ldr x8,[x0],#8 // rp[0]
	sub x5,x5,#8 // num--
	nop
.Lcond_copy:
	sub x5,x5,#8 // num--
	csel x14,x23,x8,lo // did it borrow?
	ldr x23,[x22],#8
	ldr x8,[x0],#8
	str xzr,[x22,#-16] // wipe tp
	str x14,[x0,#-16]
	cbnz x5,.Lcond_copy

	csel x14,x23,x8,lo
	str xzr,[x22,#-8] // wipe tp
	str x14,[x0,#-8]

	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldr x29,[sp],#64
	ret
.size bn_mul_mont,.-bn_mul_mont
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
	cmp x1,x2
	b.ne __bn_mul4x_mont
.Lsqr8x_mont:
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	stp x0,x3,[sp,#96] // offload rp and np

	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	ldp x10,x11,[x1,#8*4]
	ldp x12,x13,[x1,#8*6]

	sub x2,sp,x5,lsl#4
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	mov sp,x2 // alloca
	sub x27,x5,#8*8
	b .Lsqr8x_zero_start

.Lsqr8x_zero:
	sub x27,x27,#8*8
	stp xzr,xzr,[x2,#8*0]
	stp xzr,xzr,[x2,#8*2]
	stp xzr,xzr,[x2,#8*4]
	stp xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp xzr,xzr,[x2,#8*8]
	stp xzr,xzr,[x2,#8*10]
	stp xzr,xzr,[x2,#8*12]
	stp xzr,xzr,[x2,#8*14]
	add x2,x2,#8*16
	cbnz x27,.Lsqr8x_zero

	add x3,x1,x5
	add x1,x1,#8*8
	mov x19,xzr
	mov x20,xzr
	mov x21,xzr
	mov x22,xzr
	mov x23,xzr
	mov x24,xzr
	mov x25,xzr
	mov x26,xzr
	mov x2,sp
	str x4,[x29,#112] // offload n0

	// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
	// a[1]a[0] (i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1] (ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2] (iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3] (iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4] (v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5] (vi)
	// a[7]a[5]
	// a[7]a[6] (vii)

	mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
	mul x15,x8,x6
	mul x16,x9,x6
	mul x17,x10,x6
	adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
	mul x14,x11,x6
	adcs x21,x21,x15
	mul x15,x12,x6
	adcs x22,x22,x16
	mul x16,x13,x6
	adcs x23,x23,x17
	umulh x17,x7,x6 // hi(a[1..7]*a[0])
	adcs x24,x24,x14
	umulh x14,x8,x6
	adcs x25,x25,x15
	umulh x15,x9,x6
	adcs x26,x26,x16
	umulh x16,x10,x6
	stp x19,x20,[x2],#8*2 // t[0..1]
	adc x19,xzr,xzr // t[8]
	adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
	umulh x17,x11,x6
	adcs x22,x22,x14
	umulh x14,x12,x6
	adcs x23,x23,x15
	umulh x15,x13,x6
	adcs x24,x24,x16
	mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
	adcs x25,x25,x17
	mul x17,x9,x7
	adcs x26,x26,x14
	mul x14,x10,x7
	adc x19,x19,x15

	mul x15,x11,x7
	adds x22,x22,x16
	mul x16,x12,x7
	adcs x23,x23,x17
	mul x17,x13,x7
	adcs x24,x24,x14
	umulh x14,x8,x7 // hi(a[2..7]*a[1])
	adcs x25,x25,x15
	umulh x15,x9,x7
	adcs x26,x26,x16
	umulh x16,x10,x7
	adcs x19,x19,x17
	umulh x17,x11,x7
	stp x21,x22,[x2],#8*2 // t[2..3]
	adc x20,xzr,xzr // t[9]
	adds x23,x23,x14
	umulh x14,x12,x7
	adcs x24,x24,x15
	umulh x15,x13,x7
	adcs x25,x25,x16
	mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
	adcs x26,x26,x17
	mul x17,x10,x8
	adcs x19,x19,x14
	mul x14,x11,x8
	adc x20,x20,x15

	mul x15,x12,x8
	adds x24,x24,x16
	mul x16,x13,x8
	adcs x25,x25,x17
	umulh x17,x9,x8 // hi(a[3..7]*a[2])
	adcs x26,x26,x14
	umulh x14,x10,x8
	adcs x19,x19,x15
	umulh x15,x11,x8
	adcs x20,x20,x16
	umulh x16,x12,x8
	stp x23,x24,[x2],#8*2 // t[4..5]
	adc x21,xzr,xzr // t[10]
	adds x25,x25,x17
	umulh x17,x13,x8
	adcs x26,x26,x14
	mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
	adcs x19,x19,x15
	mul x15,x11,x9
	adcs x20,x20,x16
	mul x16,x12,x9
	adc x21,x21,x17

	mul x17,x13,x9
	adds x26,x26,x14
	umulh x14,x10,x9 // hi(a[4..7]*a[3])
	adcs x19,x19,x15
	umulh x15,x11,x9
	adcs x20,x20,x16
	umulh x16,x12,x9
	adcs x21,x21,x17
	umulh x17,x13,x9
	stp x25,x26,[x2],#8*2 // t[6..7]
	adc x22,xzr,xzr // t[11]
	adds x19,x19,x14
	mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
	adcs x20,x20,x15
	mul x15,x12,x10
	adcs x21,x21,x16
	mul x16,x13,x10
	adc x22,x22,x17

	umulh x17,x11,x10 // hi(a[5..7]*a[4])
	adds x20,x20,x14
	umulh x14,x12,x10
	adcs x21,x21,x15
	umulh x15,x13,x10
	adcs x22,x22,x16
	mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
	adc x23,xzr,xzr // t[12]
	adds x21,x21,x17
	mul x17,x13,x11
	adcs x22,x22,x14
	umulh x14,x12,x11 // hi(a[6..7]*a[5])
	adc x23,x23,x15

	umulh x15,x13,x11
	adds x22,x22,x16
	mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
	adcs x23,x23,x17
	umulh x17,x13,x12 // hi(a[7]*a[6])
	adc x24,xzr,xzr // t[13]
	adds x23,x23,x14
	sub x27,x3,x1 // done yet?
	adc x24,x24,x15

	adds x24,x24,x16
	sub x14,x3,x5 // rewound ap
	adc x25,xzr,xzr // t[14]
	add x25,x25,x17

	cbz x27,.Lsqr8x_outer_break

	mov x4,x6
	ldp x6,x7,[x2,#8*0]
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	adds x19,x19,x6
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x0,x1
	adcs x26,xzr,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved below
	mov x27,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul x14,x6,x4
	adc x28,xzr,xzr // carry bit, modulo-scheduled
	mul x15,x7,x4
	add x27,x27,#8
	mul x16,x8,x4
	mul x17,x9,x4
	adds x19,x19,x14
	mul x14,x10,x4
	adcs x20,x20,x15
	mul x15,x11,x4
	adcs x21,x21,x16
	mul x16,x12,x4
	adcs x22,x22,x17
	mul x17,x13,x4
	adcs x23,x23,x14
	umulh x14,x6,x4
	adcs x24,x24,x15
	umulh x15,x7,x4
	adcs x25,x25,x16
	umulh x16,x8,x4
	adcs x26,x26,x17
	umulh x17,x9,x4
	adc x28,x28,xzr
	str x19,[x2],#8
	adds x19,x20,x14
	umulh x14,x10,x4
	adcs x20,x21,x15
	umulh x15,x11,x4
	adcs x21,x22,x16
	umulh x16,x12,x4
	adcs x22,x23,x17
	umulh x17,x13,x4
	ldr x4,[x0,x27]
	adcs x23,x24,x14
	adcs x24,x25,x15
	adcs x25,x26,x16
	adcs x26,x28,x17
	//adc x28,xzr,xzr // moved above
	cbnz x27,.Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp x1,x3 // done yet?
	b.eq .Lsqr8x_break

	ldp x6,x7,[x2,#8*0]
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	adds x19,x19,x6
	ldr x4,[x0,#-8*8]
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x27,#-8*8
	adcs x26,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved above
	b .Lsqr8x_mul

.align 4
.Lsqr8x_break:
	ldp x6,x7,[x0,#8*0]
	add x1,x0,#8*8
	ldp x8,x9,[x0,#8*2]
	sub x14,x3,x1 // is it last iteration?
	ldp x10,x11,[x0,#8*4]
	sub x15,x2,x14
	ldp x12,x13,[x0,#8*6]
	cbz x14,.Lsqr8x_outer_loop

	stp x19,x20,[x2,#8*0]
	ldp x19,x20,[x15,#8*0]
	stp x21,x22,[x2,#8*2]
	ldp x21,x22,[x15,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[x15,#8*4]
	stp x25,x26,[x2,#8*6]
	mov x2,x15
	ldp x25,x26,[x15,#8*6]
	b .Lsqr8x_outer_loop

.align 4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
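	// Editorial note on the identity used here: with
	// a = sum_{i} a[i]*2^(64*i),
	//
	//	a^2 = sum_{i} a[i]^2 * 2^(128*i)
	//	    + 2 * sum_{i>j} a[i]*a[j] * 2^(64*(i+j))
	//
	// The cross products accumulated so far are doubled by the
	// extr-based 1-bit left shift below, and the diagonal squares
	// a[i]^2 are added into the shifted stream.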
	ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
	ldp x15,x16,[sp,#8*1]
	ldp x11,x13,[x14,#8*2]
	add x1,x14,#8*4
	ldp x17,x14,[sp,#8*3]

	stp x19,x20,[x2,#8*0]
	mul x19,x7,x7
	stp x21,x22,[x2,#8*2]
	umulh x7,x7,x7
	stp x23,x24,[x2,#8*4]
	mul x8,x9,x9
	stp x25,x26,[x2,#8*6]
	mov x2,sp
	umulh x9,x9,x9
	adds x20,x7,x15,lsl#1
	extr x15,x16,x15,#63
	sub x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs x21,x8,x15
	extr x16,x17,x16,#63
	sub x27,x27,#8*4
	adcs x22,x9,x16
	ldp x15,x16,[x2,#8*5]
	mul x10,x11,x11
	ldp x7,x9,[x1],#8*2
	umulh x11,x11,x11
	mul x12,x13,x13
	umulh x13,x13,x13
	extr x17,x14,x17,#63
	stp x19,x20,[x2,#8*0]
	adcs x23,x10,x17
	extr x14,x15,x14,#63
	stp x21,x22,[x2,#8*2]
	adcs x24,x11,x14
	ldp x17,x14,[x2,#8*7]
	extr x15,x16,x15,#63
	adcs x25,x12,x15
	extr x16,x17,x16,#63
	adcs x26,x13,x16
	ldp x15,x16,[x2,#8*9]
	mul x6,x7,x7
	ldp x11,x13,[x1],#8*2
	umulh x7,x7,x7
	mul x8,x9,x9
	umulh x9,x9,x9
	stp x23,x24,[x2,#8*4]
	extr x17,x14,x17,#63
	stp x25,x26,[x2,#8*6]
	add x2,x2,#8*8
	adcs x19,x6,x17
	extr x14,x15,x14,#63
	adcs x20,x7,x14
	ldp x17,x14,[x2,#8*3]
	extr x15,x16,x15,#63
	cbnz x27,.Lsqr4x_shift_n_add
	ldp x1,x4,[x29,#104] // pull np and n0

	adcs x21,x8,x15
	extr x16,x17,x16,#63
	adcs x22,x9,x16
	ldp x15,x16,[x2,#8*5]
	mul x10,x11,x11
	umulh x11,x11,x11
	stp x19,x20,[x2,#8*0]
	mul x12,x13,x13
	umulh x13,x13,x13
	stp x21,x22,[x2,#8*2]
	extr x17,x14,x17,#63
	adcs x23,x10,x17
	extr x14,x15,x14,#63
	ldp x19,x20,[sp,#8*0]
	adcs x24,x11,x14
	extr x15,x16,x15,#63
	ldp x6,x7,[x1,#8*0]
	adcs x25,x12,x15
	extr x16,xzr,x16,#63
	ldp x8,x9,[x1,#8*2]
	adc x26,x13,x16
	ldp x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul x28,x4,x19 // t[0]*n0
	ldp x12,x13,[x1,#8*6]
	add x3,x1,x5
	ldp x21,x22,[sp,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[sp,#8*4]
	stp x25,x26,[x2,#8*6]
	ldp x25,x26,[sp,#8*6]
	add x1,x1,#8*8
	mov x30,xzr // initial top-most carry
	mov x2,sp
	mov x27,#8

.Lsqr8x_reduction:
	// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
	mul x15,x7,x28
	sub x27,x27,#1
	mul x16,x8,x28
	str x28,[x2],#8 // put aside t[0]*n0 for tail processing
	mul x17,x9,x28
	// (*) adds xzr,x19,x14
	subs xzr,x19,#1 // (*)
	mul x14,x10,x28
	adcs x19,x20,x15
	mul x15,x11,x28
	adcs x20,x21,x16
	mul x16,x12,x28
	adcs x21,x22,x17
	mul x17,x13,x28
	adcs x22,x23,x14
	umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
	adcs x23,x24,x15
	umulh x15,x7,x28
	adcs x24,x25,x16
	umulh x16,x8,x28
	adcs x25,x26,x17
	umulh x17,x9,x28
	adc x26,xzr,xzr
	adds x19,x19,x14
	umulh x14,x10,x28
	adcs x20,x20,x15
	umulh x15,x11,x28
	adcs x21,x21,x16
	umulh x16,x12,x28
	adcs x22,x22,x17
	umulh x17,x13,x28
	mul x28,x4,x19 // next t[0]*n0
	adcs x23,x23,x14
	adcs x24,x24,x15
	adcs x25,x25,x16
	adc x26,x26,x17
	cbnz x27,.Lsqr8x_reduction

	ldp x14,x15,[x2,#8*0]
	ldp x16,x17,[x2,#8*2]
	mov x0,x2
	sub x27,x3,x1 // done yet?
	adds x19,x19,x14
	adcs x20,x20,x15
	ldp x14,x15,[x2,#8*4]
	adcs x21,x21,x16
	adcs x22,x22,x17
	ldp x16,x17,[x2,#8*6]
	adcs x23,x23,x14
	adcs x24,x24,x15
	adcs x25,x25,x16
	adcs x26,x26,x17
	//adc x28,xzr,xzr // moved below
	cbz x27,.Lsqr8x8_post_condition

	ldr x4,[x2,#-8*8]
	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	ldp x10,x11,[x1,#8*4]
	mov x27,#-8*8
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8

.Lsqr8x_tail:
	mul x14,x6,x4
	adc x28,xzr,xzr // carry bit, modulo-scheduled
	mul x15,x7,x4
	add x27,x27,#8
	mul x16,x8,x4
	mul x17,x9,x4
	adds x19,x19,x14
	mul x14,x10,x4
	adcs x20,x20,x15
	mul x15,x11,x4
	adcs x21,x21,x16
	mul x16,x12,x4
	adcs x22,x22,x17
	mul x17,x13,x4
	adcs x23,x23,x14
	umulh x14,x6,x4
	adcs x24,x24,x15
	umulh x15,x7,x4
	adcs x25,x25,x16
	umulh x16,x8,x4
	adcs x26,x26,x17
	umulh x17,x9,x4
	adc x28,x28,xzr
	str x19,[x2],#8
	adds x19,x20,x14
	umulh x14,x10,x4
	adcs x20,x21,x15
	umulh x15,x11,x4
	adcs x21,x22,x16
	umulh x16,x12,x4
	adcs x22,x23,x17
	umulh x17,x13,x4
	ldr x4,[x0,x27]
	adcs x23,x24,x14
	adcs x24,x25,x15
	adcs x25,x26,x16
	adcs x26,x28,x17
	//adc x28,xzr,xzr // moved above
	cbnz x27,.Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp x6,x7,[x2,#8*0]
	sub x27,x3,x1 // done yet?
	sub x16,x3,x5 // rewound np
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	cbz x27,.Lsqr8x_tail_break

	ldr x4,[x0,#-8*8]
	adds x19,x19,x6
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x27,#-8*8
	adcs x26,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved above
	b .Lsqr8x_tail

.align 4
.Lsqr8x_tail_break:
	ldr x4,[x29,#112] // pull n0
	add x27,x2,#8*8 // end of current t[num] window

	subs xzr,x30,#1 // "move" top-most carry to carry bit
	adcs x14,x19,x6
	adcs x15,x20,x7
	ldp x19,x20,[x0,#8*0]
	adcs x21,x21,x8
	ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
	adcs x22,x22,x9
	ldp x8,x9,[x16,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x16,#8*4]
	adcs x25,x25,x12
	adcs x26,x26,x13
	ldp x12,x13,[x16,#8*6]
	add x1,x16,#8*8
	adc x30,xzr,xzr // top-most carry
	mul x28,x4,x19
	stp x14,x15,[x2,#8*0]
	stp x21,x22,[x2,#8*2]
	ldp x21,x22,[x0,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[x0,#8*4]
	cmp x27,x29 // did we hit the bottom?
	stp x25,x26,[x2,#8*6]
	mov x2,x0 // slide the window
	ldp x25,x26,[x0,#8*6]
	mov x27,#8
	b.ne .Lsqr8x_reduction

	// Final step. If the result is not smaller than the modulus,
	// subtract the modulus. Since a comparison is itself a
	// subtraction, we subtract the modulus unconditionally, check
	// whether it borrowed, and conditionally copy the original
	// value back.
	ldr x0,[x29,#96] // pull rp
	add x2,x2,#8*8
	subs x14,x19,x6
	sbcs x15,x20,x7
	sub x27,x5,#8*8
	mov x3,x0 // x0 copy

.Lsqr8x_sub:
	sbcs x16,x21,x8
	ldp x6,x7,[x1,#8*0]
	sbcs x17,x22,x9
	stp x14,x15,[x0,#8*0]
	sbcs x14,x23,x10
	ldp x8,x9,[x1,#8*2]
	sbcs x15,x24,x11
	stp x16,x17,[x0,#8*2]
	sbcs x16,x25,x12
	ldp x10,x11,[x1,#8*4]
	sbcs x17,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	ldp x19,x20,[x2,#8*0]
	sub x27,x27,#8*8
	ldp x21,x22,[x2,#8*2]
	ldp x23,x24,[x2,#8*4]
	ldp x25,x26,[x2,#8*6]
	add x2,x2,#8*8
	stp x14,x15,[x0,#8*4]
	sbcs x14,x19,x6
	stp x16,x17,[x0,#8*6]
	add x0,x0,#8*8
	sbcs x15,x20,x7
	cbnz x27,.Lsqr8x_sub

	sbcs x16,x21,x8
	mov x2,sp
	add x1,sp,x5
	ldp x6,x7,[x3,#8*0]
	sbcs x17,x22,x9
	stp x14,x15,[x0,#8*0]
	sbcs x14,x23,x10
	ldp x8,x9,[x3,#8*2]
	sbcs x15,x24,x11
	stp x16,x17,[x0,#8*2]
	sbcs x16,x25,x12
	ldp x19,x20,[x1,#8*0]
	sbcs x17,x26,x13
	ldp x21,x22,[x1,#8*2]
	sbcs xzr,x30,xzr // did it borrow?
	ldr x30,[x29,#8] // pull return address
	stp x14,x15,[x0,#8*4]
	stp x16,x17,[x0,#8*6]

	sub x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub x27,x27,#8*4
	csel x14,x19,x6,lo
	stp xzr,xzr,[x2,#8*0]
	csel x15,x20,x7,lo
	ldp x6,x7,[x3,#8*4]
	ldp x19,x20,[x1,#8*4]
	csel x16,x21,x8,lo
	stp xzr,xzr,[x2,#8*2]
	add x2,x2,#8*4
	csel x17,x22,x9,lo
	ldp x8,x9,[x3,#8*6]
	ldp x21,x22,[x1,#8*6]
	add x1,x1,#8*4
	stp x14,x15,[x3,#8*0]
	stp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	stp xzr,xzr,[x1,#8*0]
	stp xzr,xzr,[x1,#8*2]
	cbnz x27,.Lsqr4x_cond_copy

	csel x14,x19,x6,lo
	stp xzr,xzr,[x2,#8*0]
	csel x15,x20,x7,lo
	stp xzr,xzr,[x2,#8*2]
	csel x16,x21,x8,lo
	csel x17,x22,x9,lo
	stp x14,x15,[x3,#8*0]
	stp x16,x17,[x3,#8*2]

	b .Lsqr8x_done

.align 4
.Lsqr8x8_post_condition:
	adc x28,xzr,xzr
	ldr x30,[x29,#8] // pull return address
	// x19-x26,x28 hold the result, x6-x13 hold the modulus
	subs x6,x19,x6
	ldr x1,[x29,#96] // pull rp
	sbcs x7,x20,x7
	stp xzr,xzr,[sp,#8*0]
	sbcs x8,x21,x8
	stp xzr,xzr,[sp,#8*2]
	sbcs x9,x22,x9
	stp xzr,xzr,[sp,#8*4]
	sbcs x10,x23,x10
	stp xzr,xzr,[sp,#8*6]
	sbcs x11,x24,x11
	stp xzr,xzr,[sp,#8*8]
	sbcs x12,x25,x12
	stp xzr,xzr,[sp,#8*10]
	sbcs x13,x26,x13
	stp xzr,xzr,[sp,#8*12]
	sbcs x28,x28,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel x6,x19,x6,lo
	csel x7,x20,x7,lo
	csel x8,x21,x8,lo
	csel x9,x22,x9,lo
	stp x6,x7,[x1,#8*0]
	csel x10,x23,x10,lo
	csel x11,x24,x11,lo
	stp x8,x9,[x1,#8*2]
	csel x12,x25,x12,lo
	csel x13,x26,x13,lo
	stp x10,x11,[x1,#8*4]
	stp x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]

	sub x26,sp,x5,lsl#3
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	sub sp,x26,#8*4 // alloca

	add x10,x2,x5
	add x27,x1,x5
	stp x0,x10,[x29,#96] // offload rp and &b[num]

	ldr x24,[x2,#8*0] // b[0]
	ldp x6,x7,[x1,#8*0] // a[0..3]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	mov x19,xzr
	mov x20,xzr
	mov x21,xzr
	mov x22,xzr
	ldp x14,x15,[x3,#8*0] // n[0..3]
	ldp x16,x17,[x3,#8*2]
	adds x3,x3,#8*4 // clear carry bit
	mov x0,xzr
	mov x28,#0
	mov x26,sp

.Loop_mul4x_1st_reduction:
	mul x10,x6,x24 // lo(a[0..3]*b[0])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[0..3]*b[0])
	adcs x20,x20,x11
	mul x25,x19,x4 // t[0]*n0
	adcs x21,x21,x12
	umulh x11,x7,x24
	adcs x22,x22,x13
	umulh x12,x8,x24
	adc x23,xzr,xzr
	umulh x13,x9,x24
	ldr x24,[x2,x28] // next b[i] (or b[0])
	adds x20,x20,x10
	// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
	str x25,[x26],#8 // put aside t[0]*n0 for tail processing
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	// (*) adds xzr,x19,x10
	subs xzr,x19,#1 // (*)
	umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
	adcs x19,x20,x11
	umulh x11,x15,x25
	adcs x20,x21,x12
	umulh x12,x16,x25
	adcs x21,x22,x13
	umulh x13,x17,x25
	adcs x22,x23,x0
	adc x0,xzr,xzr
	adds x19,x19,x10
	sub x10,x27,x1
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_1st_reduction

	cbz x10,.Lmul4x4_post_condition

	ldp x6,x7,[x1,#8*0] // a[4..7]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	ldr x25,[sp] // a[0]*n0
	ldp x14,x15,[x3,#8*0] // n[4..7]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul x10,x6,x24 // lo(a[4..7]*b[i])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[4..7]*b[i])
	adcs x20,x20,x11
	umulh x11,x7,x24
	adcs x21,x21,x12
	umulh x12,x8,x24
	adcs x22,x22,x13
	umulh x13,x9,x24
	adc x23,xzr,xzr
	ldr x24,[x2,x28] // next b[i] (or b[0])
	adds x20,x20,x10
	mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	adds x19,x19,x10
	umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
	adcs x20,x20,x11
	umulh x11,x15,x25
	adcs x21,x21,x12
	umulh x12,x16,x25
	adcs x22,x22,x13
	adcs x23,x23,x0
	umulh x13,x17,x25
	adc x0,xzr,xzr
	ldr x25,[sp,x28] // next t[0]*n0
	str x19,[x26],#8 // result!!!
	adds x19,x20,x10
	sub x10,x27,x1 // done yet?
	adcs x20,x21,x11
	adcs x21,x22,x12
	adcs x22,x23,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_1st_tail

	sub x11,x27,x5 // rewound x1
	cbz x10,.Lmul4x_proceed

	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	ldp x14,x15,[x3,#8*0]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	b .Loop_mul4x_1st_tail

.align 5
.Lmul4x_proceed:
	ldr x24,[x2,#8*4]! // *++b
	adc x30,x0,xzr
	ldp x6,x7,[x11,#8*0] // a[0..3]
	sub x3,x3,x5 // rewind np
	ldp x8,x9,[x11,#8*2]
	add x1,x11,#8*4

	stp x19,x20,[x26,#8*0] // result!!!
	ldp x19,x20,[sp,#8*4] // t[0..3]
	stp x21,x22,[x26,#8*2] // result!!!
	ldp x21,x22,[sp,#8*6]

	ldp x14,x15,[x3,#8*0] // n[0..3]
	mov x26,sp
	ldp x16,x17,[x3,#8*2]
	adds x3,x3,#8*4 // clear carry bit
	mov x0,xzr

.align 4
.Loop_mul4x_reduction:
	mul x10,x6,x24 // lo(a[0..3]*b[4])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[0..3]*b[4])
	adcs x20,x20,x11
	mul x25,x19,x4 // t[0]*n0
	adcs x21,x21,x12
	umulh x11,x7,x24
	adcs x22,x22,x13
	umulh x12,x8,x24
	adc x23,xzr,xzr
	umulh x13,x9,x24
	ldr x24,[x2,x28] // next b[i]
	adds x20,x20,x10
	// (*) mul x10,x14,x25
	str x25,[x26],#8 // put aside t[0]*n0 for tail processing
	adcs x21,x21,x11
	mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	// (*) adds xzr,x19,x10
	subs xzr,x19,#1 // (*)
	umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
	adcs x19,x20,x11
	umulh x11,x15,x25
	adcs x20,x21,x12
	umulh x12,x16,x25
	adcs x21,x22,x13
	umulh x13,x17,x25
	adcs x22,x23,x0
	adc x0,xzr,xzr
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_reduction

	adc x0,x0,xzr
	ldp x10,x11,[x26,#8*4] // t[4..7]
	ldp x12,x13,[x26,#8*6]
	ldp x6,x7,[x1,#8*0] // a[4..7]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr

	ldr x25,[sp] // t[0]*n0
	ldp x14,x15,[x3,#8*0] // n[4..7]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4

.align 4
.Loop_mul4x_tail:
	mul x10,x6,x24 // lo(a[4..7]*b[4])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[4..7]*b[4])
	adcs x20,x20,x11
	umulh x11,x7,x24
	adcs x21,x21,x12
	umulh x12,x8,x24
	adcs x22,x22,x13
	umulh x13,x9,x24
	adc x23,xzr,xzr
	ldr x24,[x2,x28] // next b[i]
	adds x20,x20,x10
	mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	adds x19,x19,x10
	umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
	adcs x20,x20,x11
	umulh x11,x15,x25
	adcs x21,x21,x12
	umulh x12,x16,x25
	adcs x22,x22,x13
	umulh x13,x17,x25
	adcs x23,x23,x0
	ldr x25,[sp,x28] // next a[0]*n0
	adc x0,xzr,xzr
	str x19,[x26],#8 // result!!!
	adds x19,x20,x10
	sub x10,x27,x1 // done yet?
	adcs x20,x21,x11
	adcs x21,x22,x12
	adcs x22,x23,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_tail

	sub x11,x3,x5 // rewound np?
	adc x0,x0,xzr
	cbz x10,.Loop_mul4x_break

	ldp x10,x11,[x26,#8*4]
	ldp x12,x13,[x26,#8*6]
	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	ldp x14,x15,[x3,#8*0]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	b .Loop_mul4x_tail

.align 4
.Loop_mul4x_break:
	ldp x12,x13,[x29,#96] // pull rp and &b[num]
	adds x19,x19,x30
	add x2,x2,#8*4 // bp++
	adcs x20,x20,xzr
	sub x1,x1,x5 // rewind ap
	adcs x21,x21,xzr
	stp x19,x20,[x26,#8*0] // result!!!
	adcs x22,x22,xzr
	ldp x19,x20,[sp,#8*4] // t[0..3]
	adc x30,x0,xzr
	stp x21,x22,[x26,#8*2] // result!!!
	cmp x2,x13 // done yet?
	ldp x21,x22,[sp,#8*6]
	ldp x14,x15,[x11,#8*0] // n[0..3]
	ldp x16,x17,[x11,#8*2]
	add x3,x11,#8*4
	b.eq .Lmul4x_post

	ldr x24,[x2]
	ldp x6,x7,[x1,#8*0] // a[0..3]
	ldp x8,x9,[x1,#8*2]
	adds x1,x1,#8*4 // clear carry bit
	mov x0,xzr
	mov x26,sp
	b .Loop_mul4x_reduction

.align 4
.Lmul4x_post:
	// Final step. If the result is not smaller than the modulus,
	// subtract the modulus. Since a comparison is itself a
	// subtraction, we subtract the modulus unconditionally, check
	// whether it borrowed, and conditionally copy the original
	// value back.
	mov x0,x12
	mov x27,x12 // x0 copy
	subs x10,x19,x14
	add x26,sp,#8*8
	sbcs x11,x20,x15
	sub x28,x5,#8*4

.Lmul4x_sub:
	sbcs x12,x21,x16
	ldp x14,x15,[x3,#8*0]
	sub x28,x28,#8*4
	ldp x19,x20,[x26,#8*0]
	sbcs x13,x22,x17
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	ldp x21,x22,[x26,#8*2]
	add x26,x26,#8*4
	stp x10,x11,[x0,#8*0]
	sbcs x10,x19,x14
	stp x12,x13,[x0,#8*2]
	add x0,x0,#8*4
	sbcs x11,x20,x15
	cbnz x28,.Lmul4x_sub

	sbcs x12,x21,x16
	mov x26,sp
	add x1,sp,#8*4
	ldp x6,x7,[x27,#8*0]
	sbcs x13,x22,x17
	stp x10,x11,[x0,#8*0]
	ldp x8,x9,[x27,#8*2]
	stp x12,x13,[x0,#8*2]
	ldp x19,x20,[x1,#8*0]
	ldp x21,x22,[x1,#8*2]
	sbcs xzr,x30,xzr // did it borrow?
	ldr x30,[x29,#8] // pull return address

	sub x28,x5,#8*4
.Lmul4x_cond_copy:
	sub x28,x28,#8*4
	csel x10,x19,x6,lo
	stp xzr,xzr,[x26,#8*0]
	csel x11,x20,x7,lo
	ldp x6,x7,[x27,#8*4]
	ldp x19,x20,[x1,#8*4]
	csel x12,x21,x8,lo
	stp xzr,xzr,[x26,#8*2]
	add x26,x26,#8*4
	csel x13,x22,x9,lo
	ldp x8,x9,[x27,#8*6]
	ldp x21,x22,[x1,#8*6]
	add x1,x1,#8*4
	stp x10,x11,[x27,#8*0]
	stp x12,x13,[x27,#8*2]
	add x27,x27,#8*4
	cbnz x28,.Lmul4x_cond_copy

	csel x10,x19,x6,lo
	stp xzr,xzr,[x26,#8*0]
	csel x11,x20,x7,lo
	stp xzr,xzr,[x26,#8*2]
	csel x12,x21,x8,lo
	stp xzr,xzr,[x26,#8*3]
	csel x13,x22,x9,lo
	stp xzr,xzr,[x26,#8*4]
	stp x10,x11,[x27,#8*0]
	stp x12,x13,[x27,#8*2]

	b .Lmul4x_done

.align 4
.Lmul4x4_post_condition:
	adc x0,x0,xzr
	ldr x1,[x29,#96] // pull rp
	// x19-x22,x0 hold the result, x14-x17 hold the modulus
	subs x6,x19,x14
	ldr x30,[x29,#8] // pull return address
	sbcs x7,x20,x15
	stp xzr,xzr,[sp,#8*0]
	sbcs x8,x21,x16
	stp xzr,xzr,[sp,#8*2]
	sbcs x9,x22,x17
	stp xzr,xzr,[sp,#8*4]
	sbcs xzr,x0,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel x6,x19,x6,lo
	csel x7,x20,x7,lo
	csel x8,x21,x8,lo
	csel x9,x22,x9,lo
	stp x6,x7,[x1,#8*0]
	stp x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif
#endif // !OPENSSL_NO_ASM