// Montgomery multiplication for ARMv8, generated from OpenSSL/BoringSSL armv8-mont.pl
// (Apple/Mach-O flavor: underscore-prefixed globals, L-prefixed local labels).
Adam Langleyfad63272015-11-12 12:15:39 -08001.text
2
Robert Sloan8ff03552017-06-14 12:40:58 -07003.globl _bn_mul_mont
4.private_extern _bn_mul_mont
5
Adam Langleyfad63272015-11-12 12:15:39 -08006.align 5
Robert Sloan8ff03552017-06-14 12:40:58 -07007_bn_mul_mont:
Adam Langleyfad63272015-11-12 12:15:39 -08008 tst x5,#7
9 b.eq __bn_sqr8x_mont
10 tst x5,#3
11 b.eq __bn_mul4x_mont
Robert Sloan8ff03552017-06-14 12:40:58 -070012Lmul_mont:
Adam Langleyfad63272015-11-12 12:15:39 -080013 stp x29,x30,[sp,#-64]!
14 add x29,sp,#0
15 stp x19,x20,[sp,#16]
16 stp x21,x22,[sp,#32]
17 stp x23,x24,[sp,#48]
18
19 ldr x9,[x2],#8 // bp[0]
20 sub x22,sp,x5,lsl#3
21 ldp x7,x8,[x1],#16 // ap[0..1]
22 lsl x5,x5,#3
23 ldr x4,[x4] // *n0
24 and x22,x22,#-16 // ABI says so
25 ldp x13,x14,[x3],#16 // np[0..1]
26
27 mul x6,x7,x9 // ap[0]*bp[0]
28 sub x21,x5,#16 // j=num-2
29 umulh x7,x7,x9
30 mul x10,x8,x9 // ap[1]*bp[0]
31 umulh x11,x8,x9
32
33 mul x15,x6,x4 // "tp[0]"*n0
34 mov sp,x22 // alloca
35
36 // (*) mul x12,x13,x15 // np[0]*m1
37 umulh x13,x13,x15
38 mul x16,x14,x15 // np[1]*m1
39 // (*) adds x12,x12,x6 // discarded
40 // (*) As for removal of first multiplication and addition
41 // instructions. The outcome of first addition is
42 // guaranteed to be zero, which leaves two computationally
43 // significant outcomes: it either carries or not. Then
44 // question is when does it carry? Is there alternative
45 // way to deduce it? If you follow operations, you can
46 // observe that condition for carry is quite simple:
47 // x6 being non-zero. So that carry can be calculated
48 // by adding -1 to x6. That's what next instruction does.
49 subs xzr,x6,#1 // (*)
50 umulh x17,x14,x15
51 adc x13,x13,xzr
Robert Sloan8ff03552017-06-14 12:40:58 -070052 cbz x21,L1st_skip
Adam Langleyfad63272015-11-12 12:15:39 -080053
Robert Sloan8ff03552017-06-14 12:40:58 -070054L1st:
Adam Langleyfad63272015-11-12 12:15:39 -080055 ldr x8,[x1],#8
56 adds x6,x10,x7
57 sub x21,x21,#8 // j--
58 adc x7,x11,xzr
59
60 ldr x14,[x3],#8
61 adds x12,x16,x13
62 mul x10,x8,x9 // ap[j]*bp[0]
63 adc x13,x17,xzr
64 umulh x11,x8,x9
65
66 adds x12,x12,x6
67 mul x16,x14,x15 // np[j]*m1
68 adc x13,x13,xzr
69 umulh x17,x14,x15
70 str x12,[x22],#8 // tp[j-1]
Robert Sloan8ff03552017-06-14 12:40:58 -070071 cbnz x21,L1st
Adam Langleyfad63272015-11-12 12:15:39 -080072
Robert Sloan8ff03552017-06-14 12:40:58 -070073L1st_skip:
Adam Langleyfad63272015-11-12 12:15:39 -080074 adds x6,x10,x7
75 sub x1,x1,x5 // rewind x1
76 adc x7,x11,xzr
77
78 adds x12,x16,x13
79 sub x3,x3,x5 // rewind x3
80 adc x13,x17,xzr
81
82 adds x12,x12,x6
83 sub x20,x5,#8 // i=num-1
84 adcs x13,x13,x7
85
86 adc x19,xzr,xzr // upmost overflow bit
87 stp x12,x13,[x22]
88
Robert Sloan8ff03552017-06-14 12:40:58 -070089Louter:
Adam Langleyfad63272015-11-12 12:15:39 -080090 ldr x9,[x2],#8 // bp[i]
91 ldp x7,x8,[x1],#16
92 ldr x23,[sp] // tp[0]
93 add x22,sp,#8
94
95 mul x6,x7,x9 // ap[0]*bp[i]
96 sub x21,x5,#16 // j=num-2
97 umulh x7,x7,x9
98 ldp x13,x14,[x3],#16
99 mul x10,x8,x9 // ap[1]*bp[i]
100 adds x6,x6,x23
101 umulh x11,x8,x9
102 adc x7,x7,xzr
103
104 mul x15,x6,x4
105 sub x20,x20,#8 // i--
106
107 // (*) mul x12,x13,x15 // np[0]*m1
108 umulh x13,x13,x15
109 mul x16,x14,x15 // np[1]*m1
110 // (*) adds x12,x12,x6
111 subs xzr,x6,#1 // (*)
112 umulh x17,x14,x15
Robert Sloan8ff03552017-06-14 12:40:58 -0700113 cbz x21,Linner_skip
Adam Langleyfad63272015-11-12 12:15:39 -0800114
Robert Sloan8ff03552017-06-14 12:40:58 -0700115Linner:
Adam Langleyfad63272015-11-12 12:15:39 -0800116 ldr x8,[x1],#8
117 adc x13,x13,xzr
118 ldr x23,[x22],#8 // tp[j]
119 adds x6,x10,x7
120 sub x21,x21,#8 // j--
121 adc x7,x11,xzr
122
123 adds x12,x16,x13
124 ldr x14,[x3],#8
125 adc x13,x17,xzr
126
127 mul x10,x8,x9 // ap[j]*bp[i]
128 adds x6,x6,x23
129 umulh x11,x8,x9
130 adc x7,x7,xzr
131
132 mul x16,x14,x15 // np[j]*m1
133 adds x12,x12,x6
134 umulh x17,x14,x15
135 str x12,[x22,#-16] // tp[j-1]
Robert Sloan8ff03552017-06-14 12:40:58 -0700136 cbnz x21,Linner
Adam Langleyfad63272015-11-12 12:15:39 -0800137
Robert Sloan8ff03552017-06-14 12:40:58 -0700138Linner_skip:
Adam Langleyfad63272015-11-12 12:15:39 -0800139 ldr x23,[x22],#8 // tp[j]
140 adc x13,x13,xzr
141 adds x6,x10,x7
142 sub x1,x1,x5 // rewind x1
143 adc x7,x11,xzr
144
145 adds x12,x16,x13
146 sub x3,x3,x5 // rewind x3
147 adcs x13,x17,x19
148 adc x19,xzr,xzr
149
150 adds x6,x6,x23
151 adc x7,x7,xzr
152
153 adds x12,x12,x6
154 adcs x13,x13,x7
155 adc x19,x19,xzr // upmost overflow bit
156 stp x12,x13,[x22,#-16]
157
Robert Sloan8ff03552017-06-14 12:40:58 -0700158 cbnz x20,Louter
Adam Langleyfad63272015-11-12 12:15:39 -0800159
160 // Final step. We see if result is larger than modulus, and
161 // if it is, subtract the modulus. But comparison implies
162 // subtraction. So we subtract modulus, see if it borrowed,
163 // and conditionally copy original value.
164 ldr x23,[sp] // tp[0]
165 add x22,sp,#8
166 ldr x14,[x3],#8 // np[0]
167 subs x21,x5,#8 // j=num-1 and clear borrow
168 mov x1,x0
Robert Sloan8ff03552017-06-14 12:40:58 -0700169Lsub:
Adam Langleyfad63272015-11-12 12:15:39 -0800170 sbcs x8,x23,x14 // tp[j]-np[j]
171 ldr x23,[x22],#8
172 sub x21,x21,#8 // j--
173 ldr x14,[x3],#8
174 str x8,[x1],#8 // rp[j]=tp[j]-np[j]
Robert Sloan8ff03552017-06-14 12:40:58 -0700175 cbnz x21,Lsub
Adam Langleyfad63272015-11-12 12:15:39 -0800176
177 sbcs x8,x23,x14
178 sbcs x19,x19,xzr // did it borrow?
179 str x8,[x1],#8 // rp[num-1]
180
181 ldr x23,[sp] // tp[0]
182 add x22,sp,#8
183 ldr x8,[x0],#8 // rp[0]
184 sub x5,x5,#8 // num--
185 nop
Robert Sloan8ff03552017-06-14 12:40:58 -0700186Lcond_copy:
Adam Langleyfad63272015-11-12 12:15:39 -0800187 sub x5,x5,#8 // num--
188 csel x14,x23,x8,lo // did it borrow?
189 ldr x23,[x22],#8
190 ldr x8,[x0],#8
191 str xzr,[x22,#-16] // wipe tp
192 str x14,[x0,#-16]
Robert Sloan8ff03552017-06-14 12:40:58 -0700193 cbnz x5,Lcond_copy
Adam Langleyfad63272015-11-12 12:15:39 -0800194
195 csel x14,x23,x8,lo
196 str xzr,[x22,#-8] // wipe tp
197 str x14,[x0,#-8]
198
199 ldp x19,x20,[x29,#16]
200 mov sp,x29
201 ldp x21,x22,[x29,#32]
202 mov x0,#1
203 ldp x23,x24,[x29,#48]
204 ldr x29,[sp],#64
205 ret
Robert Sloan8ff03552017-06-14 12:40:58 -0700206
207
//-----------------------------------------------------------------------
// __bn_sqr8x_mont: Montgomery squaring, taken when num%8==0 and ap==bp
// (falls through to __bn_mul4x_mont when ap!=bp).
// Same register arguments as bn_mul_mont: x0=rp, x1=ap, x2=bp, x3=np,
// x4=&n0, x5=num.  Uses callee-saved x19-x28 (saved in the 128-byte
// frame) plus x30 as a scratch top-most-carry register after the
// return address has been stashed in the frame.
// Strategy: compute the off-diagonal products a[i]*a[j] (i>j) once,
// double them with the shift-and-add pass, add the squares a[i]*a[i],
// then reduce 512 bits per iteration, and finish with the usual
// subtract-modulus / conditional-copy step.
//-----------------------------------------------------------------------
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2			// squaring only valid when ap==bp
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]		// a[0..7]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// room for tp[2*num]
	lsl	x5,x5,#3		// num in bytes from here on
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:	// zero the 2*num-word temporary, 16 words per pass
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5		// &a[num]
	add	x1,x1,#8*8
	mov	x19,xzr			// t[0..7] accumulators
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
	// product schedule for one 8-word window:
	//	a[1]a[0]	(i)
	//	a[2]a[0]
	//	a[3]a[0]
	//	a[4]a[0]
	//	a[5]a[0]
	//	a[6]a[0]
	//	a[7]a[0]
	//	a[2]a[1]	(ii)
	//	a[3]a[1]
	//	a[4]a[1]
	//	a[5]a[1]
	//	a[6]a[1]
	//	a[7]a[1]
	//	a[3]a[2]	(iii)
	//	a[4]a[2]
	//	a[5]a[2]
	//	a[6]a[2]
	//	a[7]a[2]
	//	a[4]a[3]	(iv)
	//	a[5]a[3]
	//	a[6]a[3]
	//	a[7]a[3]
	//	a[5]a[4]	(v)
	//	a[6]a[4]
	//	a[7]a[4]
	//	a[6]a[5]	(vi)
	//	a[7]a[5]
	//	a[7]a[6]	(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//	a[8]a[0]
	//	a[9]a[0]
	//	a[a]a[0]
	//	a[b]a[0]
	//	a[c]a[0]
	//	a[d]a[0]
	//	a[e]a[0]
	//	a[f]a[0]
	//	a[8]a[1]
	//	a[f]a[1]........................
	//	a[8]a[2]
	//	a[f]a[2]........................
	//	a[8]a[3]
	//	a[f]a[3]........................
	//	a[8]a[4]
	//	a[f]a[4]........................
	//	a[8]a[5]
	//	a[f]a[5]........................
	//	a[8]a[6]
	//	a[f]a[6]........................
	//	a[8]a[7]
	//	a[f]a[7]........................
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63		// shift the cross-products left by 1
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:	// t[] = 2*t[] + a[i]^2, four words per pass
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14	// discarded, see (*) note in bn_mul_mont
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:	// constant-time select between t[] and rp[], wiping t[]
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:	// num==8 special case: result still in registers
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-26,x28 hold result, x6-13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]	// wipe tp while subtracting
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// de-alloca
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret

Adam Langleyfad63272015-11-12 12:15:39 -0800966.align 5
967__bn_mul4x_mont:
968 stp x29,x30,[sp,#-128]!
969 add x29,sp,#0
970 stp x19,x20,[sp,#16]
971 stp x21,x22,[sp,#32]
972 stp x23,x24,[sp,#48]
973 stp x25,x26,[sp,#64]
974 stp x27,x28,[sp,#80]
975
976 sub x26,sp,x5,lsl#3
977 lsl x5,x5,#3
978 ldr x4,[x4] // *n0
979 sub sp,x26,#8*4 // alloca
980
981 add x10,x2,x5
982 add x27,x1,x5
983 stp x0,x10,[x29,#96] // offload rp and &b[num]
984
985 ldr x24,[x2,#8*0] // b[0]
986 ldp x6,x7,[x1,#8*0] // a[0..3]
987 ldp x8,x9,[x1,#8*2]
988 add x1,x1,#8*4
989 mov x19,xzr
990 mov x20,xzr
991 mov x21,xzr
992 mov x22,xzr
993 ldp x14,x15,[x3,#8*0] // n[0..3]
994 ldp x16,x17,[x3,#8*2]
995 adds x3,x3,#8*4 // clear carry bit
996 mov x0,xzr
997 mov x28,#0
998 mov x26,sp
999
Robert Sloan8ff03552017-06-14 12:40:58 -07001000Loop_mul4x_1st_reduction:
Adam Langleyfad63272015-11-12 12:15:39 -08001001 mul x10,x6,x24 // lo(a[0..3]*b[0])
1002 adc x0,x0,xzr // modulo-scheduled
1003 mul x11,x7,x24
1004 add x28,x28,#8
1005 mul x12,x8,x24
1006 and x28,x28,#31
1007 mul x13,x9,x24
1008 adds x19,x19,x10
1009 umulh x10,x6,x24 // hi(a[0..3]*b[0])
1010 adcs x20,x20,x11
1011 mul x25,x19,x4 // t[0]*n0
1012 adcs x21,x21,x12
1013 umulh x11,x7,x24
1014 adcs x22,x22,x13
1015 umulh x12,x8,x24
1016 adc x23,xzr,xzr
1017 umulh x13,x9,x24
1018 ldr x24,[x2,x28] // next b[i] (or b[0])
1019 adds x20,x20,x10
1020 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
1021 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1022 adcs x21,x21,x11
1023 mul x11,x15,x25
1024 adcs x22,x22,x12
1025 mul x12,x16,x25
1026 adc x23,x23,x13 // can't overflow
1027 mul x13,x17,x25
1028 // (*) adds xzr,x19,x10
1029 subs xzr,x19,#1 // (*)
1030 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1031 adcs x19,x20,x11
1032 umulh x11,x15,x25
1033 adcs x20,x21,x12
1034 umulh x12,x16,x25
1035 adcs x21,x22,x13
1036 umulh x13,x17,x25
1037 adcs x22,x23,x0
1038 adc x0,xzr,xzr
1039 adds x19,x19,x10
1040 sub x10,x27,x1
1041 adcs x20,x20,x11
1042 adcs x21,x21,x12
1043 adcs x22,x22,x13
1044 //adc x0,x0,xzr
Robert Sloan8ff03552017-06-14 12:40:58 -07001045 cbnz x28,Loop_mul4x_1st_reduction
Adam Langleyfad63272015-11-12 12:15:39 -08001046
Robert Sloan8ff03552017-06-14 12:40:58 -07001047 cbz x10,Lmul4x4_post_condition
Adam Langleyfad63272015-11-12 12:15:39 -08001048
1049 ldp x6,x7,[x1,#8*0] // a[4..7]
1050 ldp x8,x9,[x1,#8*2]
1051 add x1,x1,#8*4
1052 ldr x25,[sp] // a[0]*n0
1053 ldp x14,x15,[x3,#8*0] // n[4..7]
1054 ldp x16,x17,[x3,#8*2]
1055 add x3,x3,#8*4
1056
Robert Sloan8ff03552017-06-14 12:40:58 -07001057Loop_mul4x_1st_tail:
Adam Langleyfad63272015-11-12 12:15:39 -08001058 mul x10,x6,x24 // lo(a[4..7]*b[i])
1059 adc x0,x0,xzr // modulo-scheduled
1060 mul x11,x7,x24
1061 add x28,x28,#8
1062 mul x12,x8,x24
1063 and x28,x28,#31
1064 mul x13,x9,x24
1065 adds x19,x19,x10
1066 umulh x10,x6,x24 // hi(a[4..7]*b[i])
1067 adcs x20,x20,x11
1068 umulh x11,x7,x24
1069 adcs x21,x21,x12
1070 umulh x12,x8,x24
1071 adcs x22,x22,x13
1072 umulh x13,x9,x24
1073 adc x23,xzr,xzr
1074 ldr x24,[x2,x28] // next b[i] (or b[0])
1075 adds x20,x20,x10
1076 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
1077 adcs x21,x21,x11
1078 mul x11,x15,x25
1079 adcs x22,x22,x12
1080 mul x12,x16,x25
1081 adc x23,x23,x13 // can't overflow
1082 mul x13,x17,x25
1083 adds x19,x19,x10
1084 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
1085 adcs x20,x20,x11
1086 umulh x11,x15,x25
1087 adcs x21,x21,x12
1088 umulh x12,x16,x25
1089 adcs x22,x22,x13
1090 adcs x23,x23,x0
1091 umulh x13,x17,x25
1092 adc x0,xzr,xzr
1093 ldr x25,[sp,x28] // next t[0]*n0
1094 str x19,[x26],#8 // result!!!
1095 adds x19,x20,x10
1096 sub x10,x27,x1 // done yet?
1097 adcs x20,x21,x11
1098 adcs x21,x22,x12
1099 adcs x22,x23,x13
1100 //adc x0,x0,xzr
Robert Sloan8ff03552017-06-14 12:40:58 -07001101 cbnz x28,Loop_mul4x_1st_tail
Adam Langleyfad63272015-11-12 12:15:39 -08001102
1103 sub x11,x27,x5 // rewinded x1
Robert Sloan8ff03552017-06-14 12:40:58 -07001104 cbz x10,Lmul4x_proceed
Adam Langleyfad63272015-11-12 12:15:39 -08001105
1106 ldp x6,x7,[x1,#8*0]
1107 ldp x8,x9,[x1,#8*2]
1108 add x1,x1,#8*4
1109 ldp x14,x15,[x3,#8*0]
1110 ldp x16,x17,[x3,#8*2]
1111 add x3,x3,#8*4
Robert Sloan8ff03552017-06-14 12:40:58 -07001112 b Loop_mul4x_1st_tail
Adam Langleyfad63272015-11-12 12:15:39 -08001113
1114.align 5
Robert Sloan8ff03552017-06-14 12:40:58 -07001115Lmul4x_proceed:
Adam Langleyfad63272015-11-12 12:15:39 -08001116 ldr x24,[x2,#8*4]! // *++b
1117 adc x30,x0,xzr
1118 ldp x6,x7,[x11,#8*0] // a[0..3]
1119 sub x3,x3,x5 // rewind np
1120 ldp x8,x9,[x11,#8*2]
1121 add x1,x11,#8*4
1122
1123 stp x19,x20,[x26,#8*0] // result!!!
1124 ldp x19,x20,[sp,#8*4] // t[0..3]
1125 stp x21,x22,[x26,#8*2] // result!!!
1126 ldp x21,x22,[sp,#8*6]
1127
1128 ldp x14,x15,[x3,#8*0] // n[0..3]
1129 mov x26,sp
1130 ldp x16,x17,[x3,#8*2]
1131 adds x3,x3,#8*4 // clear carry bit
1132 mov x0,xzr
1133
1134.align 4
Robert Sloan8ff03552017-06-14 12:40:58 -07001135Loop_mul4x_reduction:
Adam Langleyfad63272015-11-12 12:15:39 -08001136 mul x10,x6,x24 // lo(a[0..3]*b[4])
1137 adc x0,x0,xzr // modulo-scheduled
1138 mul x11,x7,x24
1139 add x28,x28,#8
1140 mul x12,x8,x24
1141 and x28,x28,#31
1142 mul x13,x9,x24
1143 adds x19,x19,x10
1144 umulh x10,x6,x24 // hi(a[0..3]*b[4])
1145 adcs x20,x20,x11
1146 mul x25,x19,x4 // t[0]*n0
1147 adcs x21,x21,x12
1148 umulh x11,x7,x24
1149 adcs x22,x22,x13
1150 umulh x12,x8,x24
1151 adc x23,xzr,xzr
1152 umulh x13,x9,x24
1153 ldr x24,[x2,x28] // next b[i]
1154 adds x20,x20,x10
1155 // (*) mul x10,x14,x25
1156 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1157 adcs x21,x21,x11
1158 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
1159 adcs x22,x22,x12
1160 mul x12,x16,x25
1161 adc x23,x23,x13 // can't overflow
1162 mul x13,x17,x25
1163 // (*) adds xzr,x19,x10
1164 subs xzr,x19,#1 // (*)
1165 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
1166 adcs x19,x20,x11
1167 umulh x11,x15,x25
1168 adcs x20,x21,x12
1169 umulh x12,x16,x25
1170 adcs x21,x22,x13
1171 umulh x13,x17,x25
1172 adcs x22,x23,x0
1173 adc x0,xzr,xzr
1174 adds x19,x19,x10
1175 adcs x20,x20,x11
1176 adcs x21,x21,x12
1177 adcs x22,x22,x13
1178 //adc x0,x0,xzr
Robert Sloan8ff03552017-06-14 12:40:58 -07001179 cbnz x28,Loop_mul4x_reduction
Adam Langleyfad63272015-11-12 12:15:39 -08001180
1181 adc x0,x0,xzr
1182 ldp x10,x11,[x26,#8*4] // t[4..7]
1183 ldp x12,x13,[x26,#8*6]
1184 ldp x6,x7,[x1,#8*0] // a[4..7]
1185 ldp x8,x9,[x1,#8*2]
1186 add x1,x1,#8*4
1187 adds x19,x19,x10
1188 adcs x20,x20,x11
1189 adcs x21,x21,x12
1190 adcs x22,x22,x13
1191 //adc x0,x0,xzr
1192
1193 ldr x25,[sp] // t[0]*n0
1194 ldp x14,x15,[x3,#8*0] // n[4..7]
1195 ldp x16,x17,[x3,#8*2]
1196 add x3,x3,#8*4
1197
1198.align 4
Robert Sloan8ff03552017-06-14 12:40:58 -07001199Loop_mul4x_tail:
Adam Langleyfad63272015-11-12 12:15:39 -08001200 mul x10,x6,x24 // lo(a[4..7]*b[4])
1201 adc x0,x0,xzr // modulo-scheduled
1202 mul x11,x7,x24
1203 add x28,x28,#8
1204 mul x12,x8,x24
1205 and x28,x28,#31
1206 mul x13,x9,x24
1207 adds x19,x19,x10
1208 umulh x10,x6,x24 // hi(a[4..7]*b[4])
1209 adcs x20,x20,x11
1210 umulh x11,x7,x24
1211 adcs x21,x21,x12
1212 umulh x12,x8,x24
1213 adcs x22,x22,x13
1214 umulh x13,x9,x24
1215 adc x23,xzr,xzr
1216 ldr x24,[x2,x28] // next b[i]
1217 adds x20,x20,x10
1218 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
1219 adcs x21,x21,x11
1220 mul x11,x15,x25
1221 adcs x22,x22,x12
1222 mul x12,x16,x25
1223 adc x23,x23,x13 // can't overflow
1224 mul x13,x17,x25
1225 adds x19,x19,x10
1226 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
1227 adcs x20,x20,x11
1228 umulh x11,x15,x25
1229 adcs x21,x21,x12
1230 umulh x12,x16,x25
1231 adcs x22,x22,x13
1232 umulh x13,x17,x25
1233 adcs x23,x23,x0
1234 ldr x25,[sp,x28] // next a[0]*n0
1235 adc x0,xzr,xzr
1236 str x19,[x26],#8 // result!!!
1237 adds x19,x20,x10
1238 sub x10,x27,x1 // done yet?
1239 adcs x20,x21,x11
1240 adcs x21,x22,x12
1241 adcs x22,x23,x13
1242 //adc x0,x0,xzr
Robert Sloan8ff03552017-06-14 12:40:58 -07001243 cbnz x28,Loop_mul4x_tail
Adam Langleyfad63272015-11-12 12:15:39 -08001244
1245 sub x11,x3,x5 // rewinded np?
1246 adc x0,x0,xzr
Robert Sloan8ff03552017-06-14 12:40:58 -07001247 cbz x10,Loop_mul4x_break
Adam Langleyfad63272015-11-12 12:15:39 -08001248
1249 ldp x10,x11,[x26,#8*4]
1250 ldp x12,x13,[x26,#8*6]
1251 ldp x6,x7,[x1,#8*0]
1252 ldp x8,x9,[x1,#8*2]
1253 add x1,x1,#8*4
1254 adds x19,x19,x10
1255 adcs x20,x20,x11
1256 adcs x21,x21,x12
1257 adcs x22,x22,x13
1258 //adc x0,x0,xzr
1259 ldp x14,x15,[x3,#8*0]
1260 ldp x16,x17,[x3,#8*2]
1261 add x3,x3,#8*4
Robert Sloan8ff03552017-06-14 12:40:58 -07001262 b Loop_mul4x_tail
Adam Langleyfad63272015-11-12 12:15:39 -08001263
1264.align 4
Robert Sloan8ff03552017-06-14 12:40:58 -07001265Loop_mul4x_break:
Adam Langleyfad63272015-11-12 12:15:39 -08001266 ldp x12,x13,[x29,#96] // pull rp and &b[num]
1267 adds x19,x19,x30
1268 add x2,x2,#8*4 // bp++
1269 adcs x20,x20,xzr
1270 sub x1,x1,x5 // rewind ap
1271 adcs x21,x21,xzr
1272 stp x19,x20,[x26,#8*0] // result!!!
1273 adcs x22,x22,xzr
1274 ldp x19,x20,[sp,#8*4] // t[0..3]
1275 adc x30,x0,xzr
1276 stp x21,x22,[x26,#8*2] // result!!!
1277 cmp x2,x13 // done yet?
1278 ldp x21,x22,[sp,#8*6]
1279 ldp x14,x15,[x11,#8*0] // n[0..3]
1280 ldp x16,x17,[x11,#8*2]
1281 add x3,x11,#8*4
Robert Sloan8ff03552017-06-14 12:40:58 -07001282 b.eq Lmul4x_post
Adam Langleyfad63272015-11-12 12:15:39 -08001283
1284 ldr x24,[x2]
1285 ldp x6,x7,[x1,#8*0] // a[0..3]
1286 ldp x8,x9,[x1,#8*2]
1287 adds x1,x1,#8*4 // clear carry bit
1288 mov x0,xzr
1289 mov x26,sp
Robert Sloan8ff03552017-06-14 12:40:58 -07001290 b Loop_mul4x_reduction
Adam Langleyfad63272015-11-12 12:15:39 -08001291
1292.align 4
Robert Sloan8ff03552017-06-14 12:40:58 -07001293Lmul4x_post:
Adam Langleyfad63272015-11-12 12:15:39 -08001294 // Final step. We see if result is larger than modulus, and
1295 // if it is, subtract the modulus. But comparison implies
1296 // subtraction. So we subtract modulus, see if it borrowed,
1297 // and conditionally copy original value.
1298 mov x0,x12
1299 mov x27,x12 // x0 copy
1300 subs x10,x19,x14
1301 add x26,sp,#8*8
1302 sbcs x11,x20,x15
1303 sub x28,x5,#8*4
1304
Robert Sloan8ff03552017-06-14 12:40:58 -07001305Lmul4x_sub:
Adam Langleyfad63272015-11-12 12:15:39 -08001306 sbcs x12,x21,x16
1307 ldp x14,x15,[x3,#8*0]
1308 sub x28,x28,#8*4
1309 ldp x19,x20,[x26,#8*0]
1310 sbcs x13,x22,x17
1311 ldp x16,x17,[x3,#8*2]
1312 add x3,x3,#8*4
1313 ldp x21,x22,[x26,#8*2]
1314 add x26,x26,#8*4
1315 stp x10,x11,[x0,#8*0]
1316 sbcs x10,x19,x14
1317 stp x12,x13,[x0,#8*2]
1318 add x0,x0,#8*4
1319 sbcs x11,x20,x15
Robert Sloan8ff03552017-06-14 12:40:58 -07001320 cbnz x28,Lmul4x_sub
Adam Langleyfad63272015-11-12 12:15:39 -08001321
1322 sbcs x12,x21,x16
1323 mov x26,sp
1324 add x1,sp,#8*4
1325 ldp x6,x7,[x27,#8*0]
1326 sbcs x13,x22,x17
1327 stp x10,x11,[x0,#8*0]
1328 ldp x8,x9,[x27,#8*2]
1329 stp x12,x13,[x0,#8*2]
1330 ldp x19,x20,[x1,#8*0]
1331 ldp x21,x22,[x1,#8*2]
1332 sbcs xzr,x30,xzr // did it borrow?
1333 ldr x30,[x29,#8] // pull return address
1334
1335 sub x28,x5,#8*4
Robert Sloan8ff03552017-06-14 12:40:58 -07001336Lmul4x_cond_copy:
Adam Langleyfad63272015-11-12 12:15:39 -08001337 sub x28,x28,#8*4
1338 csel x10,x19,x6,lo
1339 stp xzr,xzr,[x26,#8*0]
1340 csel x11,x20,x7,lo
1341 ldp x6,x7,[x27,#8*4]
1342 ldp x19,x20,[x1,#8*4]
1343 csel x12,x21,x8,lo
1344 stp xzr,xzr,[x26,#8*2]
1345 add x26,x26,#8*4
1346 csel x13,x22,x9,lo
1347 ldp x8,x9,[x27,#8*6]
1348 ldp x21,x22,[x1,#8*6]
1349 add x1,x1,#8*4
1350 stp x10,x11,[x27,#8*0]
1351 stp x12,x13,[x27,#8*2]
1352 add x27,x27,#8*4
Robert Sloan8ff03552017-06-14 12:40:58 -07001353 cbnz x28,Lmul4x_cond_copy
Adam Langleyfad63272015-11-12 12:15:39 -08001354
1355 csel x10,x19,x6,lo
1356 stp xzr,xzr,[x26,#8*0]
1357 csel x11,x20,x7,lo
1358 stp xzr,xzr,[x26,#8*2]
1359 csel x12,x21,x8,lo
1360 stp xzr,xzr,[x26,#8*3]
1361 csel x13,x22,x9,lo
1362 stp xzr,xzr,[x26,#8*4]
1363 stp x10,x11,[x27,#8*0]
1364 stp x12,x13,[x27,#8*2]
1365
Robert Sloan8ff03552017-06-14 12:40:58 -07001366 b Lmul4x_done
Adam Langleyfad63272015-11-12 12:15:39 -08001367
1368.align 4
Robert Sloan8ff03552017-06-14 12:40:58 -07001369Lmul4x4_post_condition:
Adam Langleyfad63272015-11-12 12:15:39 -08001370 adc x0,x0,xzr
1371 ldr x1,[x29,#96] // pull rp
1372 // x19-3,x0 hold result, x14-7 hold modulus
1373 subs x6,x19,x14
1374 ldr x30,[x29,#8] // pull return address
1375 sbcs x7,x20,x15
1376 stp xzr,xzr,[sp,#8*0]
1377 sbcs x8,x21,x16
1378 stp xzr,xzr,[sp,#8*2]
1379 sbcs x9,x22,x17
1380 stp xzr,xzr,[sp,#8*4]
1381 sbcs xzr,x0,xzr // did it borrow?
1382 stp xzr,xzr,[sp,#8*6]
1383
1384 // x6-3 hold result-modulus
1385 csel x6,x19,x6,lo
1386 csel x7,x20,x7,lo
1387 csel x8,x21,x8,lo
1388 csel x9,x22,x9,lo
1389 stp x6,x7,[x1,#8*0]
1390 stp x8,x9,[x1,#8*2]
1391
Robert Sloan8ff03552017-06-14 12:40:58 -07001392Lmul4x_done:
Adam Langleyfad63272015-11-12 12:15:39 -08001393 ldp x19,x20,[x29,#16]
1394 mov sp,x29
1395 ldp x21,x22,[x29,#32]
1396 mov x0,#1
1397 ldp x23,x24,[x29,#48]
1398 ldp x25,x26,[x29,#64]
1399 ldp x27,x28,[x29,#80]
1400 ldr x29,[sp],#128
1401 ret
Robert Sloan8ff03552017-06-14 12:40:58 -07001402
Adam Langleyfad63272015-11-12 12:15:39 -08001403.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1404.align 2
1405.align 4