#! /usr/bin/env perl
#
# April 2019
#
# Abstract: field arithmetic in aarch64 assembly for SIDH/p434

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$PREFIX="sike";

$code.=<<___;
.section .rodata

# p434 x 2
.Lp434x2:
    .quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
    .quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47
    .quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688

# p434 + 1
.Lp434p1:
    .quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3
    .quad 0x6CFC5FD681C52056, 0x0002341F27177344

.text
___
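
# The constants above come from the SIKEp434 prime p434 = 2^216*3^137 - 1.
# .Lp434x2 holds 2*p434; limbs 1 and 2 of that value are both
# 0xFFFFFFFFFFFFFFFF, so the table stores the value once and the field
# routines below reuse a single register for both limbs. .Lp434p1 holds only
# the four non-zero high limbs of p434+1 (its low 192 bits are zero), which
# is all the Montgomery reduction needs.
#
# Reference sketch, never called by this script: it recomputes both constants
# with Math::BigInt and dumps their little-endian 64-bit limbs to STDERR so
# they can be compared against the tables.
sub dump_p434_constants {
  require Math::BigInt;
  my $p = Math::BigInt->new(2)->bpow(216)
            ->bmul(Math::BigInt->new(3)->bpow(137))->bsub(1);
  my $mask = Math::BigInt->new(1)->blsft(64)->bsub(1);
  for my $pair (["p434 x 2", $p->copy->bmul(2)], ["p434 + 1", $p->copy->binc()]) {
    my ($name, $v) = @$pair;
    my @limbs;
    for (1 .. 7) {
      push @limbs, $v->copy->band($mask)->as_hex();
      $v->brsft(64);
    }
    print STDERR "$name: ", join(", ", @limbs), "\n";
  }
}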

# Computes C0-C2 = A0 * (B0-B1), accumulating the incoming C0-C2
# (the old C2 is folded into the low limb first).
# A0 is preserved; B0-B1 are clobbered.
sub mul64x128 {
  my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
  my $body=<<___;
    mul $T1, $A0, $B0
    umulh $B0, $A0, $B0
    adds $C0, $C0, $C2
    adc $C1, $C1, xzr

    mul $T0, $A0, $B1
    umulh $B1, $A0, $B1
    adds $C0, $C0, $T1
    adcs $C1, $C1, $B0
    adc $C2, xzr, xzr

    adds $C1, $C1, $T0
    adc $C2, $C2, $B1
___
  return $body;
}
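
# Reference sketch, never called by this script: a Math::BigInt model of the
# macro above, following the behaviour documented in its header comment (the
# incoming C0-C2 are accumulated, with the old C2 added into the low limb).
# Limbs are passed and returned as little-endian hex strings.
sub model_mul64x128 {
  require Math::BigInt;
  my ($a0, $b0, $b1, $c0, $c1, $c2) = map { Math::BigInt->from_hex($_) } @_;
  my $w = Math::BigInt->new(1)->blsft(64);            # 2^64
  my $acc = $a0 * ($b1 * $w + $b0) + $c1 * $w + $c0 + $c2;
  return map { ($acc->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 2;   # new C0-C2
}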

# Computes C0-C4 = A0 * (B0-B3)
# Inputs remain intact
sub mul64x256 {
  my ($A0,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2)=@_;
  my $body=<<___;
    mul $C0, $A0, $B0 // C0
    umulh $T0, $A0, $B0

    mul $C1, $A0, $B1
    umulh $T1, $A0, $B1
    adds $C1, $C1, $T0 // C1
    adc $T0, xzr, xzr

    mul $C2, $A0, $B2
    umulh $T2, $A0, $B2
    adds $T1, $T0, $T1
    adcs $C2, $C2, $T1 // C2
    adc $T0, xzr, xzr

    mul $C3, $A0, $B3
    umulh $C4, $A0, $B3
    adds $T2, $T0, $T2
    adcs $C3, $C3, $T2 // C3
    adc $C4, $C4, xzr // C4
___
  return $body;
}
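
# Reference sketch, never called by this script: mul64x256 as plain integer
# arithmetic. Limbs are little-endian hex strings; the 320-bit product comes
# back as the five limbs C0-C4.
sub model_mul64x256 {
  require Math::BigInt;
  my ($a0, @b) = map { Math::BigInt->from_hex($_) } @_;
  my $w = Math::BigInt->new(1)->blsft(64);             # 2^64
  my $bv = Math::BigInt->bzero();
  $bv = $bv * $w + $_ for reverse @b;                  # B0-B3 as a 256-bit value
  my $prod = $a0 * $bv;
  return map { ($prod->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 4;  # C0-C4
}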

# Computes C0-C5 = (A0-A1) * (B0-B3)
# Inputs remain intact
sub mul128x256 {
  my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
  my $body=<<___;
    mul $C0, $A0, $B0 // C0
    umulh $C3, $A0, $B0

    mul $C1, $A0, $B1
    umulh $C2, $A0, $B1

    mul $T0, $A1, $B0
    umulh $T1, $A1, $B0
    adds $C1, $C1, $C3
    adc $C2, $C2, xzr

    mul $T2, $A0, $B2
    umulh $T3, $A0, $B2
    adds $C1, $C1, $T0 // C1
    adcs $C2, $C2, $T1
    adc $C3, xzr, xzr

    mul $T0, $A1, $B1
    umulh $T1, $A1, $B1
    adds $C2, $C2, $T2
    adcs $C3, $C3, $T3
    adc $C4, xzr, xzr

    mul $T2, $A0, $B3
    umulh $T3, $A0, $B3
    adds $C2, $C2, $T0 // C2
    adcs $C3, $C3, $T1
    adc $C4, $C4, xzr

    mul $T0, $A1, $B2
    umulh $T1, $A1, $B2
    adds $C3, $C3, $T2
    adcs $C4, $C4, $T3
    adc $C5, xzr, xzr

    mul $T2, $A1, $B3
    umulh $T3, $A1, $B3
    adds $C3, $C3, $T0 // C3
    adcs $C4, $C4, $T1
    adc $C5, $C5, xzr
    adds $C4, $C4, $T2 // C4
    adc $C5, $C5, $T3 // C5

___
  return $body;
}
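
# Reference sketch, never called by this script: mul128x256 as plain integer
# arithmetic, producing the six limbs C0-C5 of the 384-bit product.
sub model_mul128x256 {
  require Math::BigInt;
  my ($a0, $a1, @b) = map { Math::BigInt->from_hex($_) } @_;
  my $w = Math::BigInt->new(1)->blsft(64);             # 2^64
  my $av = $a1 * $w + $a0;                             # 128-bit A
  my $bv = Math::BigInt->bzero();
  $bv = $bv * $w + $_ for reverse @b;                  # 256-bit B
  my $prod = $av * $bv;
  return map { ($prod->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 5;  # C0-C5
}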

# Computes C0-C5 = (A0-A2) * (B0-B2)
# Inputs remain intact
sub mul192 {
  my ($A0,$A1,$A2,$B0,$B1,$B2,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
  my $body=<<___;

    // A0 * B0
    mul $C0, $A0, $B0 // C0
    umulh $C3, $A0, $B0

    // A0 * B1
    mul $C1, $A0, $B1
    umulh $C2, $A0, $B1

    // A1 * B0
    mul $T0, $A1, $B0
    umulh $T1, $A1, $B0
    adds $C1, $C1, $C3
    adc $C2, $C2, xzr

    // A0 * B2
    mul $T2, $A0, $B2
    umulh $T3, $A0, $B2
    adds $C1, $C1, $T0 // C1
    adcs $C2, $C2, $T1
    adc $C3, xzr, xzr

    // A2 * B0
    mul $T0, $A2, $B0
    umulh $C4, $A2, $B0
    adds $C2, $C2, $T2
    adcs $C3, $C3, $C4
    adc $C4, xzr, xzr

    // A1 * B1
    mul $T2, $A1, $B1
    umulh $T1, $A1, $B1
    adds $C2, $C2, $T0
    adcs $C3, $C3, $T3
    adc $C4, $C4, xzr

    // A1 * B2
    mul $T0, $A1, $B2
    umulh $T3, $A1, $B2
    adds $C2, $C2, $T2 // C2
    adcs $C3, $C3, $T1
    adc $C4, $C4, xzr

    // A2 * B1
    mul $T2, $A2, $B1
    umulh $T1, $A2, $B1
    adds $C3, $C3, $T0
    adcs $C4, $C4, $T3
    adc $C5, xzr, xzr

    // A2 * B2
    mul $T0, $A2, $B2
    umulh $T3, $A2, $B2
    adds $C3, $C3, $T2 // C3
    adcs $C4, $C4, $T1
    adc $C5, $C5, xzr

    adds $C4, $C4, $T0 // C4
    adc $C5, $C5, $T3 // C5
___
  return $body;
}
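
# Reference sketch, never called by this script: mul192 computes the full
# 3x3-limb schoolbook product, i.e. simply (A2:A1:A0) * (B2:B1:B0), with the
# 384-bit result returned as the six limbs C0-C5.
sub model_mul192 {
  require Math::BigInt;
  my @limbs = map { Math::BigInt->from_hex($_) } @_;   # A0-A2, B0-B2
  my $w = Math::BigInt->new(1)->blsft(64);             # 2^64
  my ($av, $bv) = (Math::BigInt->bzero(), Math::BigInt->bzero());
  $av = $av * $w + $_ for reverse @limbs[0 .. 2];
  $bv = $bv * $w + $_ for reverse @limbs[3 .. 5];
  my $prod = $av * $bv;
  return map { ($prod->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 5;  # C0-C5
}
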
# Computes C0-C7 = (A0-A3) * (B0-B3) using one level of Karatsuba.
# $M must point to memory holding the entry values of A0-A1 (they are
# reloaded mid-way); the A, B and temporary registers are clobbered.
sub mul256_karatsuba {
  my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
  # (AH+AL) x (BH+BL), low part
  my $mul_low=&mul64x128($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
  # AL x BL
  my $mul_albl=&mul64x128($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
  # AH x BH
  my $mul_ahbh=&mul64x128($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
  my $body=<<___;
    // A0-A1 <- AH + AL, T0 <- carry
    adds $A0, $A0, $A2
    adcs $A1, $A1, $A3
    adc $T0, xzr, xzr

    // C6, T1 <- BH + BL, C7 <- carry
    adds $C6, $B0, $B2
    adcs $T1, $B1, $B3
    adc $C7, xzr, xzr

    // C0-C1 <- masked (BH + BL)
    sub $C2, xzr, $T0
    sub $C3, xzr, $C7
    and $C0, $C6, $C2
    and $C1, $T1, $C2

    // C4-C5 <- masked (AH + AL), T0 <- combined carry
    and $C4, $A0, $C3
    and $C5, $A1, $C3
    mul $C2, $A0, $C6
    mul $C3, $A0, $T1
    and $T0, $T0, $C7

    // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
    adds $C0, $C4, $C0
    umulh $C4, $A0, $T1
    adcs $C1, $C5, $C1
    umulh $C5, $A0, $C6
    adc $T0, $T0, xzr

    // C2-C5 <- (AH+AL) x (BH+BL), low part
    $mul_low
    ldp $A0, $A1, [$M,#0]

    // C2-C5, T0 <- (AH+AL) x (BH+BL), final part
    adds $C4, $C0, $C4
    umulh $C7, $A0, $B0
    umulh $T1, $A0, $B1
    adcs $C5, $C1, $C5
    mul $C0, $A0, $B0
    mul $C1, $A0, $B1
    adc $T0, $T0, xzr

    // C0-C1, T1, C7 <- AL x BL
    $mul_albl

    // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
    mul $A0, $A2, $B2
    umulh $B0, $A2, $B2
    subs $C2, $C2, $C0
    sbcs $C3, $C3, $C1
    sbcs $C4, $C4, $T1
    mul $A1, $A2, $B3
    umulh $C6, $A2, $B3
    sbcs $C5, $C5, $C7
    sbc $T0, $T0, xzr

    // A0, A1, C6, B0 <- AH x BH
    $mul_ahbh

    // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    subs $C2, $C2, $A0
    sbcs $C3, $C3, $A1
    sbcs $C4, $C4, $C6
    sbcs $C5, $C5, $B0
    sbc $T0, $T0, xzr

    adds $C2, $C2, $T1
    adcs $C3, $C3, $C7
    adcs $C4, $C4, $A0
    adcs $C5, $C5, $A1
    adcs $C6, $T0, $C6
    adc $C7, $B0, xzr
___
  return $body;
}
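
# Reference sketch, never called by this script: the macro above follows the
# usual one-level Karatsuba split
#   A*B = AH*BH*2^256 + ((AH+AL)*(BH+BL) - AH*BH - AL*BL)*2^128 + AL*BL,
# where the masked additions account for AH+AL and BH+BL being up to 129 bits
# wide. The sub below just restates that identity with Math::BigInt.
sub model_karatsuba256 {
  require Math::BigInt;
  my ($a, $b) = map { Math::BigInt->from_hex($_) } @_;   # two 256-bit operands
  my $half = Math::BigInt->new(1)->blsft(128);
  my ($al, $ah) = ($a % $half, $a->copy->brsft(128));
  my ($bl, $bh) = ($b % $half, $b->copy->brsft(128));
  my $albl = $al * $bl;
  my $ahbh = $ah * $bh;
  my $mid  = ($ah + $al) * ($bh + $bl) - $ahbh - $albl;
  return $ahbh * $half * $half + $mid * $half + $albl;   # equals $a * $b
}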

# 512-bit integer multiplication using Karatsuba (two levels),
# Comba (lower level).
# Operation: c [x2] = a [x0] * b [x1]
sub mul {
  # (AH+AL) x (BH+BL), low part
  my $mul_kc_low=&mul256_karatsuba(
    "x2",                                            # M
    "x3","x4","x5","x6",                             # A0-A3
    "x10","x11","x12","x13",                         # B0-B3
    "x8","x9","x19","x20","x21","x22","x23","x24",   # C0-C7
    "x25","x26");                                    # TMP
  # AL x BL
  my $mul_albl=&mul256_karatsuba(
    "x0",                                            # M
    "x3","x4","x5","x6",                             # A0-A3
    "x10","x11","x12","x13",                         # B0-B3
    "x21","x22","x23","x24","x25","x26","x27","x28", # C0-C7
    "x8","x9");                                      # TMP
  # AH x BH
  my $mul_ahbh=&mul192(
    "x3","x4","x5",                                  # A0-A2
    "x10","x11","x12",                               # B0-B2
    "x21","x22","x23","x24","x25","x26",             # C0-C5
    "x8","x9","x27","x28");                          # TMP

  my $body=<<___;
    .global ${PREFIX}_mpmul
    .align 4
    ${PREFIX}_mpmul:
    stp x29, x30, [sp,#-96]!
    add x29, sp, #0
    stp x19, x20, [sp,#16]
    stp x21, x22, [sp,#32]
    stp x23, x24, [sp,#48]
    stp x25, x26, [sp,#64]
    stp x27, x28, [sp,#80]

    ldp x3, x4, [x0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x10, x11, [x1,#0]
    ldp x12, x13, [x1,#16]
    ldp x14, x15, [x1,#32]
    ldr x16, [x1,#48]

    // x3-x6 <- AH + AL, x7 <- carry
    adds x3, x3, x7
    adcs x4, x4, x8
    adcs x5, x5, x9
    adcs x6, x6, xzr
    adc x7, xzr, xzr

    // x10-x13 <- BH + BL, x8 <- carry
    adds x10, x10, x14
    adcs x11, x11, x15
    adcs x12, x12, x16
    adcs x13, x13, xzr
    adc x8, xzr, xzr

    // x9 <- combined carry
    and x9, x7, x8
    // x7-x8 <- mask
    sub x7, xzr, x7
    sub x8, xzr, x8

    // x14-x17 <- masked (BH + BL)
    and x14, x10, x7
    and x15, x11, x7
    and x16, x12, x7
    and x17, x13, x7

    // x20-x23 <- masked (AH + AL)
    and x20, x3, x8
    and x21, x4, x8
    and x22, x5, x8
    and x23, x6, x8

    // x14-x17, x7 <- masked (AH+AL) + masked (BH+BL), step 1
    adds x14, x14, x20
    adcs x15, x15, x21
    adcs x16, x16, x22
    adcs x17, x17, x23
    adc x7, x9, xzr

    // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part
    stp x3, x4, [x2,#0]
    $mul_kc_low

    // x14-x17, x7 <- (AH+AL) x (BH+BL), final step
    adds x14, x14, x21
    adcs x15, x15, x22
    adcs x16, x16, x23
    adcs x17, x17, x24
    adc x7, x7, xzr

    // Load AL
    ldp x3, x4, [x0]
    ldp x5, x6, [x0,#16]
    // Load BL
    ldp x10, x11, [x1,#0]
    ldp x12, x13, [x1,#16]

    // Temporarily store x8, x9 in x2
    stp x8, x9, [x2,#0]
    // x21-x28 <- AL x BL
    $mul_albl
    // Restore x8, x9
    ldp x8, x9, [x2,#0]

    // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL
    subs x8, x8, x21
    sbcs x9, x9, x22
    sbcs x19, x19, x23
    sbcs x20, x20, x24
    sbcs x14, x14, x25
    sbcs x15, x15, x26
    sbcs x16, x16, x27
    sbcs x17, x17, x28
    sbc x7, x7, xzr

    // Store ALxBL, low
    stp x21, x22, [x2]
    stp x23, x24, [x2,#16]

    // Load AH
    ldp x3, x4, [x0,#32]
    ldr x5, [x0,#48]
    // Load BH
    ldp x10, x11, [x1,#32]
    ldr x12, [x1,#48]

    adds x8, x8, x25
    adcs x9, x9, x26
    adcs x19, x19, x27
    adcs x20, x20, x28
    adc x1, xzr, xzr

    add x0, x0, #32
    // Temporarily store x8, x9 in x2
    stp x8, x9, [x2,#32]
    // x21-x26 <- AH x BH
    $mul_ahbh
    // Restore x8, x9
    ldp x8, x9, [x2,#32]

    neg x1, x1

    // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    subs x8, x8, x21
    sbcs x9, x9, x22
    sbcs x19, x19, x23
    sbcs x20, x20, x24
    sbcs x14, x14, x25
    sbcs x15, x15, x26
    sbcs x16, x16, xzr
    sbcs x17, x17, xzr
    sbc x7, x7, xzr

    // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
    stp x8, x9, [x2,#32]
    stp x19, x20, [x2,#48]

    adds x1, x1, #1
    adcs x14, x14, x21
    adcs x15, x15, x22
    adcs x16, x16, x23
    adcs x17, x17, x24
    adcs x25, x7, x25
    adc x26, x26, xzr

    stp x14, x15, [x2,#64]
    stp x16, x17, [x2,#80]
    stp x25, x26, [x2,#96]

    ldp x19, x20, [x29,#16]
    ldp x21, x22, [x29,#32]
    ldp x23, x24, [x29,#48]
    ldp x25, x26, [x29,#64]
    ldp x27, x28, [x29,#80]
    ldp x29, x30, [sp],#96
    ret
___
  return $body;
}
$code.=&mul();
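
# Reference sketch, never called by this script: the contract of
# ${PREFIX}_mpmul in plain integer arithmetic. Each operand is passed as a
# reference to 7 little-endian 64-bit limbs (hex strings); the result is the
# full 14-limb product.
sub model_mpmul {
  require Math::BigInt;
  my ($a_limbs, $b_limbs) = @_;
  my $w = Math::BigInt->new(1)->blsft(64);
  my ($a, $b) = (Math::BigInt->bzero(), Math::BigInt->bzero());
  $a = $a * $w + Math::BigInt->from_hex($_) for reverse @$a_limbs;
  $b = $b * $w + Math::BigInt->from_hex($_) for reverse @$b_limbs;
  my $c = $a * $b;
  return map { ($c->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 13;    # c[0..13]
}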

# Montgomery reduction
# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
sub rdc {
  my $mul01=&mul128x256(
    "x2","x3",                       # A0-A1
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8","x9",   # C0-C5
    "x10","x11","x27","x28");        # TMP
  my $mul23=&mul128x256(
    "x2","x10",                      # A0-A1
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8","x9",   # C0-C5
    "x0","x3","x27","x28");          # TMP
  my $mul45=&mul128x256(
    "x11","x12",                     # A0-A1
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8","x9",   # C0-C5
    "x10","x3","x27","x28");         # TMP
  my $mul67=&mul64x256(
    "x13",                           # A0
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8",        # C0-C4
    "x10","x27","x28");              # TMP
  my $body=<<___;
    .global ${PREFIX}_fprdc
    .align 4
    ${PREFIX}_fprdc:
    stp x29, x30, [sp, #-96]!
    add x29, sp, xzr
    stp x19, x20, [sp,#16]
    stp x21, x22, [sp,#32]
    stp x23, x24, [sp,#48]
    stp x25, x26, [sp,#64]
    stp x27, x28, [sp,#80]

    ldp x2, x3, [x0,#0] // a[0-1]

    // Load the prime constant
    adrp x26, :pg_hi21:.Lp434p1
    add x26, x26, :lo12:.Lp434p1
    ldp x23, x24, [x26, #0x0]
    ldp x25, x26, [x26,#0x10]

    // a[0-1] * p434+1
    $mul01

    ldp x10, x11, [x0, #0x18]
    ldp x12, x13, [x0, #0x28]
    ldp x14, x15, [x0, #0x38]
    ldp x16, x17, [x0, #0x48]
    ldp x19, x20, [x0, #0x58]
    ldr x21, [x0, #0x68]

    adds x10, x10, x4
    adcs x11, x11, x5
    adcs x12, x12, x6
    adcs x13, x13, x7
    adcs x14, x14, x8
    adcs x15, x15, x9
    adcs x22, x16, xzr
    adcs x17, x17, xzr
    adcs x19, x19, xzr
    adcs x20, x20, xzr
    adc x21, x21, xzr

    ldr x2, [x0,#0x10] // a[2]
    // a[2-3] * p434+1
    $mul23

    adds x12, x12, x4
    adcs x13, x13, x5
    adcs x14, x14, x6
    adcs x15, x15, x7
    adcs x16, x22, x8
    adcs x17, x17, x9
    adcs x22, x19, xzr
    adcs x20, x20, xzr
    adc x21, x21, xzr

    $mul45
    adds x14, x14, x4
    adcs x15, x15, x5
    adcs x16, x16, x6
    adcs x17, x17, x7
    adcs x19, x22, x8
    adcs x20, x20, x9
    adc x22, x21, xzr

    stp x14, x15, [x1, #0x0] // C0, C1

    $mul67
    adds x16, x16, x4
    adcs x17, x17, x5
    adcs x19, x19, x6
    adcs x20, x20, x7
    adc x21, x22, x8

    str x16, [x1, #0x10]
    stp x17, x19, [x1, #0x18]
    stp x20, x21, [x1, #0x28]

    ldp x19, x20, [x29,#16]
    ldp x21, x22, [x29,#32]
    ldp x23, x24, [x29,#48]
    ldp x25, x26, [x29,#64]
    ldp x27, x28, [x29,#80]
    ldp x29, x30, [sp],#96
    ret
___
}
$code.=&rdc();
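
# Reference sketch, never called by this script: numerically, ${PREFIX}_fprdc
# performs a Montgomery reduction. Assuming the usual convention for this
# code (R = 2^448, with field elements kept only partially reduced, below
# 2*p434), the output is congruent to ma * R^(-1) modulo p434. The reduction
# can skip the low words of p434+1 = 2^216*3^137 because they are zero, which
# is why .Lp434p1 above stores only the high words.
sub model_fprdc {
  require Math::BigInt;
  my ($ma_hex) = @_;                       # up to 2*448-bit input, hex string
  my $p = Math::BigInt->new(2)->bpow(216)
            ->bmul(Math::BigInt->new(3)->bpow(137))->bsub(1);
  my $rinv = Math::BigInt->new(1)->blsft(448)->bmodinv($p);
  return (Math::BigInt->from_hex($ma_hex) * $rinv) % $p;    # fully reduced reference value
}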

# Field addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
    .global ${PREFIX}_fpadd
    .align 4
    ${PREFIX}_fpadd:
    stp x29, x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    ldr x17, [x1,#48]

    // Add a + b
    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x13
    adcs x6, x6, x14
    adcs x7, x7, x15
    adcs x8, x8, x16
    adc x9, x9, x17

    // Subtract 2xp434
    // Note: .Lp434x2 stores limbs 1 and 2 of 2xp434 only once (both are
    // 0xFFFFFFFFFFFFFFFF), so x12 is used for both of them below.
    adrp x17, :pg_hi21:.Lp434x2
    add x17, x17, :lo12:.Lp434x2
    ldp x11, x12, [x17, #0]
    ldp x13, x14, [x17, #16]
    ldp x15, x16, [x17, #32]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x12
    sbcs x6, x6, x13
    sbcs x7, x7, x14
    sbcs x8, x8, x15
    sbcs x9, x9, x16
    sbc x0, xzr, xzr // x0 can be reused now

    // Add 2xp434 anded with the mask in x0
    and x11, x11, x0
    and x12, x12, x0
    and x13, x13, x0
    and x14, x14, x0
    and x15, x15, x0
    and x16, x16, x0

    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x12
    adcs x6, x6, x13
    adcs x7, x7, x14
    adcs x8, x8, x15
    adc x9, x9, x16

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    str x9, [x2,#48]

    ldp x29, x30, [sp],#16
    ret
___
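
# Reference sketch, never called by this script: ${PREFIX}_fpadd adds two
# field elements and reduces the sum modulo 2*p434 (this code keeps field
# elements in [0, 2*p434), as is common for SIDH-style arithmetic).
sub model_fpadd {
  require Math::BigInt;
  my ($a, $b) = map { Math::BigInt->from_hex($_) } @_;
  my $p2 = Math::BigInt->new(2)->bpow(216)
             ->bmul(Math::BigInt->new(3)->bpow(137))->bsub(1)->bmul(2);   # 2*p434
  return ($a + $b) % $p2;
}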

# Field subtraction
# Operation: c [x2] = a [x0] - b [x1]
$code.=<<___;
    .global ${PREFIX}_fpsub
    .align 4
    ${PREFIX}_fpsub:
    stp x29, x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    ldr x17, [x1,#48]

    // Subtract a - b
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    sbcs x9, x9, x17
    sbc x0, xzr, xzr

    // Add 2xp434 anded with the mask in x0
    // Note: as in fpadd, x12 is reused for the two equal limbs of 2xp434.
    adrp x17, :pg_hi21:.Lp434x2
    add x17, x17, :lo12:.Lp434x2
    ldp x11, x12, [x17, #0]
    ldp x13, x14, [x17, #16]
    ldp x15, x16, [x17, #32]

    and x11, x11, x0
    and x12, x12, x0
    and x13, x13, x0
    and x14, x14, x0
    and x15, x15, x0
    and x16, x16, x0

    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x12
    adcs x6, x6, x13
    adcs x7, x7, x14
    adcs x8, x8, x15
    adc x9, x9, x16

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    str x9, [x2,#48]

    ldp x29, x30, [sp],#16
    ret
___
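
# Reference sketch, never called by this script: ${PREFIX}_fpsub subtracts
# modulo 2*p434, adding 2*p434 back in when the raw subtraction borrows.
sub model_fpsub {
  require Math::BigInt;
  my ($a, $b) = map { Math::BigInt->from_hex($_) } @_;
  my $p2 = Math::BigInt->new(2)->bpow(216)
             ->bmul(Math::BigInt->new(3)->bpow(137))->bsub(1)->bmul(2);   # 2*p434
  my $d = $a - $b;
  return $d->is_neg() ? $d + $p2 : $d;
}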

# 434-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
    .global ${PREFIX}_mpadd_asm
    .align 4
    ${PREFIX}_mpadd_asm:
    stp x29, x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    ldr x17, [x1,#48]

    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x13
    adcs x6, x6, x14
    adcs x7, x7, x15
    adcs x8, x8, x16
    adc x9, x9, x17

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    str x9, [x2,#48]

    ldp x29, x30, [sp],#16
    ret
___
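
# Reference sketch, never called by this script: ${PREFIX}_mpadd_asm is a
# plain 7-limb addition with no modular reduction; for the 434-bit-range
# values this code handles, the sum still fits in 7 limbs.
sub model_mpadd {
  require Math::BigInt;
  my ($a_limbs, $b_limbs) = @_;            # refs to 7 little-endian hex limbs each
  my $w = Math::BigInt->new(1)->blsft(64);
  my ($a, $b) = (Math::BigInt->bzero(), Math::BigInt->bzero());
  $a = $a * $w + Math::BigInt->from_hex($_) for reverse @$a_limbs;
  $b = $b * $w + Math::BigInt->from_hex($_) for reverse @$b_limbs;
  my $c = $a + $b;
  return map { ($c->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 6;       # c[0..6]
}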

# 2x434-bit multiprecision subtraction
# Operation: c [x2] = a [x0] - b [x1].
# Returns borrow mask
$code.=<<___;
    .global ${PREFIX}_mpsubx2_asm
    .align 4
    ${PREFIX}_mpsubx2_asm:
    stp x29, x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    ldp x7, x8, [x0,#32]
    ldp x9, x10, [x0,#48]
    ldp x11, x12, [x1,#32]
    ldp x13, x14, [x1,#48]
    sbcs x7, x7, x11
    sbcs x8, x8, x12
    sbcs x9, x9, x13
    sbcs x10, x10, x14

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    stp x9, x10, [x2,#48]

    ldp x3, x4, [x0,#64]
    ldp x5, x6, [x0,#80]
    ldp x11, x12, [x1,#64]
    ldp x13, x14, [x1,#80]
    sbcs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    ldp x7, x8, [x0,#96]
    ldp x11, x12, [x1,#96]
    sbcs x7, x7, x11
    sbcs x8, x8, x12
    sbc x0, xzr, xzr

    stp x3, x4, [x2,#64]
    stp x5, x6, [x2,#80]
    stp x7, x8, [x2,#96]

    ldp x29, x30, [sp],#16
    ret
___
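
# Reference sketch, never called by this script: ${PREFIX}_mpsubx2_asm
# subtracts two 14-limb (2x434-bit) values, wrapping modulo 2^896 like the
# carry chain above, and returns the borrow as a mask (0 when a >= b,
# all-ones otherwise).
sub model_mpsubx2 {
  require Math::BigInt;
  my ($a_limbs, $b_limbs) = @_;            # refs to 14 little-endian hex limbs each
  my $w = Math::BigInt->new(1)->blsft(64);
  my ($a, $b) = (Math::BigInt->bzero(), Math::BigInt->bzero());
  $a = $a * $w + Math::BigInt->from_hex($_) for reverse @$a_limbs;
  $b = $b * $w + Math::BigInt->from_hex($_) for reverse @$b_limbs;
  my $c = ($a - $b) % (Math::BigInt->new(1)->blsft(64 * 14));
  my @out = map { ($c->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 13;
  my $mask = ($a < $b) ? "0x" . "f" x 16 : "0x0";
  return (\@out, $mask);
}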


# Double 2x434-bit multiprecision subtraction
# Operation: c [x2] = c [x2] - a [x0] - b [x1]
$code.=<<___;
    .global ${PREFIX}_mpdblsubx2_asm
    .align 4
    ${PREFIX}_mpdblsubx2_asm:
    stp x29, x30, [sp, #-16]!
    add x29, sp, #0

    ldp x3, x4, [x2, #0]
    ldp x5, x6, [x2,#16]
    ldp x7, x8, [x2,#32]

    ldp x11, x12, [x0, #0]
    ldp x13, x14, [x0,#16]
    ldp x15, x16, [x0,#32]

    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16

    // x9 stores carry
    adc x9, xzr, xzr

    ldp x11, x12, [x1, #0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    adc x9, x9, xzr

    stp x3, x4, [x2, #0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]

    ldp x3, x4, [x2,#48]
    ldp x5, x6, [x2,#64]
    ldp x7, x8, [x2,#80]

    ldp x11, x12, [x0,#48]
    ldp x13, x14, [x0,#64]
    ldp x15, x16, [x0,#80]

    // x9 = 2 - x9
    neg x9, x9
    add x9, x9, #2

    subs x3, x3, x9
    sbcs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    adc x9, xzr, xzr

    ldp x11, x12, [x1,#48]
    ldp x13, x14, [x1,#64]
    ldp x15, x16, [x1,#80]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    adc x9, x9, xzr

    stp x3, x4, [x2,#48]
    stp x5, x6, [x2,#64]
    stp x7, x8, [x2,#80]

    ldp x3, x4, [x2,#96]
    ldp x11, x12, [x0,#96]
    ldp x13, x14, [x1,#96]

    // x9 = 2 - x9
    neg x9, x9
    add x9, x9, #2

    subs x3, x3, x9
    sbcs x3, x3, x11
    sbcs x4, x4, x12
    subs x3, x3, x13
    sbc x4, x4, x14
    stp x3, x4, [x2,#96]

    ldp x29, x30, [sp],#16
    ret
___
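
# Reference sketch, never called by this script: ${PREFIX}_mpdblsubx2_asm
# computes c = c - a - b over 14-limb (2x434-bit) values in place. No borrow
# is returned, so callers are expected to ensure c >= a + b.
sub model_mpdblsubx2 {
  require Math::BigInt;
  my ($c_limbs, $a_limbs, $b_limbs) = @_;  # refs to 14 little-endian hex limbs each
  my $w = Math::BigInt->new(1)->blsft(64);
  my @vals;
  for my $limbs ($c_limbs, $a_limbs, $b_limbs) {
    my $v = Math::BigInt->bzero();
    $v = $v * $w + Math::BigInt->from_hex($_) for reverse @$limbs;
    push @vals, $v;
  }
  my $c = ($vals[0] - $vals[1] - $vals[2]) % (Math::BigInt->new(1)->blsft(64 * 14));
  return map { ($c->copy->brsft(64 * $_) % $w)->as_hex() } 0 .. 13;      # new c[0..13]
}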

foreach (split("\n",$code)) {
  s/\`([^\`]*)\`/eval($1)/ge;
  print $_,"\n";
}

close STDOUT;