#! /usr/bin/env perl
#
# April 2019
#
# Abstract: field arithmetic in x64 assembly for SIDH/p434

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$PREFIX="sike";
$bmi2_adx = 1;

$code.=<<___;
.text

# p434 x 2
.Lp434x2:
.quad 0xFFFFFFFFFFFFFFFE
.quad 0xFFFFFFFFFFFFFFFF
.quad 0xFB82ECF5C5FFFFFF
.quad 0xF78CB8F062B15D47
.quad 0xD9F8BFAD038A40AC
.quad 0x0004683E4E2EE688

# p434 + 1
.Lp434p1:
.quad 0xFDC1767AE3000000
.quad 0x7BC65C783158AEA3
.quad 0x6CFC5FD681C52056
.quad 0x0002341F27177344

.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P
___
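# These constants can be re-derived offline with Perl's bignum support.
# A sketch for reference only (not used by the build), assuming
# p434 = 2^216 * 3^137 - 1, the SIKEp434 prime:
#
#   use bigint;
#   my $p = 2**216 * 3**137 - 1;
#   printf "p434x2 = %s\n", (2 * $p)->as_hex;
#   printf "p434p1 = %s\n", (($p + 1) / 2**192)->as_hex;
#
# The .quad entries above are the 64-bit limbs of those values, least
# significant first.  2*p434 spans seven limbs but the table stores only
# six quadwords because limbs 1 and 2 are both 0xFFFFFFFFFFFFFFFF; the
# addition and subtraction code below loads 0x8+.Lp434x2 once and uses it
# for both limbs.  .Lp434p1 is (p434 + 1) / 2^192; its three low limbs
# are zero, which the Montgomery reduction further down relies on.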

# Jump to the alternative implementation, provided as an argument,
# when the CPU supports the ADOX/ADCX and MULX instructions.
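# The check below reads the second 64-bit word of OPENSSL_ia32cap_P,
# whose low half caches CPUID.(EAX=7,ECX=0):EBX; bit 8 is BMI2 (MULX)
# and bit 19 is ADX (ADCX/ADOX), hence the 0x80100 mask.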
sub alt_impl {
  $jmp_func = shift;

  $body=<<___;
  lea OPENSSL_ia32cap_P(%rip), %rcx
  mov 8(%rcx), %rcx
  and \$0x80100, %ecx
  cmp \$0x80100, %ecx
  je $jmp_func

___
  return $body
}

# Performs schoolbook multiplication of two 192-bit numbers. Uses the
# MULX instruction. The 384-bit result is stored at $idxDST($DST).
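# MULX takes one factor implicitly from %rdx and does not touch the
# flags, while ADCX and ADOX add with carry through CF and OF
# respectively.  The schoolbook multiplications below exploit this to
# interleave two independent carry chains in a single instruction
# stream, which is the point of the Broadwell-only ("_bdw") code paths.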
sub mul192 {
  my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_;
  my ($ML0,$ML8,$ML16)=map("$idxM0+$_($M0)",(0,8,16));
  my ($MR0,$MR8,$MR16)=map("$idxM1+$_($M1)",(0,8,16));
  my ($D0,$D1,$D2,$D3,$D4,$D5)=map("$idxDST+$_($DST)",(0,8,16,24,32,40));

  $body=<<___;
  mov $ML0, %rdx
  mulx $MR0, $T1, $T0 # T0:T1 = A0*B0
  mov $T1, $D0 # DST0
  mulx $MR8, $T2, $T1 # T1:T2 = A0*B1
  xor %rax, %rax
  adox $T2, $T0
  mulx $MR16,$T3, $T2 # T2:T3 = A0*B2
  adox $T3, $T1

  mov $ML8, %rdx
  mulx $MR0, $T4, $T3 # T3:T4 = A1*B0
  adox %rax, $T2
  xor %rax, %rax

  mulx $MR8, $T6, $T5 # T5:T6 = A1*B1
  adox $T0, $T4
  mov $T4, $D1 # DST1
  adcx $T6, $T3

  mulx $MR16,$T0, $T6 # T6:T0 = A1*B2
  adox $T1, $T3
  adcx $T0, $T5
  adcx %rax, $T6
  adox $T2, $T5

  mov $ML16,%rdx
  mulx $MR0, $T0, $T1 # T1:T0 = A2*B0
  adox %rax, $T6
  xor %rax, %rax

  mulx $MR8, $T2, $T4 # T4:T2 = A2*B1
  adox $T3, $T0
  mov $T0, $D2 # DST2
  adcx $T5, $T1

  mulx $MR16,$T3, $T0 # T0:T3 = A2*B2
  adcx $T6, $T4
  adcx %rax, $T0
  adox $T2, $T1
  adox $T4, $T3
  adox %rax, $T0
  mov $T1, $D3 # DST3
  mov $T3, $D4 # DST4
  mov $T0, $D5 # DST5

___
  return $body;
}

# Performs schoolbook multiplication of two 256-bit numbers. Uses the
# MULX instruction. The 512-bit result is stored at $idxDST($DST).
sub mul256 {
  my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_;
  my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_($M0)",(0,8,16,24));
  my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_($M1)",(0,8,16,24));
  my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_($DST)",(0,8,16,24,32,40,48,56));

  $body=<<___;
  mov $ML0, %rdx
  mulx $MR0, $T1, $T0 # T0:T1 = A0*B0
  mov $T1, $D0 # DST0_final
  mulx $MR8, $T2, $T1 # T1:T2 = A0*B1
  xor %rax, %rax
  adox $T2, $T0
  mulx $MR16,$T3, $T2 # T2:T3 = A0*B2
  adox $T3, $T1
  mulx $MR24,$T4, $T3 # T3:T4 = A0*B3
  adox $T4, $T2

  mov $ML8, %rdx
  mulx $MR0, $T4, $T5 # T5:T4 = A1*B0
  adox %rax, $T3
  xor %rax, %rax
  mulx $MR8, $T7, $T6 # T6:T7 = A1*B1
  adox $T0, $T4
  mov $T4, $D1 # DST1_final
  adcx $T7, $T5
  mulx $MR16,$T8, $T7 # T7:T8 = A1*B2
  adcx $T8, $T6
  adox $T1, $T5
  mulx $MR24,$T9, $T8 # T8:T9 = A1*B3
  adcx $T9, $T7
  adcx %rax, $T8
  adox $T2, $T6

  mov $ML16,%rdx
  mulx $MR0, $T0, $T1 # T1:T0 = A2*B0
  adox $T3, $T7
  adox %rax, $T8
  xor %rax, %rax
  mulx $MR8, $T3, $T2 # T2:T3 = A2*B1
  adox $T5, $T0
  mov $T0, $D2 # DST2_final
  adcx $T3, $T1
  mulx $MR16,$T4, $T3 # T3:T4 = A2*B2
  adcx $T4, $T2
  adox $T6, $T1
  mulx $MR24,$T9, $T4 # T4:T9 = A2*B3
  adcx $T9, $T3
  adcx %rax, $T4

  adox $T7, $T2
  adox $T8, $T3
  adox %rax, $T4

  mov $ML24,%rdx
  mulx $MR0, $T0, $T5 # T5:T0 = A3*B0
  xor %rax, %rax
  mulx $MR8, $T7, $T6 # T6:T7 = A3*B1
  adcx $T7, $T5
  adox $T0, $T1
  mulx $MR16, $T8, $T7 # T7:T8 = A3*B2
  adcx $T8, $T6
  adox $T5, $T2
  mulx $MR24, $T9, $T8 # T8:T9 = A3*B3
  adcx $T9, $T7
  adcx %rax, $T8
  adox $T6, $T3
  adox $T7, $T4
  adox %rax, $T8
  mov $T1, $D3 # DST3_final
  mov $T2, $D4 # DST4_final
  mov $T3, $D5 # DST5_final
  mov $T4, $D6 # DST6_final
  mov $T8, $D7 # DST7_final

___
  return $body;
}

# Performs schoolbook multiplication of a 64-bit number by a 256-bit
# number.
sub mul64x256 {
  my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_;
  my $body.=<<___;
  mov $idxM0($M0), $T5

  xor $T2, $T2
  mov 0+$M1, %rax
  mul $T5
  mov %rax, $T0 # C0
  mov %rdx, $T1

  xor $T3, $T3
  mov 8+$M1, %rax
  mul $T5
  add %rax, $T1 # C1
  adc %rdx, $T2

  xor $T4, $T4
  mov 16+$M1, %rax
  mul $T5
  add %rax, $T2 # C2
  adc %rdx, $T3

  mov 24+$M1, %rax
  mul $T5
  add %rax, $T3 # C3
  adc %rdx, $T4 # C4
___
  return $body;
}

# Performs schoolbook multiplication of a 64-bit number by a 256-bit
# number. Uses the MULX and ADOX instructions.
sub mulx64x256 {
  my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_;
  my $body.=<<___;
  xor %rax, %rax
  mov $idxM0($M0), %rdx
  mulx 0+$M1, $T0, $T1 # T0 <- C0
  mulx 8+$M1, $T4, $T2
  mulx 16+$M1, $T5, $T3

  adox $T4, $T1 # T1 <- C1
  adox $T5, $T2 # T2 <- C2

  mulx 24+$M1, $T5, $T4
  adox $T5, $T3 # T3 <- C3
  adox %rax, $T4 # T4 <- C4
___
  return $body;
}

# Performs schoolbook multiplication of a 128-bit number by a 256-bit
# number. Destroys RAX and RDX.
sub mul128x256 {
  my ($idxMA,$MA,$MB,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1)=@_;
  my ($MA0,$MA8)=map("$idxMA+$_($MA)", (0,8));
  my $body.=<<___;
  # A0 x B0
  mov $MA0, $T0
  mov 0+$MB, %rax
  mul $T0
  xor $C2, $C2
  mov %rax, $C0 # c0
  mov %rdx, $C1

  # A0 x B1
  mov 8+$MB, %rax
  mul $T0
  xor $C3, $C3
  add %rax, $C1
  adc %rdx, $C2

  # A1 x B0
  mov $MA8, $T1
  mov 0+$MB, %rax
  mul $T1
  add %rax, $C1
  adc %rdx, $C2
  adc \$0x0, $C3

  # A0 x B2
  xor $C4, $C4
  mov 16+$MB, %rax
  mul $T0
  add %rax, $C2
  adc %rdx, $C3
  adc \$0x0, $C4

  # A1 x B1
  mov 8+$MB, %rax
  mul $T1
  add %rax, $C2 # c2
  adc %rdx, $C3
  adc \$0x0, $C4

  # A0 x B3
  mov 24+$MB, %rax
  mul $T0
  xor $C5, $C5
  add %rax, $C3
  adc %rdx, $C4
  adc \$0x0, $C5

  # A1 x B2
  mov 16+$MB, %rax
  mul $T1
  add %rax, $C3 # c3
  adc %rdx, $C4
  adc \$0x0, $C5

  # A1 x B3
  mov 24+$MB, %rax
  mul $T1
  add %rax, $C4
  adc %rdx, $C5

___
  return $body;
}

# Performs schoolbook multiplication of a 128-bit number by a 256-bit
# number. Uses the MULX, ADOX and ADCX instructions.
sub mulx128x256 {
  my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_;
  my ($MUL0,$MUL8)=map("$idxM0+$_($M0)", (0,8));
  my $body.=<<___;
  xor %rax, %rax
  mov $MUL0, %rdx
  mulx 0+$M1, $T0, $T1 # T0 <- C0
  mulx 8+$M1, $T4, $T2
  mulx 16+$M1, $T5, $T3

  adox $T4, $T1 # T1: interm1
  adox $T5, $T2 # T2: interm2

  mulx 24+$M1, $T5, $T4
  adox $T5, $T3 # T3: interm3
  adox %rax, $T4 # T4: interm4

  xor %rax, %rax
  mov $MUL8, %rdx
  mulx 0+$M1, $T5, $T6
  adcx $T5, $T1 # T1 <- C1
  adcx $T6, $T2

  mulx 8+$M1, $T6, $T5
  adcx $T5, $T3
  adox $T6, $T2 # T2 <- C2

  mulx 16+$M1, $T6, $T5
  adcx $T5, $T4
  adox $T6, $T3 # T3 <- C3

  mulx 24+$M1, $T6, $T5
  adcx %rax, $T5
  adox $T6, $T4 # T4 <- C4
  adox %rax, $T5 # T5 <- C5
___
  return $body;
}

# Compute z = x + y (mod p).
# Operation: c [rdx] = a [rdi] + b [rsi]
$code.=<<___;
.globl ${PREFIX}_fpadd
.type ${PREFIX}_fpadd,\@function,3
${PREFIX}_fpadd:
.cfi_startproc
  push %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12, -16
  push %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13, -24
  push %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14, -32

  xor %rax, %rax

  mov 0x0(%rdi), %r8
  add 0x0(%rsi), %r8
  mov 0x8(%rdi), %r9
  adc 0x8(%rsi), %r9
  mov 0x10(%rdi), %r10
  adc 0x10(%rsi), %r10
  mov 0x18(%rdi), %r11
  adc 0x18(%rsi), %r11
  mov 0x20(%rdi), %r12
  adc 0x20(%rsi), %r12
  mov 0x28(%rdi), %r13
  adc 0x28(%rsi), %r13
  mov 0x30(%rdi), %r14
  adc 0x30(%rsi), %r14

  mov .Lp434x2(%rip), %rcx
  sub %rcx, %r8
  mov 0x8+.Lp434x2(%rip), %rcx
  sbb %rcx, %r9
  sbb %rcx, %r10
  mov 0x10+.Lp434x2(%rip), %rcx
  sbb %rcx, %r11
  mov 0x18+.Lp434x2(%rip), %rcx
  sbb %rcx, %r12
  mov 0x20+.Lp434x2(%rip), %rcx
  sbb %rcx, %r13
  mov 0x28+.Lp434x2(%rip), %rcx
  sbb %rcx, %r14

  sbb \$0, %rax

  mov .Lp434x2(%rip), %rdi
  and %rax, %rdi
  mov 0x8+.Lp434x2(%rip), %rsi
  and %rax, %rsi
  mov 0x10+.Lp434x2(%rip), %rcx
  and %rax, %rcx

  add %rdi, %r8
  mov %r8, 0x0(%rdx)
  adc %rsi, %r9
  mov %r9, 0x8(%rdx)
  adc %rsi, %r10
  mov %r10, 0x10(%rdx)
  adc %rcx, %r11
  mov %r11, 0x18(%rdx)

  setc %cl
  mov 0x18+.Lp434x2(%rip), %r8
  and %rax, %r8
  mov 0x20+.Lp434x2(%rip), %r9
  and %rax, %r9
  mov 0x28+.Lp434x2(%rip), %r10
  and %rax, %r10
  bt \$0, %rcx

  adc %r8, %r12
  mov %r12, 0x20(%rdx)
  adc %r9, %r13
  mov %r13, 0x28(%rdx)
  adc %r10, %r14
  mov %r14, 0x30(%rdx)

  pop %r14
.cfi_adjust_cfa_offset -8
  pop %r13
.cfi_adjust_cfa_offset -8
  pop %r12
.cfi_adjust_cfa_offset -8
  ret
.cfi_endproc
___
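
# Reference model for the addition above (a sketch only, not used by
# the build).  Operands and result are kept in [0, 2*p434): the code
# adds the inputs, subtracts 2*p434 and, if that underflowed, adds
# 2*p434 back using the borrow mask in rax:
#
#   use bigint;
#   my $p = 2**216 * 3**137 - 1;
#   sub fpadd_ref {
#     my ($a, $b) = @_;               # both in [0, 2*$p)
#     my $t = $a + $b - 2 * $p;
#     return $t < 0 ? $t + 2 * $p : $t;
#   }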

# Loads data into XMM0 and XMM1 and conditionally
# swaps them depending on the mask in XMM3.
sub cswap_block16() {
  my $idx = shift;
  $idx *= 16;
  ("
  movdqu $idx(%rdi), %xmm0
  movdqu $idx(%rsi), %xmm1
  movdqa %xmm1, %xmm2
  pxor %xmm0, %xmm2
  pand %xmm3, %xmm2
  pxor %xmm2, %xmm0
  pxor %xmm2, %xmm1
  movdqu %xmm0, $idx(%rdi)
  movdqu %xmm1, $idx(%rsi)
  ");
}
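# Each 16-byte block is swapped with the usual masked-XOR trick: with
# mask m equal to all zeros or all ones, t = (x ^ y) & m; x ^= t;
# y ^= t either leaves both values unchanged or exchanges them, with no
# data-dependent branches.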

# Conditionally swaps bits in x and y in constant time.
# The mask indicates the bits to be swapped (set bits are swapped).
# Operation: [rdi] <-> [rsi] when the mask in rdx is all ones
sub sike_cswap {
  # P[0] with Q[0]
  foreach ( 0.. 6){$BLOCKS.=eval "&cswap_block16($_)";}
  # P[1] with Q[1]
  foreach ( 7..13){$BLOCKS.=eval "&cswap_block16($_)";}

  my $body =<<___;
.globl ${PREFIX}_cswap_asm
.type ${PREFIX}_cswap_asm,\@function,3
${PREFIX}_cswap_asm:
  # Fill XMM3. After this step the low 64 bits of XMM3
  # hold RDX and the high 64 bits are zero.
  mov %rdx, %xmm3

  # Copy the low quadword to the high quadword, so that
  # XMM3 = RDX|RDX. As RDX has either all bits set or none,
  # XMM3 also ends up with either all bits set or none.
  # 68 = 01000100b, the pshufd pattern selecting dwords 0,1,0,1.
  pshufd \$68, %xmm3, %xmm3
  $BLOCKS
  ret
___
  ($body)
}
$code.=&sike_cswap();


# Field subtraction
# Operation: c [rdx] = a [rdi] - b [rsi]
$code.=<<___;
.globl ${PREFIX}_fpsub
.type ${PREFIX}_fpsub,\@function,3
${PREFIX}_fpsub:
.cfi_startproc
  push %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12, -16
  push %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13, -24
  push %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14, -32

  xor %rax, %rax

  mov 0x0(%rdi), %r8
  sub 0x0(%rsi), %r8
  mov 0x8(%rdi), %r9
  sbb 0x8(%rsi), %r9
  mov 0x10(%rdi), %r10
  sbb 0x10(%rsi), %r10
  mov 0x18(%rdi), %r11
  sbb 0x18(%rsi), %r11
  mov 0x20(%rdi), %r12
  sbb 0x20(%rsi), %r12
  mov 0x28(%rdi), %r13
  sbb 0x28(%rsi), %r13
  mov 0x30(%rdi), %r14
  sbb 0x30(%rsi), %r14

  sbb \$0x0, %rax

  mov .Lp434x2(%rip), %rdi
  and %rax, %rdi
  mov 0x08+.Lp434x2(%rip), %rsi
  and %rax, %rsi
  mov 0x10+.Lp434x2(%rip), %rcx
  and %rax, %rcx

  add %rdi, %r8
  mov %r8, 0x0(%rdx)
  adc %rsi, %r9
  mov %r9, 0x8(%rdx)
  adc %rsi, %r10
  mov %r10, 0x10(%rdx)
  adc %rcx, %r11
  mov %r11, 0x18(%rdx)

  setc %cl
  mov 0x18+.Lp434x2(%rip), %r8
  and %rax, %r8
  mov 0x20+.Lp434x2(%rip), %r9
  and %rax, %r9
  mov 0x28+.Lp434x2(%rip), %r10
  and %rax, %r10
  bt \$0x0, %rcx

  adc %r8, %r12
  adc %r9, %r13
  adc %r10, %r14
  mov %r12, 0x20(%rdx)
  mov %r13, 0x28(%rdx)
  mov %r14, 0x30(%rdx)

  pop %r14
.cfi_adjust_cfa_offset -8
  pop %r13
.cfi_adjust_cfa_offset -8
  pop %r12
.cfi_adjust_cfa_offset -8
  ret
.cfi_endproc
___
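
# Reference model for the subtraction above (a sketch only), mirroring
# fpadd: compute a - b and, if the subtraction borrowed, add 2*p434
# back so the result stays in [0, 2*p434):
#
#   use bigint;
#   my $p = 2**216 * 3**137 - 1;
#   sub fpsub_ref {
#     my ($a, $b) = @_;               # both in [0, 2*$p)
#     my $t = $a - $b;
#     return $t < 0 ? $t + 2 * $p : $t;
#   }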

# 434-bit multiprecision addition
# Operation: c [rdx] = a [rdi] + b [rsi]
$code.=<<___;
.globl ${PREFIX}_mpadd_asm
.type ${PREFIX}_mpadd_asm,\@function,3
${PREFIX}_mpadd_asm:
.cfi_startproc
  mov 0x0(%rdi), %r8
  mov 0x8(%rdi), %r9
  mov 0x10(%rdi), %r10
  mov 0x18(%rdi), %r11
  mov 0x20(%rdi), %rcx
  add 0x0(%rsi), %r8
  adc 0x8(%rsi), %r9
  adc 0x10(%rsi), %r10
  adc 0x18(%rsi), %r11
  adc 0x20(%rsi), %rcx
  mov %r8, 0x0(%rdx)
  mov %r9, 0x8(%rdx)
  mov %r10, 0x10(%rdx)
  mov %r11, 0x18(%rdx)
  mov %rcx, 0x20(%rdx)

  mov 0x28(%rdi), %r8
  mov 0x30(%rdi), %r9
  adc 0x28(%rsi), %r8
  adc 0x30(%rsi), %r9
  mov %r8, 0x28(%rdx)
  mov %r9, 0x30(%rdx)
  ret
.cfi_endproc
___

# 2x434-bit multiprecision subtraction
# Operation: c [rdx] = a [rdi] - b [rsi].
# Returns the borrow mask in rax (0 if no borrow, all ones otherwise).
$code.=<<___;
.globl ${PREFIX}_mpsubx2_asm
.type ${PREFIX}_mpsubx2_asm,\@function,3
${PREFIX}_mpsubx2_asm:
.cfi_startproc
  xor %rax, %rax

  mov 0x0(%rdi), %r8
  mov 0x8(%rdi), %r9
  mov 0x10(%rdi), %r10
  mov 0x18(%rdi), %r11
  mov 0x20(%rdi), %rcx
  sub 0x0(%rsi), %r8
  sbb 0x8(%rsi), %r9
  sbb 0x10(%rsi), %r10
  sbb 0x18(%rsi), %r11
  sbb 0x20(%rsi), %rcx
  mov %r8, 0x0(%rdx)
  mov %r9, 0x8(%rdx)
  mov %r10, 0x10(%rdx)
  mov %r11, 0x18(%rdx)
  mov %rcx, 0x20(%rdx)

  mov 0x28(%rdi), %r8
  mov 0x30(%rdi), %r9
  mov 0x38(%rdi), %r10
  mov 0x40(%rdi), %r11
  mov 0x48(%rdi), %rcx
  sbb 0x28(%rsi), %r8
  sbb 0x30(%rsi), %r9
  sbb 0x38(%rsi), %r10
  sbb 0x40(%rsi), %r11
  sbb 0x48(%rsi), %rcx
  mov %r8, 0x28(%rdx)
  mov %r9, 0x30(%rdx)
  mov %r10, 0x38(%rdx)
  mov %r11, 0x40(%rdx)
  mov %rcx, 0x48(%rdx)

  mov 0x50(%rdi), %r8
  mov 0x58(%rdi), %r9
  mov 0x60(%rdi), %r10
  mov 0x68(%rdi), %r11
  sbb 0x50(%rsi), %r8
  sbb 0x58(%rsi), %r9
  sbb 0x60(%rsi), %r10
  sbb 0x68(%rsi), %r11
  sbb \$0x0, %rax
  mov %r8, 0x50(%rdx)
  mov %r9, 0x58(%rdx)
  mov %r10, 0x60(%rdx)
  mov %r11, 0x68(%rdx)
  ret
.cfi_endproc
___


# Double 2x434-bit multiprecision subtraction
# Operation: c [rdx] = c [rdx] - a [rdi] - b [rsi]
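# The low seven limbs are reduced twice (first c - a, then - b).  Each
# of those subtractions may borrow, so the borrows are accumulated in
# rax (0, 1 or 2) and subtracted from the low limb of the high half
# before the high-half subtractions continue.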
$code.=<<___;
.globl ${PREFIX}_mpdblsubx2_asm
.type ${PREFIX}_mpdblsubx2_asm,\@function,3
${PREFIX}_mpdblsubx2_asm:
.cfi_startproc
  push %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12, -16
  push %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13, -24

  xor %rax, %rax

  # ci:low = c:low - a:low
  mov 0x0(%rdx), %r8
  mov 0x8(%rdx), %r9
  mov 0x10(%rdx), %r10
  mov 0x18(%rdx), %r11
  mov 0x20(%rdx), %r12
  mov 0x28(%rdx), %r13
  mov 0x30(%rdx), %rcx
  sub 0x0(%rdi), %r8
  sbb 0x8(%rdi), %r9
  sbb 0x10(%rdi), %r10
  sbb 0x18(%rdi), %r11
  sbb 0x20(%rdi), %r12
  sbb 0x28(%rdi), %r13
  sbb 0x30(%rdi), %rcx
  adc \$0x0, %rax

  # c:low = ci:low - b:low
  sub 0x0(%rsi), %r8
  sbb 0x8(%rsi), %r9
  sbb 0x10(%rsi), %r10
  sbb 0x18(%rsi), %r11
  sbb 0x20(%rsi), %r12
  sbb 0x28(%rsi), %r13
  sbb 0x30(%rsi), %rcx
  adc \$0x0, %rax

  # store c:low
  mov %r8, 0x0(%rdx)
  mov %r9, 0x8(%rdx)
  mov %r10, 0x10(%rdx)
  mov %r11, 0x18(%rdx)
  mov %r12, 0x20(%rdx)
  mov %r13, 0x28(%rdx)
  mov %rcx, 0x30(%rdx)

  # ci:high = c:high - a:high
  mov 0x38(%rdx), %r8
  mov 0x40(%rdx), %r9
  mov 0x48(%rdx), %r10
  mov 0x50(%rdx), %r11
  mov 0x58(%rdx), %r12
  mov 0x60(%rdx), %r13
  mov 0x68(%rdx), %rcx

  sub %rax, %r8
  sbb 0x38(%rdi), %r8
  sbb 0x40(%rdi), %r9
  sbb 0x48(%rdi), %r10
  sbb 0x50(%rdi), %r11
  sbb 0x58(%rdi), %r12
  sbb 0x60(%rdi), %r13
  sbb 0x68(%rdi), %rcx

  # c:high = ci:high - b:high
  sub 0x38(%rsi), %r8
  sbb 0x40(%rsi), %r9
  sbb 0x48(%rsi), %r10
  sbb 0x50(%rsi), %r11
  sbb 0x58(%rsi), %r12
  sbb 0x60(%rsi), %r13
  sbb 0x68(%rsi), %rcx

  # store c:high
  mov %r8, 0x38(%rdx)
  mov %r9, 0x40(%rdx)
  mov %r10, 0x48(%rdx)
  mov %r11, 0x50(%rdx)
  mov %r12, 0x58(%rdx)
  mov %r13, 0x60(%rdx)
  mov %rcx, 0x68(%rdx)

  pop %r13
.cfi_adjust_cfa_offset -8
  pop %r12
.cfi_adjust_cfa_offset -8
  ret
.cfi_endproc

___

sub redc_common {
  my ($mul01, $mul23, $mul45, $mul67)=@_;
  my $body=<<___;
  $mul01
  xor %rcx, %rcx
  add 0x18(%rdi), %r8
  adc 0x20(%rdi), %r9
  adc 0x28(%rdi), %r10
  adc 0x30(%rdi), %r11
  adc 0x38(%rdi), %r12
  adc 0x40(%rdi), %r13
  adc 0x48(%rdi), %rcx
  mov %r8, 0x18(%rdi)
  mov %r9, 0x20(%rdi)
  mov %r10, 0x28(%rdi)
  mov %r11, 0x30(%rdi)
  mov %r12, 0x38(%rdi)
  mov %r13, 0x40(%rdi)
  mov %rcx, 0x48(%rdi)
  mov 0x50(%rdi), %r8
  mov 0x58(%rdi), %r9
  mov 0x60(%rdi), %r10
  mov 0x68(%rdi), %r11
  adc \$0x0, %r8
  adc \$0x0, %r9
  adc \$0x0, %r10
  adc \$0x0, %r11
  mov %r8, 0x50(%rdi)
  mov %r9, 0x58(%rdi)
  mov %r10, 0x60(%rdi)
  mov %r11, 0x68(%rdi)

  $mul23
  xor %rcx, %rcx
  add 0x28(%rdi), %r8
  adc 0x30(%rdi), %r9
  adc 0x38(%rdi), %r10
  adc 0x40(%rdi), %r11
  adc 0x48(%rdi), %r12
  adc 0x50(%rdi), %r13
  adc 0x58(%rdi), %rcx
  mov %r8, 0x28(%rdi)
  mov %r9, 0x30(%rdi)
  mov %r10, 0x38(%rdi)
  mov %r11, 0x40(%rdi)
  mov %r12, 0x48(%rdi)
  mov %r13, 0x50(%rdi)
  mov %rcx, 0x58(%rdi)
  mov 0x60(%rdi), %r8
  mov 0x68(%rdi), %r9
  adc \$0x0, %r8
  adc \$0x0, %r9
  mov %r8, 0x60(%rdi)
  mov %r9, 0x68(%rdi)

  $mul45
  xor %rcx, %rcx
  add 0x38(%rdi), %r8
  adc 0x40(%rdi), %r9
  adc 0x48(%rdi), %r10
  adc 0x50(%rdi), %r11
  adc 0x58(%rdi), %r12
  adc 0x60(%rdi), %r13
  adc 0x68(%rdi), %rcx
  mov %r8, 0x0(%rsi) # C0
  mov %r9, 0x8(%rsi) # C1
  mov %r10, 0x48(%rdi)
  mov %r11, 0x50(%rdi)
  mov %r12, 0x58(%rdi)
  mov %r13, 0x60(%rdi)
  mov %rcx, 0x68(%rdi)

  $mul67
  add 0x48(%rdi), %r8
  adc 0x50(%rdi), %r9
  adc 0x58(%rdi), %r10
  adc 0x60(%rdi), %r11
  adc 0x68(%rdi), %r12
  mov %r8, 0x10(%rsi) # C2
  mov %r9, 0x18(%rsi) # C3
  mov %r10, 0x20(%rsi) # C4
  mov %r11, 0x28(%rsi) # C5
  mov %r12, 0x30(%rsi) # C6
___
  return $body;
}

# Optimized Montgomery reduction, based on the method described
# in Faz-Hernandez et al., https://eprint.iacr.org/2017/1015.
# Operation: c [rsi] = a [rdi]
# NOTE: a=c is not allowed
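# Sketch of the reduction, for reference only.  The input a [rdi] is a
# 2x434-bit product with a < p434 * 2^448; the output c [rsi] fits in
# seven limbs and is congruent to a * 2^-448 (mod p434).  Because
# p434 + 1 = .Lp434p1 * 2^192 and p434 == -1 (mod 2^64), the Montgomery
# quotient digit of each round is simply the current low limb, and only
# the four non-zero limbs of .Lp434p1 ever need to be multiplied, which
# is all the mul[x]{64,128}x256 helpers above are used for.
#
#   use bigint;
#   my $p = 2**216 * 3**137 - 1;
#   sub fprdc_ref {
#     my ($a) = @_;                   # a < $p * 2**448
#     for (1 .. 7) {                  # one 64-bit digit per round
#       my $q = $a % 2**64;           # -1/p == 1 (mod 2^64)
#       $a = ($a + $q * $p) / 2**64;  # exact: the low limb cancels
#     }
#     return $a;                      # < 2*p434, == a * 2^-448 mod p434
#   }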
sub sike_rdc {
  my $jump_redc_bdw=&alt_impl(".Lrdc_bdw") if ($bmi2_adx);
  # a[0-1] x .Lp434p1 --> result: r8:r13
  my $mulx1=&mulx128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
  # a[2-3] x .Lp434p1 --> result: r8:r13
  my $mulx2=&mulx128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
  # a[4-5] x .Lp434p1 --> result: r8:r13
  my $mulx3=&mulx128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
  # a[6-7] x .Lp434p1 --> result: r8:r13
  my $mulx4=&mulx64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)));

  # a[0-1] x .Lp434p1 --> result: r8:r13
  my $mul1=&mul128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
  # a[2-3] x .Lp434p1 --> result: r8:r13
  my $mul2=&mul128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
  # a[4-5] x .Lp434p1 --> result: r8:r13
  my $mul3=&mul128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
  # a[6-7] x .Lp434p1 --> result: r8:r13
  my $mul4=&mul64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)));

  my $redc_mul=&redc_common($mul1, $mul2, $mul3, $mul4);
  my $redc_bdw=&redc_common($mulx1, $mulx2, $mulx3, $mulx4) if ($bmi2_adx);

  # REDC for Broadwell CPUs
  my $code=<<___;
  .Lrdc_bdw:
  .cfi_startproc
  # sike_fprdc has already pushed r12-r15 by this point.
  .cfi_adjust_cfa_offset 32
  .cfi_offset r12, -16
  .cfi_offset r13, -24
  .cfi_offset r14, -32
  .cfi_offset r15, -40

  $redc_bdw

  pop %r15
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r15
  pop %r14
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r14
  pop %r13
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r13
  pop %r12
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r12
  ret
  .cfi_endproc
___

  # REDC for CPUs older than Broadwell
  $code.=<<___;
  .globl ${PREFIX}_fprdc
  .type ${PREFIX}_fprdc,\@function,3
  ${PREFIX}_fprdc:
  .cfi_startproc
  push %r12
  .cfi_adjust_cfa_offset 8
  .cfi_offset r12, -16
  push %r13
  .cfi_adjust_cfa_offset 8
  .cfi_offset r13, -24
  push %r14
  .cfi_adjust_cfa_offset 8
  .cfi_offset r14, -32
  push %r15
  .cfi_adjust_cfa_offset 8
  .cfi_offset r15, -40

  # Jump to the optimized implementation if
  # the CPU supports ADCX/ADOX/MULX
  $jump_redc_bdw
  # Otherwise use the generic implementation
  $redc_mul

  pop %r15
  .cfi_adjust_cfa_offset -8
  pop %r14
  .cfi_adjust_cfa_offset -8
  pop %r13
  .cfi_adjust_cfa_offset -8
  pop %r12
  .cfi_adjust_cfa_offset -8
  ret
  .cfi_endproc
___
  return $code;
}
$code.=&sike_rdc();

# 434-bit multiplication using Karatsuba (one level),
# schoolbook (one level). Uses MULX/ADOX/ADCX instructions
# available on Broadwell micro-architectures and newer.
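# One-level Karatsuba reference for the two multipliers below (a sketch
# only).  Operands are split at 256 bits; AL x BL and AH x BH are
# schoolbook products (mul256 and mul192), and the middle term comes
# from (AL+AH) x (BL+BH).  The masked additions in the code handle the
# carries out of the 256-bit partial sums, since AL+AH and BL+BH can be
# 257 bits wide.
#
#   use bigint;
#   sub mpmul_ref {
#     my ($a, $b) = @_;               # 0 <= a, b < 2^448
#     my $S = 2**256;
#     my ($al, $ah) = ($a % $S, $a / $S);
#     my ($bl, $bh) = ($b % $S, $b / $S);
#     my $lo  = $al * $bl;
#     my $hi  = $ah * $bh;
#     my $mid = ($al + $ah) * ($bl + $bh) - $lo - $hi;
#     return $lo + $mid * $S + $hi * $S * $S;   # == $a * $b
#   }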
sub mul_bdw {
  # [rsp] <- (AH+AL) x (BH+BL)
  my $mul256_low=&mul256(0,"%rsp",32,"%rsp",0,"%rsp",map("%r$_",(8..15)),"%rbx","%rbp");
  # [rcx] <- AL x BL
  my $mul256_albl=&mul256(0,"%rdi",0,"%rsi",0,"%rcx",map("%r$_",(8..15)),"%rbx","%rbp");
  # [rcx+64] <- AH x BH
  my $mul192_ahbh=&mul192(32,"%rdi",32,"%rsi",64,"%rcx",map("%r$_",(8..14)));

  $body=<<___;

  mov %rdx, %rcx
  xor %rax, %rax

  # r8-r11 <- AH + AL, rax <- mask
  mov 0x0(%rdi), %r8
  mov 0x8(%rdi), %r9
  mov 0x10(%rdi), %r10
  mov 0x18(%rdi), %r11

  push %rbx
  .cfi_adjust_cfa_offset 8
  .cfi_offset rbx, -48
  push %rbp
  .cfi_offset rbp, -56
  .cfi_adjust_cfa_offset 8
  sub \$96, %rsp
  .cfi_adjust_cfa_offset 96

  add 0x20(%rdi), %r8
  adc 0x28(%rdi), %r9
  adc 0x30(%rdi), %r10
  adc \$0x0, %r11
  sbb \$0x0, %rax
  mov %r8, 0x0(%rsp)
  mov %r9, 0x8(%rsp)
  mov %r10, 0x10(%rsp)
  mov %r11, 0x18(%rsp)

  # r12-r15 <- BH + BL, rbx <- mask
  xor %rbx, %rbx
  mov 0x0(%rsi), %r12
  mov 0x8(%rsi), %r13
  mov 0x10(%rsi), %r14
  mov 0x18(%rsi), %r15
  add 0x20(%rsi), %r12
  adc 0x28(%rsi), %r13
  adc 0x30(%rsi), %r14
  adc \$0x0, %r15
  sbb \$0x0, %rbx
  mov %r12, 0x20(%rsp)
  mov %r13, 0x28(%rsp)
  mov %r14, 0x30(%rsp)
  mov %r15, 0x38(%rsp)

  # r12-r15 <- masked (BH + BL)
  and %rax, %r12
  and %rax, %r13
  and %rax, %r14
  and %rax, %r15

  # r8-r11 <- masked (AH + AL)
  and %rbx, %r8
  and %rbx, %r9
  and %rbx, %r10
  and %rbx, %r11

  # r8-r11 <- masked (AH + AL) + masked (BH + BL)
  add %r12, %r8
  adc %r13, %r9
  adc %r14, %r10
  adc %r15, %r11
  mov %r8, 0x40(%rsp)
  mov %r9, 0x48(%rsp)
  mov %r10, 0x50(%rsp)
  mov %r11, 0x58(%rsp)

  # [rsp] <- CM = (AH+AL) x (BH+BL)
  $mul256_low
  # [rcx] <- CL = AL x BL (Result c0-c3)
  $mul256_albl
  # [rcx+64] <- CH = AH x BH
  $mul192_ahbh

  # r8-r11 <- (AH+AL) x (BH+BL), final step
  mov 0x40(%rsp), %r8
  mov 0x48(%rsp), %r9
  mov 0x50(%rsp), %r10
  mov 0x58(%rsp), %r11

  mov 0x20(%rsp), %rax
  add %rax, %r8
  mov 0x28(%rsp), %rax
  adc %rax, %r9
  mov 0x30(%rsp), %rax
  adc %rax, %r10
  mov 0x38(%rsp), %rax
  adc %rax, %r11

  # [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
  mov 0x0(%rsp), %r12
  mov 0x8(%rsp), %r13
  mov 0x10(%rsp), %r14
  mov 0x18(%rsp), %r15
  sub 0x0(%rcx), %r12
  sbb 0x8(%rcx), %r13
  sbb 0x10(%rcx), %r14
  sbb 0x18(%rcx), %r15
  sbb 0x20(%rcx), %r8
  sbb 0x28(%rcx), %r9
  sbb 0x30(%rcx), %r10
  sbb 0x38(%rcx), %r11

  # r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
  sub 0x40(%rcx), %r12
  sbb 0x48(%rcx), %r13
  sbb 0x50(%rcx), %r14
  sbb 0x58(%rcx), %r15
  sbb 0x60(%rcx), %r8
  sbb 0x68(%rcx), %r9
  sbb \$0x0, %r10
  sbb \$0x0, %r11

  add 0x20(%rcx), %r12
  mov %r12, 0x20(%rcx) # Result C4-C7
  adc 0x28(%rcx), %r13
  mov %r13, 0x28(%rcx)
  adc 0x30(%rcx), %r14
  mov %r14, 0x30(%rcx)
  adc 0x38(%rcx), %r15
  mov %r15, 0x38(%rcx)
  adc 0x40(%rcx), %r8
  mov %r8, 0x40(%rcx) # Result C8-C15
  adc 0x48(%rcx), %r9
  mov %r9, 0x48(%rcx)
  adc 0x50(%rcx), %r10
  mov %r10, 0x50(%rcx)
  adc 0x58(%rcx), %r11
  mov %r11, 0x58(%rcx)
  mov 0x60(%rcx), %r12
  adc \$0x0, %r12
  mov %r12, 0x60(%rcx)
  mov 0x68(%rcx), %r13
  adc \$0x0, %r13
  mov %r13, 0x68(%rcx)

  add \$96, %rsp
  .cfi_adjust_cfa_offset -96
  pop %rbp
  .cfi_adjust_cfa_offset -8
  .cfi_same_value rbp
  pop %rbx
  .cfi_adjust_cfa_offset -8
  .cfi_same_value rbx
___
  return $body;
}

# 434-bit multiplication using Karatsuba (one level),
# schoolbook (one level).
sub mul {
  my $code=<<___;
  mov %rdx, %rcx

  sub \$112, %rsp # Allocate space on the stack
  .cfi_adjust_cfa_offset 112

  # rcx[0-3] <- AH+AL
  xor %rax, %rax
  mov 0x20(%rdi), %r8
  mov 0x28(%rdi), %r9
  mov 0x30(%rdi), %r10
  xor %r11, %r11
  add 0x0(%rdi), %r8
  adc 0x8(%rdi), %r9
  adc 0x10(%rdi), %r10
  adc 0x18(%rdi), %r11
  # store AH+AL mask
  sbb \$0, %rax
  mov %rax, 0x40(%rsp)
  # store AH+AL in 0-0x18(rcx)
  mov %r8, 0x0(%rcx)
  mov %r9, 0x8(%rcx)
  mov %r10, 0x10(%rcx)
  mov %r11, 0x18(%rcx)

  # r12-r15 <- BH+BL
  xor %rdx, %rdx
  mov 0x20(%rsi), %r12
  mov 0x28(%rsi), %r13
  mov 0x30(%rsi), %r14
  xor %r15, %r15
  add 0x0(%rsi), %r12
  adc 0x8(%rsi), %r13
  adc 0x10(%rsi), %r14
  adc 0x18(%rsi), %r15
  sbb \$0x0, %rdx
  # store BH+BL mask
  mov %rdx, 0x48(%rsp)

  # (rsp[0-0x38]) <- (AH+AL)*(BH+BL)
  mov (%rcx), %rax
  mul %r12
  mov %rax, (%rsp) # c0
  mov %rdx, %r8

  xor %r9, %r9
  mov (%rcx), %rax
  mul %r13
  add %rax, %r8
  adc %rdx, %r9

  xor %r10, %r10
  mov 0x8(%rcx), %rax
  mul %r12
  add %rax, %r8
  mov %r8, 0x8(%rsp) # c1
  adc %rdx, %r9
  adc \$0x0,%r10

  xor %r8, %r8
  mov (%rcx), %rax
  mul %r14
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x10(%rcx), %rax
  mul %r12
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x8(%rcx), %rax
  mul %r13
  add %rax, %r9
  mov %r9, 0x10(%rsp) # c2
  adc %rdx, %r10
  adc \$0x0, %r8

  xor %r9, %r9
  mov (%rcx),%rax
  mul %r15
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r9

  mov 0x18(%rcx), %rax
  mul %r12
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r9

  mov 0x8(%rcx), %rax
  mul %r14
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r9

  mov 0x10(%rcx), %rax
  mul %r13
  add %rax, %r10
  mov %r10, 0x18(%rsp) # c3
  adc %rdx, %r8
  adc \$0x0, %r9

  xor %r10, %r10
  mov 0x8(%rcx), %rax
  mul %r15
  add %rax, %r8
  adc %rdx, %r9
  adc \$0x0,%r10

  mov 0x18(%rcx), %rax
  mul %r13
  add %rax, %r8
  adc %rdx, %r9
  adc \$0x0,%r10

  mov 0x10(%rcx), %rax
  mul %r14
  add %rax, %r8 # c4
  mov %r8, 0x20(%rsp)
  adc %rdx, %r9
  adc \$0x0,%r10

  xor %r11, %r11
  mov 0x10(%rcx), %rax
  mul %r15
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r11

  mov 0x18(%rcx), %rax
  mul %r14
  add %rax, %r9 # c5
  mov %r9, 0x28(%rsp)
  adc %rdx, %r10
  adc \$0x0,%r11

  mov 0x18(%rcx), %rax
  mul %r15
  add %rax, %r10 # c6
  mov %r10, 0x30(%rsp)
  adc %rdx, %r11 # c7
  mov %r11, 0x38(%rsp)

  # r12-r15 <- masked (BH + BL)
  mov 0x40(%rsp), %rax
  and %rax, %r12
  and %rax, %r13
  and %rax, %r14
  and %rax, %r15

  # r8-r11 <- masked (AH + AL)
  mov 0x48(%rsp),%rax
  mov 0x00(%rcx), %r8
  and %rax, %r8
  mov 0x08(%rcx), %r9
  and %rax, %r9
  mov 0x10(%rcx), %r10
  and %rax, %r10
  mov 0x18(%rcx), %r11
  and %rax, %r11

  # r12-r15 <- masked (AH + AL) + masked (BH + BL)
  add %r8, %r12
  adc %r9, %r13
  adc %r10, %r14
  adc %r11, %r15

  # rsp[0x20-0x38] <- (AH+AL) x (BH+BL) high
  mov 0x20(%rsp), %rax
  add %rax, %r12
  mov 0x28(%rsp), %rax
  adc %rax, %r13
  mov 0x30(%rsp), %rax
  adc %rax, %r14
  mov 0x38(%rsp), %rax
  adc %rax, %r15
  mov %r12, 0x50(%rsp)
  mov %r13, 0x58(%rsp)
  mov %r14, 0x60(%rsp)
  mov %r15, 0x68(%rsp)

  # [rcx] <- CL = AL x BL
  mov (%rdi), %r11
  mov (%rsi), %rax
  mul %r11
  xor %r9, %r9
  mov %rax, (%rcx) # c0
  mov %rdx, %r8

  mov 0x10(%rdi), %r14
  mov 0x8(%rsi), %rax
  mul %r11
  xor %r10, %r10
  add %rax, %r8
  adc %rdx, %r9

  mov 0x8(%rdi), %r12
  mov (%rsi), %rax
  mul %r12
  add %rax, %r8
  mov %r8, 0x8(%rcx) # c1
  adc %rdx, %r9
  adc \$0x0,%r10

  xor %r8, %r8
  mov 0x10(%rsi), %rax
  mul %r11
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r8

  mov (%rsi),%r13
  mov %r14, %rax
  mul %r13
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x8(%rsi), %rax
  mul %r12
  add %rax, %r9
  mov %r9, 0x10(%rcx) # c2
  adc %rdx, %r10
  adc \$0x0,%r8

  xor %r9, %r9
  mov 0x18(%rsi), %rax
  mul %r11
  mov 0x18(%rdi), %r15
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r9

  mov %r15, %rax
  mul %r13
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r9

  mov 0x10(%rsi), %rax
  mul %r12
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r9

  mov 0x8(%rsi), %rax
  mul %r14
  add %rax, %r10
  mov %r10, 0x18(%rcx) # c3
  adc %rdx, %r8
  adc \$0x0,%r9

  xor %r10, %r10
  mov 0x18(%rsi), %rax
  mul %r12
  add %rax, %r8
  adc %rdx, %r9
  adc \$0x0,%r10

  mov 0x8(%rsi), %rax
  mul %r15
  add %rax, %r8
  adc %rdx, %r9
  adc \$0x0,%r10

  mov 0x10(%rsi), %rax
  mul %r14
  add %rax, %r8
  mov %r8, 0x20(%rcx) # c4
  adc %rdx, %r9
  adc \$0x0,%r10

  xor %r8, %r8
  mov 0x18(%rsi), %rax
  mul %r14
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x10(%rsi), %rax
  mul %r15
  add %rax, %r9
  mov %r9, 0x28(%rcx) # c5
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x18(%rsi), %rax
  mul %r15
  add %rax, %r10
  mov %r10, 0x30(%rcx) # c6
  adc %rdx, %r8
  mov %r8, 0x38(%rcx) # c7

  # rcx[0x40-0x68] <- AH*BH
  # multiplies two 192-bit numbers A and B
  mov 0x20(%rdi), %r11
  mov 0x20(%rsi), %rax
  mul %r11
  xor %r9, %r9
  mov %rax, 0x40(%rcx) # c0
  mov %rdx, %r8

  mov 0x30(%rdi), %r14
  mov 0x28(%rsi), %rax
  mul %r11
  xor %r10, %r10
  add %rax, %r8
  adc %rdx, %r9

  mov 0x28(%rdi), %r12
  mov 0x20(%rsi), %rax
  mul %r12
  add %rax, %r8
  mov %r8, 0x48(%rcx) # c1
  adc %rdx, %r9
  adc \$0x0,%r10

  xor %r8, %r8
  mov 0x30(%rsi), %rax
  mul %r11
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x20(%rsi), %r13
  mov %r14, %rax
  mul %r13
  add %rax, %r9
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x28(%rsi), %rax
  mul %r12
  add %rax, %r9
  mov %r9, 0x50(%rcx) # c2
  adc %rdx, %r10
  adc \$0x0,%r8

  mov 0x30(%rsi), %rax
  mul %r12
  xor %r12, %r12
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r12

  mov 0x28(%rsi), %rax
  mul %r14
  add %rax, %r10
  adc %rdx, %r8
  adc \$0x0,%r12
  mov %r10, 0x58(%rcx) # c3

  mov 0x30(%rsi), %rax
  mul %r14
  add %rax, %r8
  adc \$0x0,%r12
  mov %r8, 0x60(%rcx) # c4

  add %r12, %rdx # c5

  # [r8-r15] <- (AH+AL)x(BH+BL) - ALxBL
  mov 0x0(%rsp), %r8
  sub 0x0(%rcx), %r8
  mov 0x8(%rsp), %r9
  sbb 0x8(%rcx), %r9
  mov 0x10(%rsp), %r10
  sbb 0x10(%rcx), %r10
  mov 0x18(%rsp), %r11
  sbb 0x18(%rcx), %r11
  mov 0x50(%rsp), %r12
  sbb 0x20(%rcx), %r12
  mov 0x58(%rsp), %r13
  sbb 0x28(%rcx), %r13
  mov 0x60(%rsp), %r14
  sbb 0x30(%rcx), %r14
  mov 0x68(%rsp), %r15
  sbb 0x38(%rcx), %r15

  # [r8-r15] <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
  mov 0x40(%rcx), %rax
  sub %rax, %r8
  mov 0x48(%rcx), %rax
  sbb %rax, %r9
  mov 0x50(%rcx), %rax
  sbb %rax, %r10
  mov 0x58(%rcx), %rax
  sbb %rax, %r11
  mov 0x60(%rcx), %rax
  sbb %rax, %r12
  sbb %rdx, %r13
  sbb \$0x0,%r14
  sbb \$0x0,%r15

  # Final result
  add 0x20(%rcx), %r8
  mov %r8, 0x20(%rcx) # Result C4-C7
  adc 0x28(%rcx), %r9
  mov %r9, 0x28(%rcx)
  adc 0x30(%rcx), %r10
  mov %r10, 0x30(%rcx)
  adc 0x38(%rcx), %r11
  mov %r11, 0x38(%rcx)
  adc 0x40(%rcx), %r12
  mov %r12, 0x40(%rcx) # Result C8-C13
  adc 0x48(%rcx), %r13
  mov %r13, 0x48(%rcx)
  adc 0x50(%rcx), %r14
  mov %r14, 0x50(%rcx)
  adc 0x58(%rcx), %r15
  mov %r15, 0x58(%rcx)
  mov 0x60(%rcx), %r12
  adc \$0x0, %r12
  mov %r12, 0x60(%rcx)
  adc \$0x0, %rdx
  mov %rdx, 0x68(%rcx)

  add \$112, %rsp # Restore the stack
  .cfi_adjust_cfa_offset -112
___

  return $code;
}

# Integer multiplication based on the Karatsuba method
# Operation: c [rdx] = a [rdi] * b [rsi]
# NOTE: a=c or b=c are not allowed
sub sike_mul {
  my $jump_mul_bdw=&alt_impl(".Lmul_bdw") if ($bmi2_adx);
  # MUL for Broadwell CPUs
  my $mul_bdw=&mul_bdw() if ($bmi2_adx);
  # MUL for CPUs older than Broadwell
  my $mul=&mul();

  my $body=<<___;
  .Lmul_bdw:
  .cfi_startproc
  # sike_mpmul has already pushed r12-r15 by this point.
  .cfi_adjust_cfa_offset 32
  .cfi_offset r12, -16
  .cfi_offset r13, -24
  .cfi_offset r14, -32
  .cfi_offset r15, -40

  $mul_bdw

  pop %r15
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r15
  pop %r14
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r14
  pop %r13
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r13
  pop %r12
  .cfi_adjust_cfa_offset -8
  .cfi_same_value r12
  ret
  .cfi_endproc

  .globl ${PREFIX}_mpmul
  .type ${PREFIX}_mpmul,\@function,3
  ${PREFIX}_mpmul:
  .cfi_startproc
  push %r12
  .cfi_adjust_cfa_offset 8
  .cfi_offset r12, -16
  push %r13
  .cfi_adjust_cfa_offset 8
  .cfi_offset r13, -24
  push %r14
  .cfi_adjust_cfa_offset 8
  .cfi_offset r14, -32
  push %r15
  .cfi_adjust_cfa_offset 8
  .cfi_offset r15, -40

  # Jump to the optimized implementation if
  # the CPU supports ADCX/ADOX/MULX
  $jump_mul_bdw
  # Otherwise use the generic implementation
  $mul

  pop %r15
  .cfi_adjust_cfa_offset -8
  pop %r14
  .cfi_adjust_cfa_offset -8
  pop %r13
  .cfi_adjust_cfa_offset -8
  pop %r12
  .cfi_adjust_cfa_offset -8
  ret
  .cfi_endproc

___
  return $body;
}

$code.=&sike_mul();

foreach (split("\n",$code)) {
  s/\`([^\`]*)\`/eval($1)/ge;
  print $_,"\n";
}

close STDOUT;