#!/usr/bin/env perl
2
3# Copyright (c) 2014, Intel Corporation.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17# Developers and authors:
18# Shay Gueron (1, 2), and Vlad Krasnov (1)
19# (1) Intel Corporation, Israel Development Center
20# (2) University of Haifa
21
22# Reference:
23# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
24# 256 Bit Primes"
25
26# Further optimization by <appro@openssl.org>:
27#
28# this/original
29# Opteron +12-49%
30# Bulldozer +14-45%
31# P4 +18-46%
32# Westmere +12-34%
33# Sandy Bridge +9-35%
34# Ivy Bridge +9-35%
35# Haswell +8-37%
36# Broadwell +18-58%
37# Atom +15-50%
38# VIA Nano +43-160%
39#
40# Ranges denote minimum and maximum improvement coefficients depending
41# on benchmark.
42
43$flavour = shift;
44$output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
56
57# TODO: enable these after testing. $avx goes to two and $addx to one.
58$avx=0;
59$addx=0;
60
61$code.=<<___;
62.text
63.extern OPENSSL_ia32cap_P
64
65# The polynomial
66.align 64
67.Lpoly:
68.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
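# (p = 2^256 - 2^224 + 2^192 + 2^96 - 1, stored least-significant limb first)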
69
.LOne:
71.long 1,1,1,1,1,1,1,1
72.LTwo:
73.long 2,2,2,2,2,2,2,2
74.LThree:
75.long 3,3,3,3,3,3,3,3
76.LONE_mont:
77.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
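# .LONE_mont is 1 in the Montgomery domain, i.e. 2^256 mod p.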
78___
79
80{
my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
82my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
83my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
84
85$code.=<<___;
86
################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
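#
# Computes res = -a mod p: 0 - a is formed first (tracking the borrow),
# p is added back, and the pre-addition value is kept when a was zero so
# that the result stays fully reduced.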
89.globl ecp_nistz256_neg
90.type ecp_nistz256_neg,\@function,2
91.align 32
92ecp_nistz256_neg:
93 push %r12
94 push %r13
95
96 xor $a0, $a0
97 xor $a1, $a1
98 xor $a2, $a2
99 xor $a3, $a3
100 xor $t4, $t4
101
102 sub 8*0($a_ptr), $a0
103 sbb 8*1($a_ptr), $a1
104 sbb 8*2($a_ptr), $a2
105 mov $a0, $t0
106 sbb 8*3($a_ptr), $a3
107 lea .Lpoly(%rip), $a_ptr
108 mov $a1, $t1
109 sbb \$0, $t4
110
111 add 8*0($a_ptr), $a0
112 mov $a2, $t2
113 adc 8*1($a_ptr), $a1
114 adc 8*2($a_ptr), $a2
115 mov $a3, $t3
116 adc 8*3($a_ptr), $a3
117 test $t4, $t4
118
119 cmovz $t0, $a0
120 cmovz $t1, $a1
121 mov $a0, 8*0($r_ptr)
122 cmovz $t2, $a2
123 mov $a1, 8*1($r_ptr)
124 cmovz $t3, $a3
125 mov $a2, 8*2($r_ptr)
126 mov $a3, 8*3($r_ptr)
127
128 pop %r13
129 pop %r12
130 ret
131.size ecp_nistz256_neg,.-ecp_nistz256_neg
132___
133}
134{
135my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
136my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
137my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
138my ($poly1,$poly3)=($acc6,$acc7);
139
140$code.=<<___;
141################################################################################
# void ecp_nistz256_mul_mont(
143# uint64_t res[4],
144# uint64_t a[4],
145# uint64_t b[4]);
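#
# Computes res = a * b * 2^-256 mod p, i.e. a Montgomery multiplication
# with R = 2^256; inputs and outputs are in Montgomery form.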
146
147.globl ecp_nistz256_mul_mont
148.type ecp_nistz256_mul_mont,\@function,3
149.align 32
150ecp_nistz256_mul_mont:
151___
152$code.=<<___ if ($addx);
153 mov \$0x80100, %ecx
154 and OPENSSL_ia32cap_P+8(%rip), %ecx
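	# 0x80100 selects the ADX (bit 19) and BMI2 (bit 8) feature bits in
	# the third word of OPENSSL_ia32cap_P; both are required for the
	# mulx/adcx/adox path below.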
155___
156$code.=<<___;
157.Lmul_mont:
158 push %rbp
159 push %rbx
160 push %r12
161 push %r13
162 push %r14
163 push %r15
164___
165$code.=<<___ if ($addx);
166 cmp \$0x80100, %ecx
167 je .Lmul_montx
168___
169$code.=<<___;
170 mov $b_org, $b_ptr
171 mov 8*0($b_org), %rax
172 mov 8*0($a_ptr), $acc1
173 mov 8*1($a_ptr), $acc2
174 mov 8*2($a_ptr), $acc3
175 mov 8*3($a_ptr), $acc4
176
177 call __ecp_nistz256_mul_montq
178___
179$code.=<<___ if ($addx);
180 jmp .Lmul_mont_done
181
182.align 32
183.Lmul_montx:
184 mov $b_org, $b_ptr
185 mov 8*0($b_org), %rdx
186 mov 8*0($a_ptr), $acc1
187 mov 8*1($a_ptr), $acc2
188 mov 8*2($a_ptr), $acc3
189 mov 8*3($a_ptr), $acc4
190 lea -128($a_ptr), $a_ptr # control u-op density
191
192 call __ecp_nistz256_mul_montx
193___
194$code.=<<___;
195.Lmul_mont_done:
196 pop %r15
197 pop %r14
198 pop %r13
199 pop %r12
200 pop %rbx
201 pop %rbp
202 ret
203.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
204
205.type __ecp_nistz256_mul_montq,\@abi-omnipotent
206.align 32
207__ecp_nistz256_mul_montq:
208 ########################################################################
209 # Multiply a by b[0]
210 mov %rax, $t1
211 mulq $acc1
212 mov .Lpoly+8*1(%rip),$poly1
213 mov %rax, $acc0
214 mov $t1, %rax
215 mov %rdx, $acc1
216
217 mulq $acc2
218 mov .Lpoly+8*3(%rip),$poly3
219 add %rax, $acc1
220 mov $t1, %rax
221 adc \$0, %rdx
222 mov %rdx, $acc2
223
224 mulq $acc3
225 add %rax, $acc2
226 mov $t1, %rax
227 adc \$0, %rdx
228 mov %rdx, $acc3
229
230 mulq $acc4
231 add %rax, $acc3
232 mov $acc0, %rax
233 adc \$0, %rdx
234 xor $acc5, $acc5
235 mov %rdx, $acc4
236
237 ########################################################################
238 # First reduction step
239 # Basically now we want to multiply acc[0] by p256,
240 # and add the result to the acc.
241 # Due to the special form of p256 we do some optimizations
242 #
243 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
244 # then we add acc[0] and get acc[0] x 2^96
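	#
	# In more detail: p256[0] + p256[1]*2^64 = 2^96 - 1 and p256[2] = 0,
	# so adding acc[0]*p256 to the accumulator cancels the low limb
	# (acc[0]*(2^96 - 1) contributes -acc[0] there) and what remains is
	# acc[0]<<96, handled by the shl/shr pair below, plus a single
	# multiplication acc[0]*p256[3], handled by the mulq against $poly3.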
245
246 mov $acc0, $t1
247 shl \$32, $acc0
248 mulq $poly3
249 shr \$32, $t1
250 add $acc0, $acc1 # +=acc[0]<<96
251 adc $t1, $acc2
252 adc %rax, $acc3
253 mov 8*1($b_ptr), %rax
254 adc %rdx, $acc4
255 adc \$0, $acc5
256 xor $acc0, $acc0
257
258 ########################################################################
259 # Multiply by b[1]
260 mov %rax, $t1
261 mulq 8*0($a_ptr)
262 add %rax, $acc1
263 mov $t1, %rax
264 adc \$0, %rdx
265 mov %rdx, $t0
266
267 mulq 8*1($a_ptr)
268 add $t0, $acc2
269 adc \$0, %rdx
270 add %rax, $acc2
271 mov $t1, %rax
272 adc \$0, %rdx
273 mov %rdx, $t0
274
275 mulq 8*2($a_ptr)
276 add $t0, $acc3
277 adc \$0, %rdx
278 add %rax, $acc3
279 mov $t1, %rax
280 adc \$0, %rdx
281 mov %rdx, $t0
282
283 mulq 8*3($a_ptr)
284 add $t0, $acc4
285 adc \$0, %rdx
286 add %rax, $acc4
287 mov $acc1, %rax
288 adc %rdx, $acc5
289 adc \$0, $acc0
290
291 ########################################################################
	# Second reduction step
	mov	$acc1, $t1
294 shl \$32, $acc1
295 mulq $poly3
296 shr \$32, $t1
297 add $acc1, $acc2
298 adc $t1, $acc3
299 adc %rax, $acc4
300 mov 8*2($b_ptr), %rax
301 adc %rdx, $acc5
302 adc \$0, $acc0
303 xor $acc1, $acc1
304
305 ########################################################################
306 # Multiply by b[2]
307 mov %rax, $t1
308 mulq 8*0($a_ptr)
309 add %rax, $acc2
310 mov $t1, %rax
311 adc \$0, %rdx
312 mov %rdx, $t0
313
314 mulq 8*1($a_ptr)
315 add $t0, $acc3
316 adc \$0, %rdx
317 add %rax, $acc3
318 mov $t1, %rax
319 adc \$0, %rdx
320 mov %rdx, $t0
321
322 mulq 8*2($a_ptr)
323 add $t0, $acc4
324 adc \$0, %rdx
325 add %rax, $acc4
326 mov $t1, %rax
327 adc \$0, %rdx
328 mov %rdx, $t0
329
330 mulq 8*3($a_ptr)
331 add $t0, $acc5
332 adc \$0, %rdx
333 add %rax, $acc5
334 mov $acc2, %rax
335 adc %rdx, $acc0
336 adc \$0, $acc1
337
338 ########################################################################
	# Third reduction step
	mov	$acc2, $t1
341 shl \$32, $acc2
342 mulq $poly3
343 shr \$32, $t1
344 add $acc2, $acc3
345 adc $t1, $acc4
346 adc %rax, $acc5
347 mov 8*3($b_ptr), %rax
348 adc %rdx, $acc0
349 adc \$0, $acc1
350 xor $acc2, $acc2
351
352 ########################################################################
353 # Multiply by b[3]
354 mov %rax, $t1
355 mulq 8*0($a_ptr)
356 add %rax, $acc3
357 mov $t1, %rax
358 adc \$0, %rdx
359 mov %rdx, $t0
360
361 mulq 8*1($a_ptr)
362 add $t0, $acc4
363 adc \$0, %rdx
364 add %rax, $acc4
365 mov $t1, %rax
366 adc \$0, %rdx
367 mov %rdx, $t0
368
369 mulq 8*2($a_ptr)
370 add $t0, $acc5
371 adc \$0, %rdx
372 add %rax, $acc5
373 mov $t1, %rax
374 adc \$0, %rdx
375 mov %rdx, $t0
376
377 mulq 8*3($a_ptr)
378 add $t0, $acc0
379 adc \$0, %rdx
380 add %rax, $acc0
381 mov $acc3, %rax
382 adc %rdx, $acc1
383 adc \$0, $acc2
384
385 ########################################################################
	# Final reduction step
	mov	$acc3, $t1
388 shl \$32, $acc3
389 mulq $poly3
390 shr \$32, $t1
391 add $acc3, $acc4
392 adc $t1, $acc5
393 mov $acc4, $t0
394 adc %rax, $acc0
395 adc %rdx, $acc1
396 mov $acc5, $t1
397 adc \$0, $acc2
398
	########################################################################
	# Branch-less conditional subtraction of P
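	#
	# The accumulated value is < 2*P, so one subtraction of P suffices;
	# the cmov chain keeps the pre-subtraction copy when the subtraction
	# borrows.  A rough C sketch of the idea (an illustration only, not
	# part of the generated code; helper names are hypothetical):
	#
	#   copy(tmp, acc);                     /* save acc          */
	#   borrow = sub_4x64(acc, acc, P);     /* acc -= P          */
	#   if (borrow) copy(acc, tmp);         /* done with cmovc   */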
401 sub \$-1, $acc4 # .Lpoly[0]
402 mov $acc0, $t2
403 sbb $poly1, $acc5 # .Lpoly[1]
404 sbb \$0, $acc0 # .Lpoly[2]
405 mov $acc1, $t3
406 sbb $poly3, $acc1 # .Lpoly[3]
407 sbb \$0, $acc2
408
409 cmovc $t0, $acc4
410 cmovc $t1, $acc5
411 mov $acc4, 8*0($r_ptr)
412 cmovc $t2, $acc0
413 mov $acc5, 8*1($r_ptr)
414 cmovc $t3, $acc1
415 mov $acc0, 8*2($r_ptr)
416 mov $acc1, 8*3($r_ptr)
417
418 ret
419.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
420
421################################################################################
422# void ecp_nistz256_sqr_mont(
423# uint64_t res[4],
424# uint64_t a[4]);
425
426# we optimize the square according to S.Gueron and V.Krasnov,
427# "Speeding up Big-Number Squaring"
428.globl ecp_nistz256_sqr_mont
429.type ecp_nistz256_sqr_mont,\@function,2
430.align 32
431ecp_nistz256_sqr_mont:
432___
433$code.=<<___ if ($addx);
434 mov \$0x80100, %ecx
435 and OPENSSL_ia32cap_P+8(%rip), %ecx
436___
437$code.=<<___;
438 push %rbp
439 push %rbx
440 push %r12
441 push %r13
442 push %r14
443 push %r15
444___
445$code.=<<___ if ($addx);
446 cmp \$0x80100, %ecx
447 je .Lsqr_montx
448___
449$code.=<<___;
450 mov 8*0($a_ptr), %rax
451 mov 8*1($a_ptr), $acc6
452 mov 8*2($a_ptr), $acc7
453 mov 8*3($a_ptr), $acc0
454
455 call __ecp_nistz256_sqr_montq
456___
457$code.=<<___ if ($addx);
458 jmp .Lsqr_mont_done
459
460.align 32
461.Lsqr_montx:
462 mov 8*0($a_ptr), %rdx
463 mov 8*1($a_ptr), $acc6
464 mov 8*2($a_ptr), $acc7
465 mov 8*3($a_ptr), $acc0
466 lea -128($a_ptr), $a_ptr # control u-op density
467
468 call __ecp_nistz256_sqr_montx
469___
470$code.=<<___;
471.Lsqr_mont_done:
472 pop %r15
473 pop %r14
474 pop %r13
475 pop %r12
476 pop %rbx
477 pop %rbp
478 ret
479.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
480
481.type __ecp_nistz256_sqr_montq,\@abi-omnipotent
482.align 32
483__ecp_nistz256_sqr_montq:
484 mov %rax, $acc5
485 mulq $acc6 # a[1]*a[0]
486 mov %rax, $acc1
487 mov $acc7, %rax
488 mov %rdx, $acc2
489
490 mulq $acc5 # a[0]*a[2]
491 add %rax, $acc2
492 mov $acc0, %rax
493 adc \$0, %rdx
494 mov %rdx, $acc3
495
496 mulq $acc5 # a[0]*a[3]
497 add %rax, $acc3
498 mov $acc7, %rax
499 adc \$0, %rdx
500 mov %rdx, $acc4
501
502 #################################
503 mulq $acc6 # a[1]*a[2]
504 add %rax, $acc3
505 mov $acc0, %rax
506 adc \$0, %rdx
507 mov %rdx, $t1
508
509 mulq $acc6 # a[1]*a[3]
510 add %rax, $acc4
511 mov $acc0, %rax
512 adc \$0, %rdx
513 add $t1, $acc4
514 mov %rdx, $acc5
515 adc \$0, $acc5
516
517 #################################
518 mulq $acc7 # a[2]*a[3]
519 xor $acc7, $acc7
520 add %rax, $acc5
521 mov 8*0($a_ptr), %rax
522 mov %rdx, $acc6
523 adc \$0, $acc6
524
525 add $acc1, $acc1 # acc1:6<<1
526 adc $acc2, $acc2
527 adc $acc3, $acc3
528 adc $acc4, $acc4
529 adc $acc5, $acc5
530 adc $acc6, $acc6
531 adc \$0, $acc7
532
533 mulq %rax
534 mov %rax, $acc0
535 mov 8*1($a_ptr), %rax
536 mov %rdx, $t0
537
538 mulq %rax
539 add $t0, $acc1
540 adc %rax, $acc2
541 mov 8*2($a_ptr), %rax
542 adc \$0, %rdx
543 mov %rdx, $t0
544
545 mulq %rax
546 add $t0, $acc3
547 adc %rax, $acc4
548 mov 8*3($a_ptr), %rax
549 adc \$0, %rdx
550 mov %rdx, $t0
551
552 mulq %rax
553 add $t0, $acc5
554 adc %rax, $acc6
555 mov $acc0, %rax
556 adc %rdx, $acc7
557
558 mov .Lpoly+8*1(%rip), $a_ptr
559 mov .Lpoly+8*3(%rip), $t1
560
561 ##########################################
562 # Now the reduction
563 # First iteration
564 mov $acc0, $t0
565 shl \$32, $acc0
566 mulq $t1
567 shr \$32, $t0
568 add $acc0, $acc1 # +=acc[0]<<96
569 adc $t0, $acc2
570 adc %rax, $acc3
571 mov $acc1, %rax
572 adc \$0, %rdx
573
574 ##########################################
575 # Second iteration
576 mov $acc1, $t0
577 shl \$32, $acc1
578 mov %rdx, $acc0
579 mulq $t1
580 shr \$32, $t0
581 add $acc1, $acc2
582 adc $t0, $acc3
583 adc %rax, $acc0
584 mov $acc2, %rax
585 adc \$0, %rdx
586
587 ##########################################
588 # Third iteration
589 mov $acc2, $t0
590 shl \$32, $acc2
591 mov %rdx, $acc1
592 mulq $t1
593 shr \$32, $t0
594 add $acc2, $acc3
595 adc $t0, $acc0
596 adc %rax, $acc1
597 mov $acc3, %rax
598 adc \$0, %rdx
599
600 ###########################################
601 # Last iteration
602 mov $acc3, $t0
603 shl \$32, $acc3
604 mov %rdx, $acc2
605 mulq $t1
606 shr \$32, $t0
607 add $acc3, $acc0
608 adc $t0, $acc1
609 adc %rax, $acc2
610 adc \$0, %rdx
611 xor $acc3, $acc3
612
613 ############################################
614 # Add the rest of the acc
615 add $acc0, $acc4
616 adc $acc1, $acc5
617 mov $acc4, $acc0
618 adc $acc2, $acc6
619 adc %rdx, $acc7
620 mov $acc5, $acc1
621 adc \$0, $acc3
622
623 sub \$-1, $acc4 # .Lpoly[0]
624 mov $acc6, $acc2
625 sbb $a_ptr, $acc5 # .Lpoly[1]
626 sbb \$0, $acc6 # .Lpoly[2]
627 mov $acc7, $t0
628 sbb $t1, $acc7 # .Lpoly[3]
629 sbb \$0, $acc3
630
631 cmovc $acc0, $acc4
632 cmovc $acc1, $acc5
633 mov $acc4, 8*0($r_ptr)
634 cmovc $acc2, $acc6
635 mov $acc5, 8*1($r_ptr)
636 cmovc $t0, $acc7
637 mov $acc6, 8*2($r_ptr)
638 mov $acc7, 8*3($r_ptr)
639
640 ret
641.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
642___
643
644if ($addx) {
645$code.=<<___;
646.type __ecp_nistz256_mul_montx,\@abi-omnipotent
647.align 32
648__ecp_nistz256_mul_montx:
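	# This path relies on BMI2 (mulx) and ADX (adcx/adox) so that two
	# independent carry chains can be interleaved.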
649 ########################################################################
650 # Multiply by b[0]
651 mulx $acc1, $acc0, $acc1
652 mulx $acc2, $t0, $acc2
653 mov \$32, $poly1
654 xor $acc5, $acc5 # cf=0
655 mulx $acc3, $t1, $acc3
656 mov .Lpoly+8*3(%rip), $poly3
657 adc $t0, $acc1
658 mulx $acc4, $t0, $acc4
659 mov $acc0, %rdx
660 adc $t1, $acc2
661 shlx $poly1,$acc0,$t1
662 adc $t0, $acc3
663 shrx $poly1,$acc0,$t0
664 adc \$0, $acc4
665
666 ########################################################################
667 # First reduction step
668 add $t1, $acc1
669 adc $t0, $acc2
670
671 mulx $poly3, $t0, $t1
672 mov 8*1($b_ptr), %rdx
673 adc $t0, $acc3
674 adc $t1, $acc4
675 adc \$0, $acc5
676 xor $acc0, $acc0 # $acc0=0,cf=0,of=0
677
678 ########################################################################
679 # Multiply by b[1]
680 mulx 8*0+128($a_ptr), $t0, $t1
681 adcx $t0, $acc1
682 adox $t1, $acc2
683
684 mulx 8*1+128($a_ptr), $t0, $t1
685 adcx $t0, $acc2
686 adox $t1, $acc3
687
688 mulx 8*2+128($a_ptr), $t0, $t1
689 adcx $t0, $acc3
690 adox $t1, $acc4
691
692 mulx 8*3+128($a_ptr), $t0, $t1
693 mov $acc1, %rdx
694 adcx $t0, $acc4
695 shlx $poly1, $acc1, $t0
696 adox $t1, $acc5
697 shrx $poly1, $acc1, $t1
698
699 adcx $acc0, $acc5
700 adox $acc0, $acc0
701 adc \$0, $acc0
702
703 ########################################################################
704 # Second reduction step
705 add $t0, $acc2
706 adc $t1, $acc3
707
708 mulx $poly3, $t0, $t1
709 mov 8*2($b_ptr), %rdx
710 adc $t0, $acc4
711 adc $t1, $acc5
712 adc \$0, $acc0
713 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
714
715 ########################################################################
716 # Multiply by b[2]
717 mulx 8*0+128($a_ptr), $t0, $t1
718 adcx $t0, $acc2
719 adox $t1, $acc3
720
721 mulx 8*1+128($a_ptr), $t0, $t1
722 adcx $t0, $acc3
723 adox $t1, $acc4
724
725 mulx 8*2+128($a_ptr), $t0, $t1
726 adcx $t0, $acc4
727 adox $t1, $acc5
728
729 mulx 8*3+128($a_ptr), $t0, $t1
730 mov $acc2, %rdx
731 adcx $t0, $acc5
732 shlx $poly1, $acc2, $t0
733 adox $t1, $acc0
734 shrx $poly1, $acc2, $t1
735
736 adcx $acc1, $acc0
737 adox $acc1, $acc1
738 adc \$0, $acc1
739
740 ########################################################################
741 # Third reduction step
742 add $t0, $acc3
743 adc $t1, $acc4
744
745 mulx $poly3, $t0, $t1
746 mov 8*3($b_ptr), %rdx
747 adc $t0, $acc5
748 adc $t1, $acc0
749 adc \$0, $acc1
750 xor $acc2, $acc2 # $acc2=0,cf=0,of=0
751
752 ########################################################################
753 # Multiply by b[3]
754 mulx 8*0+128($a_ptr), $t0, $t1
755 adcx $t0, $acc3
756 adox $t1, $acc4
757
758 mulx 8*1+128($a_ptr), $t0, $t1
759 adcx $t0, $acc4
760 adox $t1, $acc5
761
762 mulx 8*2+128($a_ptr), $t0, $t1
763 adcx $t0, $acc5
764 adox $t1, $acc0
765
766 mulx 8*3+128($a_ptr), $t0, $t1
767 mov $acc3, %rdx
768 adcx $t0, $acc0
769 shlx $poly1, $acc3, $t0
770 adox $t1, $acc1
771 shrx $poly1, $acc3, $t1
772
773 adcx $acc2, $acc1
774 adox $acc2, $acc2
775 adc \$0, $acc2
776
777 ########################################################################
778 # Fourth reduction step
779 add $t0, $acc4
780 adc $t1, $acc5
781
782 mulx $poly3, $t0, $t1
783 mov $acc4, $t2
784 mov .Lpoly+8*1(%rip), $poly1
785 adc $t0, $acc0
786 mov $acc5, $t3
787 adc $t1, $acc1
788 adc \$0, $acc2
789
790 ########################################################################
791 # Branch-less conditional subtraction of P
792 xor %eax, %eax
793 mov $acc0, $t0
794 sbb \$-1, $acc4 # .Lpoly[0]
795 sbb $poly1, $acc5 # .Lpoly[1]
796 sbb \$0, $acc0 # .Lpoly[2]
797 mov $acc1, $t1
798 sbb $poly3, $acc1 # .Lpoly[3]
799 sbb \$0, $acc2
800
801 cmovc $t2, $acc4
802 cmovc $t3, $acc5
803 mov $acc4, 8*0($r_ptr)
804 cmovc $t0, $acc0
805 mov $acc5, 8*1($r_ptr)
806 cmovc $t1, $acc1
807 mov $acc0, 8*2($r_ptr)
808 mov $acc1, 8*3($r_ptr)
809
810 ret
811.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
812
813.type __ecp_nistz256_sqr_montx,\@abi-omnipotent
814.align 32
815__ecp_nistz256_sqr_montx:
816 mulx $acc6, $acc1, $acc2 # a[0]*a[1]
817 mulx $acc7, $t0, $acc3 # a[0]*a[2]
818 xor %eax, %eax
819 adc $t0, $acc2
820 mulx $acc0, $t1, $acc4 # a[0]*a[3]
821 mov $acc6, %rdx
822 adc $t1, $acc3
823 adc \$0, $acc4
824 xor $acc5, $acc5 # $acc5=0,cf=0,of=0
825
826 #################################
827 mulx $acc7, $t0, $t1 # a[1]*a[2]
828 adcx $t0, $acc3
829 adox $t1, $acc4
830
831 mulx $acc0, $t0, $t1 # a[1]*a[3]
832 mov $acc7, %rdx
833 adcx $t0, $acc4
834 adox $t1, $acc5
835 adc \$0, $acc5
836
837 #################################
838 mulx $acc0, $t0, $acc6 # a[2]*a[3]
839 mov 8*0+128($a_ptr), %rdx
840 xor $acc7, $acc7 # $acc7=0,cf=0,of=0
841 adcx $acc1, $acc1 # acc1:6<<1
842 adox $t0, $acc5
843 adcx $acc2, $acc2
844 adox $acc7, $acc6 # of=0
845
846 mulx %rdx, $acc0, $t1
847 mov 8*1+128($a_ptr), %rdx
848 adcx $acc3, $acc3
849 adox $t1, $acc1
850 adcx $acc4, $acc4
851 mulx %rdx, $t0, $t4
852 mov 8*2+128($a_ptr), %rdx
853 adcx $acc5, $acc5
854 adox $t0, $acc2
855 adcx $acc6, $acc6
856 .byte 0x67
857 mulx %rdx, $t0, $t1
858 mov 8*3+128($a_ptr), %rdx
859 adox $t4, $acc3
860 adcx $acc7, $acc7
861 adox $t0, $acc4
862 mov \$32, $a_ptr
863 adox $t1, $acc5
864 .byte 0x67,0x67
865 mulx %rdx, $t0, $t4
	mov	.Lpoly+8*3(%rip), %rdx
	adox	$t0, $acc6
868 shlx $a_ptr, $acc0, $t0
869 adox $t4, $acc7
870 shrx $a_ptr, $acc0, $t4
	mov	%rdx, $t1

873 # reduction step 1
874 add $t0, $acc1
875 adc $t4, $acc2
876
	mulx	$acc0, $t0, $acc0
	adc	$t0, $acc3
879 shlx $a_ptr, $acc1, $t0
880 adc \$0, $acc0
881 shrx $a_ptr, $acc1, $t4
882
883 # reduction step 2
884 add $t0, $acc2
885 adc $t4, $acc3
886
	mulx	$acc1, $t0, $acc1
	adc	$t0, $acc0
889 shlx $a_ptr, $acc2, $t0
890 adc \$0, $acc1
891 shrx $a_ptr, $acc2, $t4
892
893 # reduction step 3
894 add $t0, $acc3
895 adc $t4, $acc0
896
	mulx	$acc2, $t0, $acc2
	adc	$t0, $acc1
899 shlx $a_ptr, $acc3, $t0
900 adc \$0, $acc2
901 shrx $a_ptr, $acc3, $t4
902
903 # reduction step 4
904 add $t0, $acc0
905 adc $t4, $acc1
906
	mulx	$acc3, $t0, $acc3
	adc	$t0, $acc2
909 adc \$0, $acc3
910
	xor	$t3, $t3
	add	$acc0, $acc4		# accumulate upper half
	mov	.Lpoly+8*1(%rip), $a_ptr
914 adc $acc1, $acc5
915 mov $acc4, $acc0
916 adc $acc2, $acc6
917 adc $acc3, $acc7
918 mov $acc5, $acc1
919 adc \$0, $t3
920
	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
923 sbb $a_ptr, $acc5 # .Lpoly[1]
924 sbb \$0, $acc6 # .Lpoly[2]
925 mov $acc7, $acc3
926 sbb $t1, $acc7 # .Lpoly[3]
927 sbb \$0, $t3
928
929 cmovc $acc0, $acc4
930 cmovc $acc1, $acc5
931 mov $acc4, 8*0($r_ptr)
932 cmovc $acc2, $acc6
933 mov $acc5, 8*1($r_ptr)
934 cmovc $acc3, $acc7
935 mov $acc6, 8*2($r_ptr)
936 mov $acc7, 8*3($r_ptr)
937
938 ret
939.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
940___
941}
942}
943{
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
945my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
946my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
947my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
948
949$code.=<<___;
950################################################################################
951# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
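#
# Constant-time lookup: all 16 table entries are read and masked against
# the result of a pcmpeqd comparison with the requested index, so the
# memory access pattern does not depend on the (secret) index.  A rough
# C sketch of the idea (an illustration only; helper names are
# hypothetical):
#
#   memset(val, 0, 96);
#   for (i = 1; i <= 16; i++) {
#     mask = eq_mask(i, index);            /* all-ones or all-zeros */
#     for (j = 0; j < 12; j++) val[j] |= in_t[(i-1)*12 + j] & mask;
#   }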
952.globl ecp_nistz256_select_w5
953.type ecp_nistz256_select_w5,\@abi-omnipotent
954.align 32
955ecp_nistz256_select_w5:
956___
957$code.=<<___ if ($avx>1);
958 mov OPENSSL_ia32cap_P+8(%rip), %eax
959 test \$`1<<5`, %eax
960 jnz .Lavx2_select_w5
961___
962$code.=<<___ if ($win64);
963 lea -0x88(%rsp), %rax
964.LSEH_begin_ecp_nistz256_select_w5:
965 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
966 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
967 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
968 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
969 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
970 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
971 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
972 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
973 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
974 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
975 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
976___
977$code.=<<___;
978 movdqa .LOne(%rip), $ONE
979 movd $index, $INDEX
980
981 pxor $Ra, $Ra
982 pxor $Rb, $Rb
983 pxor $Rc, $Rc
984 pxor $Rd, $Rd
985 pxor $Re, $Re
986 pxor $Rf, $Rf
987
988 movdqa $ONE, $M0
989 pshufd \$0, $INDEX, $INDEX
990
991 mov \$16, %rax
992.Lselect_loop_sse_w5:
993
994 movdqa $M0, $TMP0
995 paddd $ONE, $M0
996 pcmpeqd $INDEX, $TMP0
997
998 movdqa 16*0($in_t), $T0a
999 movdqa 16*1($in_t), $T0b
1000 movdqa 16*2($in_t), $T0c
1001 movdqa 16*3($in_t), $T0d
1002 movdqa 16*4($in_t), $T0e
1003 movdqa 16*5($in_t), $T0f
1004 lea 16*6($in_t), $in_t
1005
1006 pand $TMP0, $T0a
1007 pand $TMP0, $T0b
1008 por $T0a, $Ra
1009 pand $TMP0, $T0c
1010 por $T0b, $Rb
1011 pand $TMP0, $T0d
1012 por $T0c, $Rc
1013 pand $TMP0, $T0e
1014 por $T0d, $Rd
1015 pand $TMP0, $T0f
1016 por $T0e, $Re
1017 por $T0f, $Rf
1018
1019 dec %rax
1020 jnz .Lselect_loop_sse_w5
1021
1022 movdqu $Ra, 16*0($val)
1023 movdqu $Rb, 16*1($val)
1024 movdqu $Rc, 16*2($val)
1025 movdqu $Rd, 16*3($val)
1026 movdqu $Re, 16*4($val)
1027 movdqu $Rf, 16*5($val)
1028___
1029$code.=<<___ if ($win64);
1030 movaps (%rsp), %xmm6
1031 movaps 0x10(%rsp), %xmm7
1032 movaps 0x20(%rsp), %xmm8
1033 movaps 0x30(%rsp), %xmm9
1034 movaps 0x40(%rsp), %xmm10
1035 movaps 0x50(%rsp), %xmm11
1036 movaps 0x60(%rsp), %xmm12
1037 movaps 0x70(%rsp), %xmm13
1038 movaps 0x80(%rsp), %xmm14
1039 movaps 0x90(%rsp), %xmm15
1040 lea 0xa8(%rsp), %rsp
1041.LSEH_end_ecp_nistz256_select_w5:
1042___
1043$code.=<<___;
1044 ret
1045.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1046
1047################################################################################
1048# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
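#
# Same constant-time technique as ecp_nistz256_select_w5, but over 64
# affine-point entries of 64 bytes each.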
1049.globl ecp_nistz256_select_w7
1050.type ecp_nistz256_select_w7,\@abi-omnipotent
1051.align 32
1052ecp_nistz256_select_w7:
1053___
1054$code.=<<___ if ($avx>1);
1055 mov OPENSSL_ia32cap_P+8(%rip), %eax
1056 test \$`1<<5`, %eax
1057 jnz .Lavx2_select_w7
1058___
1059$code.=<<___ if ($win64);
1060 lea -0x88(%rsp), %rax
1061.LSEH_begin_ecp_nistz256_select_w7:
1062 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1063 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
1064 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
1065 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
1066 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
1067 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
1068 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
1069 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
1070 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
1071 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
1072 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
1073___
1074$code.=<<___;
1075 movdqa .LOne(%rip), $M0
1076 movd $index, $INDEX
1077
1078 pxor $Ra, $Ra
1079 pxor $Rb, $Rb
1080 pxor $Rc, $Rc
1081 pxor $Rd, $Rd
1082
1083 movdqa $M0, $ONE
1084 pshufd \$0, $INDEX, $INDEX
1085 mov \$64, %rax
1086
1087.Lselect_loop_sse_w7:
1088 movdqa $M0, $TMP0
1089 paddd $ONE, $M0
1090 movdqa 16*0($in_t), $T0a
1091 movdqa 16*1($in_t), $T0b
1092 pcmpeqd $INDEX, $TMP0
1093 movdqa 16*2($in_t), $T0c
1094 movdqa 16*3($in_t), $T0d
1095 lea 16*4($in_t), $in_t
1096
1097 pand $TMP0, $T0a
1098 pand $TMP0, $T0b
1099 por $T0a, $Ra
1100 pand $TMP0, $T0c
1101 por $T0b, $Rb
1102 pand $TMP0, $T0d
1103 por $T0c, $Rc
1104 prefetcht0 255($in_t)
1105 por $T0d, $Rd
1106
1107 dec %rax
1108 jnz .Lselect_loop_sse_w7
1109
1110 movdqu $Ra, 16*0($val)
1111 movdqu $Rb, 16*1($val)
1112 movdqu $Rc, 16*2($val)
1113 movdqu $Rd, 16*3($val)
1114___
1115$code.=<<___ if ($win64);
1116 movaps (%rsp), %xmm6
1117 movaps 0x10(%rsp), %xmm7
1118 movaps 0x20(%rsp), %xmm8
1119 movaps 0x30(%rsp), %xmm9
1120 movaps 0x40(%rsp), %xmm10
1121 movaps 0x50(%rsp), %xmm11
1122 movaps 0x60(%rsp), %xmm12
1123 movaps 0x70(%rsp), %xmm13
1124 movaps 0x80(%rsp), %xmm14
1125 movaps 0x90(%rsp), %xmm15
1126 lea 0xa8(%rsp), %rsp
1127.LSEH_end_ecp_nistz256_select_w7:
1128___
1129$code.=<<___;
1130 ret
1131.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1132___
1133}
1134if ($avx>1) {
1135my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1136my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
1137my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
1138my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
1139
1140$code.=<<___;
1141################################################################################
1142# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
1143.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
1144.align 32
1145ecp_nistz256_avx2_select_w5:
1146.Lavx2_select_w5:
1147 vzeroupper
1148___
1149$code.=<<___ if ($win64);
1150 lea -0x88(%rsp), %rax
1151.LSEH_begin_ecp_nistz256_avx2_select_w5:
1152 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1153 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
1154 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
1155 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax)
1156 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
1157 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
1158 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
1159 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
1160 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
1161 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
1162 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
1163___
1164$code.=<<___;
1165 vmovdqa .LTwo(%rip), $TWO
1166
1167 vpxor $Ra, $Ra, $Ra
1168 vpxor $Rb, $Rb, $Rb
1169 vpxor $Rc, $Rc, $Rc
1170
1171 vmovdqa .LOne(%rip), $M0
1172 vmovdqa .LTwo(%rip), $M1
1173
1174 vmovd $index, %xmm1
1175 vpermd $INDEX, $Ra, $INDEX
1176
1177 mov \$8, %rax
1178.Lselect_loop_avx2_w5:
1179
1180 vmovdqa 32*0($in_t), $T0a
1181 vmovdqa 32*1($in_t), $T0b
1182 vmovdqa 32*2($in_t), $T0c
1183
1184 vmovdqa 32*3($in_t), $T1a
1185 vmovdqa 32*4($in_t), $T1b
1186 vmovdqa 32*5($in_t), $T1c
1187
1188 vpcmpeqd $INDEX, $M0, $TMP0
1189 vpcmpeqd $INDEX, $M1, $TMP1
1190
1191 vpaddd $TWO, $M0, $M0
1192 vpaddd $TWO, $M1, $M1
1193 lea 32*6($in_t), $in_t
1194
1195 vpand $TMP0, $T0a, $T0a
1196 vpand $TMP0, $T0b, $T0b
1197 vpand $TMP0, $T0c, $T0c
1198 vpand $TMP1, $T1a, $T1a
1199 vpand $TMP1, $T1b, $T1b
1200 vpand $TMP1, $T1c, $T1c
1201
1202 vpxor $T0a, $Ra, $Ra
1203 vpxor $T0b, $Rb, $Rb
1204 vpxor $T0c, $Rc, $Rc
1205 vpxor $T1a, $Ra, $Ra
1206 vpxor $T1b, $Rb, $Rb
1207 vpxor $T1c, $Rc, $Rc
1208
1209 dec %rax
1210 jnz .Lselect_loop_avx2_w5
1211
1212 vmovdqu $Ra, 32*0($val)
1213 vmovdqu $Rb, 32*1($val)
1214 vmovdqu $Rc, 32*2($val)
1215 vzeroupper
1216___
1217$code.=<<___ if ($win64);
1218 movaps (%rsp), %xmm6
1219 movaps 0x10(%rsp), %xmm7
1220 movaps 0x20(%rsp), %xmm8
1221 movaps 0x30(%rsp), %xmm9
1222 movaps 0x40(%rsp), %xmm10
1223 movaps 0x50(%rsp), %xmm11
1224 movaps 0x60(%rsp), %xmm12
1225 movaps 0x70(%rsp), %xmm13
1226 movaps 0x80(%rsp), %xmm14
1227 movaps 0x90(%rsp), %xmm15
1228 lea 0xa8(%rsp), %rsp
1229.LSEH_end_ecp_nistz256_avx2_select_w5:
1230___
1231$code.=<<___;
1232 ret
1233.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
1234___
1235}
1236if ($avx>1) {
1237my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1238my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1239my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1240my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1241my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1242
1243$code.=<<___;
1244
1245################################################################################
1246# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
1247.globl ecp_nistz256_avx2_select_w7
1248.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
1249.align 32
1250ecp_nistz256_avx2_select_w7:
1251.Lavx2_select_w7:
1252 vzeroupper
1253___
1254$code.=<<___ if ($win64);
1255 lea -0x88(%rsp), %rax
1256.LSEH_begin_ecp_nistz256_avx2_select_w7:
1257 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1258 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
1259 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
1260 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax)
1261 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
1262 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
1263 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
1264 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
1265 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
1266 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
1267 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
1268___
1269$code.=<<___;
1270 vmovdqa .LThree(%rip), $THREE
1271
1272 vpxor $Ra, $Ra, $Ra
1273 vpxor $Rb, $Rb, $Rb
1274
1275 vmovdqa .LOne(%rip), $M0
1276 vmovdqa .LTwo(%rip), $M1
1277 vmovdqa .LThree(%rip), $M2
1278
1279 vmovd $index, %xmm1
1280 vpermd $INDEX, $Ra, $INDEX
1281 # Skip index = 0, because it is implicitly the point at infinity
1282
1283 mov \$21, %rax
1284.Lselect_loop_avx2_w7:
1285
1286 vmovdqa 32*0($in_t), $T0a
1287 vmovdqa 32*1($in_t), $T0b
1288
1289 vmovdqa 32*2($in_t), $T1a
1290 vmovdqa 32*3($in_t), $T1b
1291
1292 vmovdqa 32*4($in_t), $T2a
1293 vmovdqa 32*5($in_t), $T2b
1294
1295 vpcmpeqd $INDEX, $M0, $TMP0
1296 vpcmpeqd $INDEX, $M1, $TMP1
1297 vpcmpeqd $INDEX, $M2, $TMP2
1298
1299 vpaddd $THREE, $M0, $M0
1300 vpaddd $THREE, $M1, $M1
1301 vpaddd $THREE, $M2, $M2
1302 lea 32*6($in_t), $in_t
1303
1304 vpand $TMP0, $T0a, $T0a
1305 vpand $TMP0, $T0b, $T0b
1306 vpand $TMP1, $T1a, $T1a
1307 vpand $TMP1, $T1b, $T1b
1308 vpand $TMP2, $T2a, $T2a
1309 vpand $TMP2, $T2b, $T2b
1310
1311 vpxor $T0a, $Ra, $Ra
1312 vpxor $T0b, $Rb, $Rb
1313 vpxor $T1a, $Ra, $Ra
1314 vpxor $T1b, $Rb, $Rb
1315 vpxor $T2a, $Ra, $Ra
1316 vpxor $T2b, $Rb, $Rb
1317
1318 dec %rax
1319 jnz .Lselect_loop_avx2_w7
1320
1321
1322 vmovdqa 32*0($in_t), $T0a
1323 vmovdqa 32*1($in_t), $T0b
1324
1325 vpcmpeqd $INDEX, $M0, $TMP0
1326
1327 vpand $TMP0, $T0a, $T0a
1328 vpand $TMP0, $T0b, $T0b
1329
1330 vpxor $T0a, $Ra, $Ra
1331 vpxor $T0b, $Rb, $Rb
1332
1333 vmovdqu $Ra, 32*0($val)
1334 vmovdqu $Rb, 32*1($val)
1335 vzeroupper
1336___
1337$code.=<<___ if ($win64);
1338 movaps (%rsp), %xmm6
1339 movaps 0x10(%rsp), %xmm7
1340 movaps 0x20(%rsp), %xmm8
1341 movaps 0x30(%rsp), %xmm9
1342 movaps 0x40(%rsp), %xmm10
1343 movaps 0x50(%rsp), %xmm11
1344 movaps 0x60(%rsp), %xmm12
1345 movaps 0x70(%rsp), %xmm13
1346 movaps 0x80(%rsp), %xmm14
1347 movaps 0x90(%rsp), %xmm15
1348 lea 0xa8(%rsp), %rsp
1349.LSEH_end_ecp_nistz256_avx2_select_w7:
1350___
1351$code.=<<___;
1352 ret
1353.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1354___
1355} else {
1356$code.=<<___;
1357.globl ecp_nistz256_avx2_select_w7
1358.type ecp_nistz256_avx2_select_w7,\@function,3
1359.align 32
1360ecp_nistz256_avx2_select_w7:
1361 .byte 0x0f,0x0b # ud2
1362 ret
1363.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1364___
1365}
1366{{{
1367########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine. The key to performance here is to let the
# out-of-order execution logic overlap computations from the next step
# with tail processing from the current step. By using a tailored calling
# sequence we minimize inter-step overhead and give the processor a
# better shot at overlapping operations...
#
# You will notice that input data is copied to the stack. The trouble is
# that there are no registers to spare for holding the original pointers,
# and reloading them would create undesired dependencies on the effective
# address calculation paths. In other words, it's all done to favour
# out-of-order execution logic.
#							<appro@openssl.org>
1381
1382my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
1383my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
1384my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
1385my ($poly1,$poly3)=($acc6,$acc7);
1386
1387sub load_for_mul () {
1388my ($a,$b,$src0) = @_;
1389my $bias = $src0 eq "%rax" ? 0 : -128;
1390
1391" mov $b, $src0
1392 lea $b, $b_ptr
1393 mov 8*0+$a, $acc1
1394 mov 8*1+$a, $acc2
1395 lea $bias+$a, $a_ptr
1396 mov 8*2+$a, $acc3
1397 mov 8*3+$a, $acc4"
1398}
1399
1400sub load_for_sqr () {
1401my ($a,$src0) = @_;
1402my $bias = $src0 eq "%rax" ? 0 : -128;
1403
1404" mov 8*0+$a, $src0
1405 mov 8*1+$a, $acc6
1406 lea $bias+$a, $a_ptr
1407 mov 8*2+$a, $acc7
1408 mov 8*3+$a, $acc0"
1409}
1410
1411 {
1412########################################################################
1413# operate in 4-5-0-1 "name space" that matches multiplication output
1414#
1415my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1416
1417$code.=<<___;
1418.type __ecp_nistz256_add_toq,\@abi-omnipotent
1419.align 32
1420__ecp_nistz256_add_toq:
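	# Computes a + b mod p: plain addition with the carry kept in $t4,
	# followed by one conditional subtraction of p so the result stays
	# fully reduced (both inputs are assumed to be < p).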
	xor	$t4, $t4
	add	8*0($b_ptr), $a0
1423 adc 8*1($b_ptr), $a1
1424 mov $a0, $t0
1425 adc 8*2($b_ptr), $a2
1426 adc 8*3($b_ptr), $a3
1427 mov $a1, $t1
	adc	\$0, $t4

1430 sub \$-1, $a0
1431 mov $a2, $t2
1432 sbb $poly1, $a1
1433 sbb \$0, $a2
1434 mov $a3, $t3
1435 sbb $poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
1445 mov $a3, 8*3($r_ptr)
1446
1447 ret
1448.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1449
1450.type __ecp_nistz256_sub_fromq,\@abi-omnipotent
1451.align 32
1452__ecp_nistz256_sub_fromq:
1453 sub 8*0($b_ptr), $a0
1454 sbb 8*1($b_ptr), $a1
1455 mov $a0, $t0
1456 sbb 8*2($b_ptr), $a2
1457 sbb 8*3($b_ptr), $a3
1458 mov $a1, $t1
1459 sbb $t4, $t4
1460
1461 add \$-1, $a0
1462 mov $a2, $t2
1463 adc $poly1, $a1
1464 adc \$0, $a2
1465 mov $a3, $t3
1466 adc $poly3, $a3
1467 test $t4, $t4
1468
1469 cmovz $t0, $a0
1470 cmovz $t1, $a1
1471 mov $a0, 8*0($r_ptr)
1472 cmovz $t2, $a2
1473 mov $a1, 8*1($r_ptr)
1474 cmovz $t3, $a3
1475 mov $a2, 8*2($r_ptr)
1476 mov $a3, 8*3($r_ptr)
1477
1478 ret
1479.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1480
1481.type __ecp_nistz256_subq,\@abi-omnipotent
1482.align 32
1483__ecp_nistz256_subq:
1484 sub $a0, $t0
1485 sbb $a1, $t1
1486 mov $t0, $a0
1487 sbb $a2, $t2
1488 sbb $a3, $t3
1489 mov $t1, $a1
1490 sbb $t4, $t4
1491
1492 add \$-1, $t0
1493 mov $t2, $a2
1494 adc $poly1, $t1
1495 adc \$0, $t2
1496 mov $t3, $a3
1497 adc $poly3, $t3
1498 test $t4, $t4
1499
1500 cmovnz $t0, $a0
1501 cmovnz $t1, $a1
1502 cmovnz $t2, $a2
1503 cmovnz $t3, $a3
1504
1505 ret
1506.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
1507
1508.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
1509.align 32
1510__ecp_nistz256_mul_by_2q:
	xor	$t4, $t4
	add	$a0, $a0		# a0:a3+a0:a3
1513 adc $a1, $a1
1514 mov $a0, $t0
1515 adc $a2, $a2
1516 adc $a3, $a3
1517 mov $a1, $t1
	adc	\$0, $t4

1520 sub \$-1, $a0
1521 mov $a2, $t2
1522 sbb $poly1, $a1
1523 sbb \$0, $a2
1524 mov $a3, $t3
1525 sbb $poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
1535 mov $a3, 8*3($r_ptr)
1536
1537 ret
1538.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1539___
1540 }
1541sub gen_double () {
1542 my $x = shift;
1543 my ($src0,$sfx,$bias);
1544 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
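# The sequence below follows the usual Jacobian doubling formulas for
# a = -3 curves (a sketch of what the calls compute):
#   S  = 4*X*Y^2,  M = 3*(X + Z^2)*(X - Z^2)
#   X3 = M^2 - 2*S
#   Y3 = M*(S - X3) - 8*Y^4
#   Z3 = 2*Y*Z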
1545
1546 if ($x ne "x") {
1547 $src0 = "%rax";
1548 $sfx = "";
1549 $bias = 0;
1550
1551$code.=<<___;
1552.globl ecp_nistz256_point_double
1553.type ecp_nistz256_point_double,\@function,2
1554.align 32
1555ecp_nistz256_point_double:
1556___
1557$code.=<<___ if ($addx);
1558 mov \$0x80100, %ecx
1559 and OPENSSL_ia32cap_P+8(%rip), %ecx
1560 cmp \$0x80100, %ecx
1561 je .Lpoint_doublex
1562___
1563 } else {
1564 $src0 = "%rdx";
1565 $sfx = "x";
1566 $bias = 128;
1567
1568$code.=<<___;
1569.type ecp_nistz256_point_doublex,\@function,2
1570.align 32
1571ecp_nistz256_point_doublex:
1572.Lpoint_doublex:
1573___
1574 }
1575$code.=<<___;
1576 push %rbp
1577 push %rbx
1578 push %r12
1579 push %r13
1580 push %r14
1581 push %r15
1582 sub \$32*5+8, %rsp
1583
.Lpoint_double_shortcut$x:
	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
1586 mov $a_ptr, $b_ptr # backup copy
1587 movdqu 0x10($a_ptr), %xmm1
1588 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
1589 mov 0x20+8*1($a_ptr), $acc5
1590 mov 0x20+8*2($a_ptr), $acc0
1591 mov 0x20+8*3($a_ptr), $acc1
1592 mov .Lpoly+8*1(%rip), $poly1
1593 mov .Lpoly+8*3(%rip), $poly3
1594 movdqa %xmm0, $in_x(%rsp)
1595 movdqa %xmm1, $in_x+0x10(%rsp)
1596 lea 0x20($r_ptr), $acc2
1597 lea 0x40($r_ptr), $acc3
1598 movq $r_ptr, %xmm0
1599 movq $acc2, %xmm1
1600 movq $acc3, %xmm2
1601
1602 lea $S(%rsp), $r_ptr
1603 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
1604
1605 mov 0x40+8*0($a_ptr), $src0
1606 mov 0x40+8*1($a_ptr), $acc6
1607 mov 0x40+8*2($a_ptr), $acc7
1608 mov 0x40+8*3($a_ptr), $acc0
1609 lea 0x40-$bias($a_ptr), $a_ptr
1610 lea $Zsqr(%rsp), $r_ptr
1611 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
1612
1613 `&load_for_sqr("$S(%rsp)", "$src0")`
1614 lea $S(%rsp), $r_ptr
1615 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
1616
1617 mov 0x20($b_ptr), $src0 # $b_ptr is still valid
1618 mov 0x40+8*0($b_ptr), $acc1
1619 mov 0x40+8*1($b_ptr), $acc2
1620 mov 0x40+8*2($b_ptr), $acc3
1621 mov 0x40+8*3($b_ptr), $acc4
1622 lea 0x40-$bias($b_ptr), $a_ptr
1623 lea 0x20($b_ptr), $b_ptr
1624 movq %xmm2, $r_ptr
1625 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
1626 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
1627
1628 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1629 mov $in_x+8*1(%rsp), $acc5
1630 lea $Zsqr(%rsp), $b_ptr
1631 mov $in_x+8*2(%rsp), $acc0
1632 mov $in_x+8*3(%rsp), $acc1
1633 lea $M(%rsp), $r_ptr
1634 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
1635
1636 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1637 mov $in_x+8*1(%rsp), $acc5
1638 lea $Zsqr(%rsp), $b_ptr
1639 mov $in_x+8*2(%rsp), $acc0
1640 mov $in_x+8*3(%rsp), $acc1
1641 lea $Zsqr(%rsp), $r_ptr
1642 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
1643
1644 `&load_for_sqr("$S(%rsp)", "$src0")`
1645 movq %xmm1, $r_ptr
1646 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
1647___
{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
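#
# Halving mod p: if the value is odd, p is added first (the sum is even,
# with a possible carry into the spare register), and the 257-bit result
# is then shifted right by one bit.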
1650# operate in 4-5-6-7 "name space" that matches squaring output
1651#
1652my ($poly1,$poly3)=($a_ptr,$t1);
1653my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
1654
1655$code.=<<___;
1656 xor $t4, $t4
1657 mov $a0, $t0
1658 add \$-1, $a0
1659 mov $a1, $t1
1660 adc $poly1, $a1
1661 mov $a2, $t2
1662 adc \$0, $a2
1663 mov $a3, $t3
1664 adc $poly3, $a3
1665 adc \$0, $t4
1666 xor $a_ptr, $a_ptr # borrow $a_ptr
1667 test \$1, $t0
1668
1669 cmovz $t0, $a0
1670 cmovz $t1, $a1
1671 cmovz $t2, $a2
1672 cmovz $t3, $a3
1673 cmovz $a_ptr, $t4
1674
1675 mov $a1, $t0 # a0:a3>>1
1676 shr \$1, $a0
1677 shl \$63, $t0
1678 mov $a2, $t1
1679 shr \$1, $a1
1680 or $t0, $a0
1681 shl \$63, $t1
1682 mov $a3, $t2
1683 shr \$1, $a2
1684 or $t1, $a1
1685 shl \$63, $t2
1686 mov $a0, 8*0($r_ptr)
1687 shr \$1, $a3
1688 mov $a1, 8*1($r_ptr)
1689 shl \$63, $t4
1690 or $t2, $a2
1691 or $t4, $a3
1692 mov $a2, 8*2($r_ptr)
1693 mov $a3, 8*3($r_ptr)
1694___
1695}
1696$code.=<<___;
1697 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1698 lea $M(%rsp), $r_ptr
1699 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
1700
1701 lea $tmp0(%rsp), $r_ptr
1702 call __ecp_nistz256_mul_by_2$x
1703
1704 lea $M(%rsp), $b_ptr
1705 lea $M(%rsp), $r_ptr
1706 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
1707
1708 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1709 lea $S(%rsp), $r_ptr
1710 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
1711
1712 lea $tmp0(%rsp), $r_ptr
1713 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
1714
1715 `&load_for_sqr("$M(%rsp)", "$src0")`
1716 movq %xmm0, $r_ptr
1717 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
1718
1719 lea $tmp0(%rsp), $b_ptr
1720 mov $acc6, $acc0 # harmonize sqr output and sub input
1721 mov $acc7, $acc1
1722 mov $a_ptr, $poly1
1723 mov $t1, $poly3
1724 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
1725
1726 mov $S+8*0(%rsp), $t0
1727 mov $S+8*1(%rsp), $t1
1728 mov $S+8*2(%rsp), $t2
1729 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
1730 lea $S(%rsp), $r_ptr
1731 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
1732
1733 mov $M(%rsp), $src0
1734 lea $M(%rsp), $b_ptr
1735 mov $acc4, $acc6 # harmonize sub output and mul input
1736 xor %ecx, %ecx
	mov	$acc4, $S+8*0(%rsp)	# have to save:-(
	mov	$acc5, $acc2
1739 mov $acc5, $S+8*1(%rsp)
1740 cmovz $acc0, $acc3
1741 mov $acc0, $S+8*2(%rsp)
1742 lea $S-$bias(%rsp), $a_ptr
1743 cmovz $acc1, $acc4
1744 mov $acc1, $S+8*3(%rsp)
1745 mov $acc6, $acc1
1746 lea $S(%rsp), $r_ptr
1747 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
1748
1749 movq %xmm1, $b_ptr
1750 movq %xmm1, $r_ptr
1751 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
1752
1753 add \$32*5+8, %rsp
1754 pop %r15
1755 pop %r14
1756 pop %r13
1757 pop %r12
1758 pop %rbx
1759 pop %rbp
1760 ret
1761.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1762___
1763}
1764&gen_double("q");
1765
1766sub gen_add () {
1767 my $x = shift;
1768 my ($src0,$sfx,$bias);
1769 my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1770 $U1,$U2,$S1,$S2,
1771 $res_x,$res_y,$res_z,
1772 $in1_x,$in1_y,$in1_z,
1773 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1774 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
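	# The sequence below follows the usual Jacobian point addition
	# formulas (a sketch of what the calls compute):
	#   U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3
	#   H  = U2 - U1,  R = S2 - S1
	#   X3 = R^2 - H^3 - 2*U1*H^2
	#   Y3 = R*(U1*H^2 - X3) - S1*H^3
	#   Z3 = H*Z1*Z2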
1775
1776 if ($x ne "x") {
1777 $src0 = "%rax";
1778 $sfx = "";
1779 $bias = 0;
1780
1781$code.=<<___;
1782.globl ecp_nistz256_point_add
1783.type ecp_nistz256_point_add,\@function,3
1784.align 32
1785ecp_nistz256_point_add:
1786___
1787$code.=<<___ if ($addx);
1788 mov \$0x80100, %ecx
1789 and OPENSSL_ia32cap_P+8(%rip), %ecx
1790 cmp \$0x80100, %ecx
1791 je .Lpoint_addx
1792___
1793 } else {
1794 $src0 = "%rdx";
1795 $sfx = "x";
1796 $bias = 128;
1797
1798$code.=<<___;
1799.type ecp_nistz256_point_addx,\@function,3
1800.align 32
1801ecp_nistz256_point_addx:
1802.Lpoint_addx:
1803___
1804 }
1805$code.=<<___;
1806 push %rbp
1807 push %rbx
1808 push %r12
1809 push %r13
1810 push %r14
1811 push %r15
1812 sub \$32*18+8, %rsp
1813
1814 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1815 movdqu 0x10($a_ptr), %xmm1
1816 movdqu 0x20($a_ptr), %xmm2
1817 movdqu 0x30($a_ptr), %xmm3
1818 movdqu 0x40($a_ptr), %xmm4
1819 movdqu 0x50($a_ptr), %xmm5
1820 mov $a_ptr, $b_ptr # reassign
1821 mov $b_org, $a_ptr # reassign
1822 movdqa %xmm0, $in1_x(%rsp)
1823 movdqa %xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$b_ptr
	pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($a_ptr), %xmm1
1833 movdqu 0x20($a_ptr), %xmm2
1834 por %xmm3, %xmm5
1835 movdqu 0x30($a_ptr), %xmm3
1836 mov 0x40+8*0($a_ptr), $src0 # load original in2_z
1837 mov 0x40+8*1($a_ptr), $acc6
1838 mov 0x40+8*2($a_ptr), $acc7
1839 mov 0x40+8*3($a_ptr), $acc0
1840 movdqa %xmm0, $in2_x(%rsp)
1841 pshufd \$0x1e, %xmm5, %xmm4
1842 movdqa %xmm1, $in2_x+0x10(%rsp)
	movdqu	0x40($a_ptr),%xmm0	# in2_z again
	movdqu	0x50($a_ptr),%xmm1
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm0, %xmm1
	movq	$r_ptr, %xmm0		# save $r_ptr

1852 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
1853 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
1854 mov $acc6, $in2_z+8*1(%rsp)
1855 mov $acc7, $in2_z+8*2(%rsp)
1856 mov $acc0, $in2_z+8*3(%rsp)
1857 lea $Z2sqr(%rsp), $r_ptr # Z2^2
1858 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
1859
1860 pcmpeqd %xmm4, %xmm5
	pshufd	\$0xb1, %xmm1, %xmm4
	por	%xmm1, %xmm4
	pshufd	\$0, %xmm5, %xmm5	# in1infty
1864 pshufd \$0x1e, %xmm4, %xmm3
1865 por %xmm3, %xmm4
1866 pxor %xmm3, %xmm3
1867 pcmpeqd %xmm3, %xmm4
1868 pshufd \$0, %xmm4, %xmm4 # in2infty
1869 mov 0x40+8*0($b_ptr), $src0 # load original in1_z
1870 mov 0x40+8*1($b_ptr), $acc6
1871 mov 0x40+8*2($b_ptr), $acc7
1872 mov 0x40+8*3($b_ptr), $acc0
	movq	$b_ptr, %xmm1

1875 lea 0x40-$bias($b_ptr), $a_ptr
1876 lea $Z1sqr(%rsp), $r_ptr # Z1^2
1877 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
1878
1879 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
1880 lea $S1(%rsp), $r_ptr # S1 = Z2^3
1881 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
1882
1883 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1884 lea $S2(%rsp), $r_ptr # S2 = Z1^3
1885 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
1886
1887 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
1888 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
1889 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
1890
1891 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1892 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
1893 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
1894
1895 lea $S1(%rsp), $b_ptr
1896 lea $R(%rsp), $r_ptr # R = S2 - S1
1897 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
1898
1899 or $acc5, $acc4 # see if result is zero
1900 movdqa %xmm4, %xmm2
1901 or $acc0, $acc4
1902 or $acc1, $acc4
1903 por %xmm5, %xmm2 # in1infty || in2infty
1904 movq $acc4, %xmm3
1905
1906 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1907 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
1908 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
1909
1910 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
1911 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
1912 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
1913
1914 lea $U1(%rsp), $b_ptr
1915 lea $H(%rsp), $r_ptr # H = U2 - U1
1916 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
1917
1918 or $acc5, $acc4 # see if result is zero
1919 or $acc0, $acc4
1920 or $acc1, $acc4
1921
1922 .byte 0x3e # predict taken
1923 jnz .Ladd_proceed$x # is_equal(U1,U2)?
1924 movq %xmm2, $acc0
1925 movq %xmm3, $acc1
1926 test $acc0, $acc0
1927 jnz .Ladd_proceed$x # (in1infty || in2infty)?
1928 test $acc1, $acc1
	jz	.Ladd_double$x		# is_equal(S1,S2)?

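	# Reaching this point means U1 == U2, neither input was the point
	# at infinity, and S1 != S2: the inputs are each other's negation,
	# so the all-zero point at infinity is written out below.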
1931 movq %xmm0, $r_ptr # restore $r_ptr
1932 pxor %xmm0, %xmm0
1933 movdqu %xmm0, 0x00($r_ptr)
1934 movdqu %xmm0, 0x10($r_ptr)
1935 movdqu %xmm0, 0x20($r_ptr)
1936 movdqu %xmm0, 0x30($r_ptr)
1937 movdqu %xmm0, 0x40($r_ptr)
1938 movdqu %xmm0, 0x50($r_ptr)
1939 jmp .Ladd_done$x
1940
1941.align 32
.Ladd_double$x:
1943 movq %xmm1, $a_ptr # restore $a_ptr
1944 movq %xmm0, $r_ptr # restore $r_ptr
1945 add \$`32*(18-5)`, %rsp # difference in frame sizes
1946 jmp .Lpoint_double_shortcut$x
1947
1948.align 32
.Ladd_proceed$x:
1950 `&load_for_sqr("$R(%rsp)", "$src0")`
1951 lea $Rsqr(%rsp), $r_ptr # R^2
1952 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
1953
1954 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1955 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1956 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
1957
1958 `&load_for_sqr("$H(%rsp)", "$src0")`
1959 lea $Hsqr(%rsp), $r_ptr # H^2
1960 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
1961
1962 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
1963 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1964 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
1965
1966 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
1967 lea $Hcub(%rsp), $r_ptr # H^3
1968 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
1969
1970 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
1971 lea $U2(%rsp), $r_ptr # U1*H^2
1972 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
1973___
1974{
1975#######################################################################
1976# operate in 4-5-0-1 "name space" that matches multiplication output
1977#
1978my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1979my ($poly1, $poly3)=($acc6,$acc7);
1980
1981$code.=<<___;
1982 #lea $U2(%rsp), $a_ptr
1983 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
1984 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
1985
	xor	$t4, $t4
	add	$acc0, $acc0		# a0:a3+a0:a3
1988 lea $Rsqr(%rsp), $a_ptr
1989 adc $acc1, $acc1
1990 mov $acc0, $t0
1991 adc $acc2, $acc2
1992 adc $acc3, $acc3
1993 mov $acc1, $t1
	adc	\$0, $t4

1996 sub \$-1, $acc0
1997 mov $acc2, $t2
1998 sbb $poly1, $acc1
1999 sbb \$0, $acc2
2000 mov $acc3, $t3
2001 sbb $poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3
2012
2013 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
2014
2015 lea $Hcub(%rsp), $b_ptr
2016 lea $res_x(%rsp), $r_ptr
2017 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
2018
2019 mov $U2+8*0(%rsp), $t0
2020 mov $U2+8*1(%rsp), $t1
2021 mov $U2+8*2(%rsp), $t2
2022 mov $U2+8*3(%rsp), $t3
2023 lea $res_y(%rsp), $r_ptr
2024
2025 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
2026
2027 mov $acc0, 8*0($r_ptr) # save the result, as
2028 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
2029 mov $acc2, 8*2($r_ptr)
2030 mov $acc3, 8*3($r_ptr)
2031___
2032}
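# The 4-5-0-1 block above folds ecp_nistz256_mul_by_2(Hsqr, U2) into the
# instruction stream: U2 = U1*H^2 is doubled with the carry kept in $t4,
# the modulus is subtracted speculatively, and cmovc keeps whichever of the
# two values is the canonical (reduced) one, while the loads of Rsqr for
# the following subtraction are interleaved for free.  A minimal C sketch
# of just that doubling step (illustrative only, assuming a compiler with
# unsigned __int128; this is not the code the script emits):
#
#	static const uint64_t P256[4] = {	/* the P-256 prime, little-endian */
#		0xffffffffffffffffull, 0x00000000ffffffffull,
#		0x0000000000000000ull, 0xffffffff00000001ull };
#
#	/* r = 2*a mod p, for a already reduced below p */
#	static void p256_mul_by_2(uint64_t r[4], const uint64_t a[4])
#	{
#		uint64_t sum[4], dif[4], carry = 0, borrow = 0, mask;
#		for (int i = 0; i < 4; i++) {		/* sum = 2*a */
#			unsigned __int128 t = (unsigned __int128)a[i] + a[i] + carry;
#			sum[i] = (uint64_t)t;
#			carry  = (uint64_t)(t >> 64);
#		}
#		for (int i = 0; i < 4; i++) {		/* dif = sum - p */
#			unsigned __int128 t = (unsigned __int128)sum[i] - P256[i] - borrow;
#			dif[i] = (uint64_t)t;
#			borrow = (uint64_t)(t >> 64) & 1;
#		}
#		mask = 0 - (uint64_t)(borrow > carry);	/* all-ones iff 2*a < p */
#		for (int i = 0; i < 4; i++)
#			r[i] = (sum[i] & mask) | (dif[i] & ~mask);
#	}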
2033$code.=<<___;
2034 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2035 lea $S2(%rsp), $r_ptr
2036 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
2037
2038 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2039 lea $res_y(%rsp), $r_ptr
2040 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
2041
2042 lea $S2(%rsp), $b_ptr
2043 lea $res_y(%rsp), $r_ptr
2044 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
2045
2046 movq %xmm0, $r_ptr # restore $r_ptr
2047
2048 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
2049 movdqa %xmm5, %xmm1
2050 pandn $res_z(%rsp), %xmm0
2051 movdqa %xmm5, %xmm2
2052 pandn $res_z+0x10(%rsp), %xmm1
2053 movdqa %xmm5, %xmm3
2054 pand $in2_z(%rsp), %xmm2
2055 pand $in2_z+0x10(%rsp), %xmm3
2056 por %xmm0, %xmm2
2057 por %xmm1, %xmm3
2058
2059 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
2060 movdqa %xmm4, %xmm1
2061 pandn %xmm2, %xmm0
2062 movdqa %xmm4, %xmm2
2063 pandn %xmm3, %xmm1
2064 movdqa %xmm4, %xmm3
2065 pand $in1_z(%rsp), %xmm2
2066 pand $in1_z+0x10(%rsp), %xmm3
2067 por %xmm0, %xmm2
2068 por %xmm1, %xmm3
2069 movdqu %xmm2, 0x40($r_ptr)
2070 movdqu %xmm3, 0x50($r_ptr)
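	# The pandn/pand/por triples above and below implement copy_conditional():
	# each output coordinate is replaced by the second input's coordinate when
	# in1infty (%xmm5) is all-ones, then by the first input's coordinate when
	# in2infty (%xmm4) is all-ones, with no branches.  A hedged C sketch of
	# the selection (mask is all-zero or all-one bits; names illustrative):
	#
	#	static void copy_conditional(uint64_t dst[4], const uint64_t src[4],
	#	                             uint64_t mask)
	#	{
	#		for (int i = 0; i < 4; i++)
	#			dst[i] = (src[i] & mask) | (dst[i] & ~mask);
	#	}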
2071
2072 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
2073 movdqa %xmm5, %xmm1
2074 pandn $res_x(%rsp), %xmm0
2075 movdqa %xmm5, %xmm2
2076 pandn $res_x+0x10(%rsp), %xmm1
2077 movdqa %xmm5, %xmm3
2078 pand $in2_x(%rsp), %xmm2
2079 pand $in2_x+0x10(%rsp), %xmm3
2080 por %xmm0, %xmm2
2081 por %xmm1, %xmm3
2082
2083 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
2084 movdqa %xmm4, %xmm1
2085 pandn %xmm2, %xmm0
2086 movdqa %xmm4, %xmm2
2087 pandn %xmm3, %xmm1
2088 movdqa %xmm4, %xmm3
2089 pand $in1_x(%rsp), %xmm2
2090 pand $in1_x+0x10(%rsp), %xmm3
2091 por %xmm0, %xmm2
2092 por %xmm1, %xmm3
2093 movdqu %xmm2, 0x00($r_ptr)
2094 movdqu %xmm3, 0x10($r_ptr)
2095
2096 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
2097 movdqa %xmm5, %xmm1
2098 pandn $res_y(%rsp), %xmm0
2099 movdqa %xmm5, %xmm2
2100 pandn $res_y+0x10(%rsp), %xmm1
2101 movdqa %xmm5, %xmm3
2102 pand $in2_y(%rsp), %xmm2
2103 pand $in2_y+0x10(%rsp), %xmm3
2104 por %xmm0, %xmm2
2105 por %xmm1, %xmm3
2106
2107 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
2108 movdqa %xmm4, %xmm1
2109 pandn %xmm2, %xmm0
2110 movdqa %xmm4, %xmm2
2111 pandn %xmm3, %xmm1
2112 movdqa %xmm4, %xmm3
2113 pand $in1_y(%rsp), %xmm2
2114 pand $in1_y+0x10(%rsp), %xmm3
2115 por %xmm0, %xmm2
2116 por %xmm1, %xmm3
2117 movdqu %xmm2, 0x20($r_ptr)
2118 movdqu %xmm3, 0x30($r_ptr)
2119
2120.Ladd_done$x:
2121 add \$32*18+8, %rsp
2122 pop %r15
2123 pop %r14
2124 pop %r13
2125 pop %r12
2126 pop %rbx
2127 pop %rbp
2128 ret
2129.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
2130___
2131}
2132&gen_add("q");
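# For reference, the routine generated above computes the textbook Jacobian
# point addition that the p256_* comments trace, roughly (everything mod p,
# operands in the Montgomery domain; a sketch of the data flow, not of the
# exact instruction schedule):
#
#	Z2sqr = Z2^2;      Z1sqr = Z1^2;
#	U1 = X1*Z2sqr;     U2 = X2*Z1sqr;
#	S1 = Y1*Z2sqr*Z2;  S2 = Y2*Z1sqr*Z1;
#	H  = U2 - U1;      R  = S2 - S1;
#	# equal inputs are diverted to .Lpoint_double_shortcut$x instead
#	X3 = R^2 - H^3 - 2*U1*H^2;
#	Y3 = R*(U1*H^2 - X3) - S1*H^3;
#	Z3 = H*Z1*Z2;
#	# the copy_conditional tail then substitutes the other input whenever
#	# one of the two inputs was the point at infinity.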
2133
2134sub gen_add_affine () {
2135 my $x = shift;
2136 my ($src0,$sfx,$bias);
2137 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
2138 $res_x,$res_y,$res_z,
2139 $in1_x,$in1_y,$in1_z,
2140 $in2_x,$in2_y)=map(32*$_,(0..14));
2141 my $Z1sqr = $S2;
2142
2143 if ($x ne "x") {
2144 $src0 = "%rax";
2145 $sfx = "";
2146 $bias = 0;
2147
2148$code.=<<___;
2149.globl ecp_nistz256_point_add_affine
2150.type ecp_nistz256_point_add_affine,\@function,3
2151.align 32
2152ecp_nistz256_point_add_affine:
2153___
2154$code.=<<___ if ($addx);
2155 mov \$0x80100, %ecx
2156 and OPENSSL_ia32cap_P+8(%rip), %ecx
2157 cmp \$0x80100, %ecx
2158 je .Lpoint_add_affinex
2159___
2160 } else {
2161 $src0 = "%rdx";
2162 $sfx = "x";
2163 $bias = 128;
2164
2165$code.=<<___;
2166.type ecp_nistz256_point_add_affinex,\@function,3
2167.align 32
2168ecp_nistz256_point_add_affinex:
2169.Lpoint_add_affinex:
2170___
2171 }
2172$code.=<<___;
2173 push %rbp
2174 push %rbx
2175 push %r12
2176 push %r13
2177 push %r14
2178 push %r15
2179 sub \$32*15+8, %rsp
2180
2181 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
2182 mov $b_org, $b_ptr # reassign
2183 movdqu 0x10($a_ptr), %xmm1
2184 movdqu 0x20($a_ptr), %xmm2
2185 movdqu 0x30($a_ptr), %xmm3
2186 movdqu 0x40($a_ptr), %xmm4
2187 movdqu 0x50($a_ptr), %xmm5
2188 mov 0x40+8*0($a_ptr), $src0 # load original in1_z
2189 mov 0x40+8*1($a_ptr), $acc6
2190 mov 0x40+8*2($a_ptr), $acc7
2191 mov 0x40+8*3($a_ptr), $acc0
2192 movdqa %xmm0, $in1_x(%rsp)
2193 movdqa %xmm1, $in1_x+0x10(%rsp)
Adam Langleyfad63272015-11-12 12:15:39 -08002194 movdqa %xmm2, $in1_y(%rsp)
2195 movdqa %xmm3, $in1_y+0x10(%rsp)
Adam Langleyfad63272015-11-12 12:15:39 -08002196 movdqa %xmm4, $in1_z(%rsp)
2197 movdqa %xmm5, $in1_z+0x10(%rsp)
Steven Valdez909b19f2016-11-21 15:35:44 -05002198 por %xmm4, %xmm5
Adam Langleyfad63272015-11-12 12:15:39 -08002199
2200 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
Steven Valdez909b19f2016-11-21 15:35:44 -05002201 pshufd \$0xb1, %xmm5, %xmm3
Adam Langleyfad63272015-11-12 12:15:39 -08002202 movdqu 0x10($b_ptr), %xmm1
2203 movdqu 0x20($b_ptr), %xmm2
2204 por %xmm3, %xmm5
2205 movdqu 0x30($b_ptr), %xmm3
2206 movdqa %xmm0, $in2_x(%rsp)
2207 pshufd \$0x1e, %xmm5, %xmm4
2208 movdqa %xmm1, $in2_x+0x10(%rsp)
2209 por %xmm0, %xmm1
2210 movq $r_ptr, %xmm0 # save $r_ptr
2211 movdqa %xmm2, $in2_y(%rsp)
2212 movdqa %xmm3, $in2_y+0x10(%rsp)
2213 por %xmm2, %xmm3
2214 por %xmm4, %xmm5
2215 pxor %xmm4, %xmm4
2216 por %xmm1, %xmm3
2217
2218 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
2219 lea $Z1sqr(%rsp), $r_ptr # Z1^2
2220 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
2221
2222 pcmpeqd %xmm4, %xmm5
2223 pshufd \$0xb1, %xmm3, %xmm4
2224 mov 0x00($b_ptr), $src0 # $b_ptr is still valid
2225 #lea 0x00($b_ptr), $b_ptr
2226 mov $acc4, $acc1 # harmonize sqr output and mul input
2227 por %xmm3, %xmm4
2228 pshufd \$0, %xmm5, %xmm5 # in1infty
2229 pshufd \$0x1e, %xmm4, %xmm3
2230 mov $acc5, $acc2
2231 por %xmm3, %xmm4
2232 pxor %xmm3, %xmm3
2233 mov $acc6, $acc3
2234 pcmpeqd %xmm3, %xmm4
2235 pshufd \$0, %xmm4, %xmm4 # in2infty
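	# in1infty (%xmm5) and in2infty (%xmm4) are branch-free point-at-infinity
	# masks: the limbs that must be zero for the encoding (Z1 for the Jacobian
	# input, X2|Y2 for the affine input) are OR-ed together, folded down with
	# pshufd/por, and compared against zero with pcmpeqd, leaving either
	# all-zero or all-one bits.  Roughly, in C (an illustrative sketch):
	#
	#	static uint64_t is_zero_mask(const uint64_t *a, size_t n)
	#	{
	#		uint64_t acc = 0;
	#		for (size_t i = 0; i < n; i++)
	#			acc |= a[i];
	#		/* all-ones if acc == 0, all-zeros otherwise */
	#		return 0 - (uint64_t)(((acc | (0 - acc)) >> 63) ^ 1);
	#	}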
2236
2237 lea $Z1sqr-$bias(%rsp), $a_ptr
2238 mov $acc7, $acc4
2239 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
2240 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
2241
2242 lea $in1_x(%rsp), $b_ptr
2243 lea $H(%rsp), $r_ptr # H = U2 - U1
2244 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
2245
2246 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2247 lea $S2(%rsp), $r_ptr # S2 = Z1^3
2248 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
2249
2250 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2251 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
2252 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
2253
2254 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2255 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
2256 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
2257
2258 lea $in1_y(%rsp), $b_ptr
2259 lea $R(%rsp), $r_ptr # R = S2 - S1
2260 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
2261
2262 `&load_for_sqr("$H(%rsp)", "$src0")`
2263 lea $Hsqr(%rsp), $r_ptr # H^2
2264 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
2265
2266 `&load_for_sqr("$R(%rsp)", "$src0")`
2267 lea $Rsqr(%rsp), $r_ptr # R^2
2268 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
2269
2270 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
2271 lea $Hcub(%rsp), $r_ptr # H^3
2272 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
2273
2274 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2275 lea $U2(%rsp), $r_ptr # U1*H^2
2276 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
2277___
2278{
2279#######################################################################
2280# operate in 4-5-0-1 "name space" that matches multiplication output
2281#
2282my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2283my ($poly1, $poly3)=($acc6,$acc7);
2284
2285$code.=<<___;
2286 #lea $U2(%rsp), $a_ptr
2287 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
2288 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
2289
Steven Valdez909b19f2016-11-21 15:35:44 -05002290 xor $t4, $t4
Adam Langleyfad63272015-11-12 12:15:39 -08002291 add $acc0, $acc0 # a0:a3+a0:a3
2292 lea $Rsqr(%rsp), $a_ptr
2293 adc $acc1, $acc1
2294 mov $acc0, $t0
2295 adc $acc2, $acc2
2296 adc $acc3, $acc3
2297 mov $acc1, $t1
Steven Valdez909b19f2016-11-21 15:35:44 -05002298 adc \$0, $t4
Adam Langleyfad63272015-11-12 12:15:39 -08002299
2300 sub \$-1, $acc0
2301 mov $acc2, $t2
2302 sbb $poly1, $acc1
2303 sbb \$0, $acc2
2304 mov $acc3, $t3
2305 sbb $poly3, $acc3
Steven Valdez909b19f2016-11-21 15:35:44 -05002306 sbb \$0, $t4
Adam Langleyfad63272015-11-12 12:15:39 -08002307
Steven Valdez909b19f2016-11-21 15:35:44 -05002308 cmovc $t0, $acc0
Adam Langleyfad63272015-11-12 12:15:39 -08002309 mov 8*0($a_ptr), $t0
Steven Valdez909b19f2016-11-21 15:35:44 -05002310 cmovc $t1, $acc1
Adam Langleyfad63272015-11-12 12:15:39 -08002311 mov 8*1($a_ptr), $t1
Steven Valdez909b19f2016-11-21 15:35:44 -05002312 cmovc $t2, $acc2
Adam Langleyfad63272015-11-12 12:15:39 -08002313 mov 8*2($a_ptr), $t2
Steven Valdez909b19f2016-11-21 15:35:44 -05002314 cmovc $t3, $acc3
Adam Langleyfad63272015-11-12 12:15:39 -08002315 mov 8*3($a_ptr), $t3
2316
2317 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
2318
2319 lea $Hcub(%rsp), $b_ptr
2320 lea $res_x(%rsp), $r_ptr
2321 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
2322
2323 mov $U2+8*0(%rsp), $t0
2324 mov $U2+8*1(%rsp), $t1
2325 mov $U2+8*2(%rsp), $t2
2326 mov $U2+8*3(%rsp), $t3
2327 lea $H(%rsp), $r_ptr
2328
2329 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
2330
2331 mov $acc0, 8*0($r_ptr) # save the result, as
2332	mov	$acc1, 8*1($r_ptr)	# __ecp_nistz256_sub doesn't store it
2333 mov $acc2, 8*2($r_ptr)
2334 mov $acc3, 8*3($r_ptr)
2335___
2336}
2337$code.=<<___;
2338 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
2339 lea $S2(%rsp), $r_ptr
2340 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
2341
2342 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
2343 lea $H(%rsp), $r_ptr
2344 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
2345
2346 lea $S2(%rsp), $b_ptr
2347 lea $res_y(%rsp), $r_ptr
2348 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
2349
2350 movq %xmm0, $r_ptr # restore $r_ptr
2351
2352 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
2353 movdqa %xmm5, %xmm1
2354 pandn $res_z(%rsp), %xmm0
2355 movdqa %xmm5, %xmm2
2356 pandn $res_z+0x10(%rsp), %xmm1
2357 movdqa %xmm5, %xmm3
2358 pand .LONE_mont(%rip), %xmm2
2359 pand .LONE_mont+0x10(%rip), %xmm3
2360 por %xmm0, %xmm2
2361 por %xmm1, %xmm3
2362
2363 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
2364 movdqa %xmm4, %xmm1
2365 pandn %xmm2, %xmm0
2366 movdqa %xmm4, %xmm2
2367 pandn %xmm3, %xmm1
2368 movdqa %xmm4, %xmm3
2369 pand $in1_z(%rsp), %xmm2
2370 pand $in1_z+0x10(%rsp), %xmm3
2371 por %xmm0, %xmm2
2372 por %xmm1, %xmm3
2373 movdqu %xmm2, 0x40($r_ptr)
2374 movdqu %xmm3, 0x50($r_ptr)
2375
2376 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
2377 movdqa %xmm5, %xmm1
2378 pandn $res_x(%rsp), %xmm0
2379 movdqa %xmm5, %xmm2
2380 pandn $res_x+0x10(%rsp), %xmm1
2381 movdqa %xmm5, %xmm3
2382 pand $in2_x(%rsp), %xmm2
2383 pand $in2_x+0x10(%rsp), %xmm3
2384 por %xmm0, %xmm2
2385 por %xmm1, %xmm3
2386
2387 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
2388 movdqa %xmm4, %xmm1
2389 pandn %xmm2, %xmm0
2390 movdqa %xmm4, %xmm2
2391 pandn %xmm3, %xmm1
2392 movdqa %xmm4, %xmm3
2393 pand $in1_x(%rsp), %xmm2
2394 pand $in1_x+0x10(%rsp), %xmm3
2395 por %xmm0, %xmm2
2396 por %xmm1, %xmm3
2397 movdqu %xmm2, 0x00($r_ptr)
2398 movdqu %xmm3, 0x10($r_ptr)
2399
2400 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
2401 movdqa %xmm5, %xmm1
2402 pandn $res_y(%rsp), %xmm0
2403 movdqa %xmm5, %xmm2
2404 pandn $res_y+0x10(%rsp), %xmm1
2405 movdqa %xmm5, %xmm3
2406 pand $in2_y(%rsp), %xmm2
2407 pand $in2_y+0x10(%rsp), %xmm3
2408 por %xmm0, %xmm2
2409 por %xmm1, %xmm3
2410
2411 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
2412 movdqa %xmm4, %xmm1
2413 pandn %xmm2, %xmm0
2414 movdqa %xmm4, %xmm2
2415 pandn %xmm3, %xmm1
2416 movdqa %xmm4, %xmm3
2417 pand $in1_y(%rsp), %xmm2
2418 pand $in1_y+0x10(%rsp), %xmm3
2419 por %xmm0, %xmm2
2420 por %xmm1, %xmm3
2421 movdqu %xmm2, 0x20($r_ptr)
2422 movdqu %xmm3, 0x30($r_ptr)
2423
2424 add \$32*15+8, %rsp
2425 pop %r15
2426 pop %r14
2427 pop %r13
2428 pop %r12
2429 pop %rbx
2430 pop %rbp
2431 ret
2432.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
2433___
2434}
2435&gen_add_affine("q");
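# gen_add_affine() is the same addition specialised for an affine second
# point (Z2 == 1), which drops every multiplication involving Z2.  Roughly
# (mod p, Montgomery domain; a sketch of what the comments above trace,
# not the exact schedule):
#
#	Z1sqr = Z1^2;
#	U2 = X2*Z1sqr;     H = U2 - X1;
#	S2 = Y2*Z1sqr*Z1;  R = S2 - Y1;
#	X3 = R^2 - H^3 - 2*X1*H^2;
#	Y3 = R*(X1*H^2 - X3) - Y1*H^3;
#	Z3 = H*Z1;
#	# if the Jacobian input was infinity the result becomes
#	# (X2, Y2, .LONE_mont); if the affine input was, it stays
#	# (X1, Y1, Z1) -- both handled by the copy_conditional tail.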
2436
2437########################################################################
2438# AD*X magic
2439#
2440if ($addx) { {
2441########################################################################
2442# operate in 4-5-0-1 "name space" that matches multiplication output
2443#
2444my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2445
2446$code.=<<___;
2447.type __ecp_nistz256_add_tox,\@abi-omnipotent
2448.align 32
2449__ecp_nistz256_add_tox:
2450 xor $t4, $t4
2451 adc 8*0($b_ptr), $a0
2452 adc 8*1($b_ptr), $a1
2453 mov $a0, $t0
2454 adc 8*2($b_ptr), $a2
2455 adc 8*3($b_ptr), $a3
2456 mov $a1, $t1
2457 adc \$0, $t4
2458
2459 xor $t3, $t3
2460 sbb \$-1, $a0
2461 mov $a2, $t2
2462 sbb $poly1, $a1
2463 sbb \$0, $a2
2464 mov $a3, $t3
2465 sbb $poly3, $a3
Steven Valdez909b19f2016-11-21 15:35:44 -05002466 sbb \$0, $t4
Adam Langleyfad63272015-11-12 12:15:39 -08002467
Steven Valdez909b19f2016-11-21 15:35:44 -05002468 cmovc $t0, $a0
2469 cmovc $t1, $a1
Adam Langleyfad63272015-11-12 12:15:39 -08002470 mov $a0, 8*0($r_ptr)
Steven Valdez909b19f2016-11-21 15:35:44 -05002471 cmovc $t2, $a2
Adam Langleyfad63272015-11-12 12:15:39 -08002472 mov $a1, 8*1($r_ptr)
Steven Valdez909b19f2016-11-21 15:35:44 -05002473 cmovc $t3, $a3
Adam Langleyfad63272015-11-12 12:15:39 -08002474 mov $a2, 8*2($r_ptr)
2475 mov $a3, 8*3($r_ptr)
2476
2477 ret
2478.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
2479
2480.type __ecp_nistz256_sub_fromx,\@abi-omnipotent
2481.align 32
2482__ecp_nistz256_sub_fromx:
2483 xor $t4, $t4
2484 sbb 8*0($b_ptr), $a0
2485 sbb 8*1($b_ptr), $a1
2486 mov $a0, $t0
2487 sbb 8*2($b_ptr), $a2
2488 sbb 8*3($b_ptr), $a3
2489 mov $a1, $t1
2490 sbb \$0, $t4
2491
2492 xor $t3, $t3
2493 adc \$-1, $a0
2494 mov $a2, $t2
2495 adc $poly1, $a1
2496 adc \$0, $a2
2497 mov $a3, $t3
2498 adc $poly3, $a3
2499
2500 bt \$0, $t4
2501 cmovnc $t0, $a0
2502 cmovnc $t1, $a1
2503 mov $a0, 8*0($r_ptr)
2504 cmovnc $t2, $a2
2505 mov $a1, 8*1($r_ptr)
2506 cmovnc $t3, $a3
2507 mov $a2, 8*2($r_ptr)
2508 mov $a3, 8*3($r_ptr)
2509
2510 ret
2511.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
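
# __ecp_nistz256_sub_fromx (like its non-ADX counterpart) computes
# a - b mod p by subtracting with borrow and then adding the modulus back
# in exactly when the subtraction underflowed.  A hedged C sketch of the
# same idea (illustrative only; P256[] as in the doubling sketch earlier,
# unsigned __int128 assumed):
#
#	static void p256_sub(uint64_t r[4], const uint64_t a[4],
#	                     const uint64_t b[4])
#	{
#		uint64_t borrow = 0, carry = 0, mask;
#		for (int i = 0; i < 4; i++) {		/* r = a - b */
#			unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
#			r[i]   = (uint64_t)t;
#			borrow = (uint64_t)(t >> 64) & 1;
#		}
#		mask = 0 - borrow;			/* all-ones iff a < b */
#		for (int i = 0; i < 4; i++) {		/* r += p & mask */
#			unsigned __int128 t = (unsigned __int128)r[i] + (P256[i] & mask) + carry;
#			r[i]  = (uint64_t)t;
#			carry = (uint64_t)(t >> 64);
#		}
#	}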
2512
2513.type __ecp_nistz256_subx,\@abi-omnipotent
2514.align 32
2515__ecp_nistz256_subx:
2516 xor $t4, $t4
2517 sbb $a0, $t0
2518 sbb $a1, $t1
2519 mov $t0, $a0
2520 sbb $a2, $t2
2521 sbb $a3, $t3
2522 mov $t1, $a1
2523 sbb \$0, $t4
2524
2525	xor	$a3, $a3
2526 adc \$-1, $t0
2527 mov $t2, $a2
2528 adc $poly1, $t1
2529 adc \$0, $t2
2530 mov $t3, $a3
2531 adc $poly3, $t3
2532
2533 bt \$0, $t4
2534 cmovc $t0, $a0
2535 cmovc $t1, $a1
2536 cmovc $t2, $a2
2537 cmovc $t3, $a3
2538
2539 ret
2540.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
2541
2542.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
2543.align 32
2544__ecp_nistz256_mul_by_2x:
2545 xor $t4, $t4
2546 adc $a0, $a0 # a0:a3+a0:a3
2547 adc $a1, $a1
2548 mov $a0, $t0
2549 adc $a2, $a2
2550 adc $a3, $a3
2551 mov $a1, $t1
2552 adc \$0, $t4
2553
2554 xor $t3, $t3
2555 sbb \$-1, $a0
2556 mov $a2, $t2
2557 sbb $poly1, $a1
2558 sbb \$0, $a2
2559 mov $a3, $t3
2560 sbb $poly3, $a3
Steven Valdez909b19f2016-11-21 15:35:44 -05002561 sbb \$0, $t4
Adam Langleyfad63272015-11-12 12:15:39 -08002562
Steven Valdez909b19f2016-11-21 15:35:44 -05002563 cmovc $t0, $a0
2564 cmovc $t1, $a1
Adam Langleyfad63272015-11-12 12:15:39 -08002565 mov $a0, 8*0($r_ptr)
Steven Valdez909b19f2016-11-21 15:35:44 -05002566 cmovc $t2, $a2
Adam Langleyfad63272015-11-12 12:15:39 -08002567 mov $a1, 8*1($r_ptr)
Steven Valdez909b19f2016-11-21 15:35:44 -05002568 cmovc $t3, $a3
Adam Langleyfad63272015-11-12 12:15:39 -08002569 mov $a2, 8*2($r_ptr)
2570 mov $a3, 8*3($r_ptr)
2571
2572 ret
2573.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
2574___
2575 }
2576&gen_double("x");
2577&gen_add("x");
2578&gen_add_affine("x");
2579}
2580}}}
2581
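# The substitution below expands every `...` fragment that was embedded in
# $code (for example the `&load_for_mul(...)` calls and the computed stack
# adjustment `32*(18-5)` above) by eval-ing it as Perl and splicing the
# returned text back in, so the printed output is plain assembly with no
# backticks left.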
2582$code =~ s/\`([^\`]*)\`/eval $1/gem;
2583print $code;
2584close STDOUT;