blob: f7875772ad243e22ab2647b076f1bc83d7eb8801 [file] [log] [blame]
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

/*
 * Constant pool for the NIST P-256 (nistz256) x86-64 implementation.
 * Generated code keeps these in .text and addresses them RIP-relative.
 */

.p2align 6
L$poly:
/* The P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, four little-endian 64-bit limbs. */
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

/* Broadcast dword constants used by the SSE/AVX2 table-select routines below. */
L$One:
.long 1,1,1,1,1,1,1,1
L$Two:
.long 2,2,2,2,2,2,2,2
L$Three:
.long 3,3,3,3,3,3,3,3
L$ONE_mont:
/* 1 in Montgomery form, i.e. 2^256 mod p (not referenced in this chunk). */
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
Adam Langleyfad63272015-11-12 12:15:39 -080019
Adam Langleyfad63272015-11-12 12:15:39 -080020
.globl _ecp_nistz256_neg
.private_extern _ecp_nistz256_neg

/*
 * void ecp_nistz256_neg(uint64_t res[4], const uint64_t a[4]);
 * SysV AMD64: rdi = res, rsi = a.
 * res = (-a) mod p, constant time (cmov selection, no data-dependent branches).
 * Clobbers: rax, rcx, rdx, r8-r11 (r12, r13 saved/restored).
 */
.p2align 5
_ecp_nistz256_neg:
	pushq	%r12
	pushq	%r13

	xorq	%r8,%r8			/* acc = 0 */
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r13,%r13		/* r13 = borrow flag of 0 - a */

	subq	0(%rsi),%r8		/* acc = 0 - a (256-bit) */
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r8,%rax		/* stash pre-correction limbs */
	sbbq	24(%rsi),%r11
	leaq	L$poly(%rip),%rsi
	movq	%r9,%rdx
	sbbq	$0,%r13			/* r13 = -1 iff a != 0 (borrow out) */

	addq	0(%rsi),%r8		/* acc += p */
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13		/* no borrow => a was 0: keep 0 - a = 0 */

	cmovzq	%rax,%r8
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		/* rep ret */
64
65
66
67
68
Adam Langleyfad63272015-11-12 12:15:39 -080069
70
.globl _ecp_nistz256_mul_mont
.private_extern _ecp_nistz256_mul_mont

/*
 * void ecp_nistz256_mul_mont(uint64_t res[4], const uint64_t a[4],
 *                            const uint64_t b[4]);
 * SysV AMD64: rdi = res, rsi = a, rdx = b.
 * Montgomery multiplication mod p: res = a * b * 2^-256 mod p.
 * Thin wrapper: loads b[0] and a[0..3] into the registers the worker
 * expects (rax, r9-r12; rbx = b) and tail-calls __ecp_nistz256_mul_montq.
 */
.p2align 5
_ecp_nistz256_mul_mont:
L$mul_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rdx,%rbx		/* rbx = b (worker indexes b via rbx) */
	movq	0(%rdx),%rax		/* rax = b[0] */
	movq	0(%rsi),%r9		/* r9..r12 = a[0..3] */
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12

	call	__ecp_nistz256_mul_montq
L$mul_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
99
100
101
/*
 * Worker: Montgomery multiplication modulo the P-256 prime.
 * In:  rax = b[0], rbx = b, rsi = a, r9-r12 = a[0..3], rdi = res.
 * Out: res[0..3] and r12,r13,r8,r9 = a*b*2^-256 mod p.
 * Interleaves four multiply-accumulate rounds (one per limb of b) with
 * four reduction steps.  Reduction exploits the special form of p:
 * since p = 2^256 - 2^224 + 2^192 + 2^96 - 1, adding acc0*p amounts to
 * (acc0<<32 / acc0>>32 shuffles) plus one mulq by L$poly[3] (r15).
 * Constant time; clobbers rax, rbp, rcx, rdx, r8-r15.
 */
.p2align 5
__ecp_nistz256_mul_montq:

	/* round 0: acc = a * b[0] */
	movq	%rax,%rbp		/* rbp = b[0], reloaded into rax each mulq */
	mulq	%r9
	movq	L$poly+8(%rip),%r14	/* r14 = p[1], used by final subtraction */
	movq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%r9

	mulq	%r10
	movq	L$poly+24(%rip),%r15	/* r15 = p[3], used by every reduction step */
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%r11
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r12
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	xorq	%r13,%r13
	movq	%rdx,%r12

	/* reduction step 0: fold acc0 (r8) back into the accumulator */
	movq	%r8,%rbp
	shlq	$32,%r8
	mulq	%r15			/* acc0 * p[3] */
	shrq	$32,%rbp
	addq	%r8,%r9
	adcq	%rbp,%r10
	adcq	%rax,%r11
	movq	8(%rbx),%rax		/* preload b[1] */
	adcq	%rdx,%r12
	adcq	$0,%r13
	xorq	%r8,%r8

	/* round 1: acc += a * b[1] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8

	/* reduction step 1 */
	movq	%r9,%rbp
	shlq	$32,%r9
	mulq	%r15
	shrq	$32,%rbp
	addq	%r9,%r10
	adcq	%rbp,%r11
	adcq	%rax,%r12
	movq	16(%rbx),%rax		/* preload b[2] */
	adcq	%rdx,%r13
	adcq	$0,%r8
	xorq	%r9,%r9

	/* round 2: acc += a * b[2] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9

	/* reduction step 2 */
	movq	%r10,%rbp
	shlq	$32,%r10
	mulq	%r15
	shrq	$32,%rbp
	addq	%r10,%r11
	adcq	%rbp,%r12
	adcq	%rax,%r13
	movq	24(%rbx),%rax		/* preload b[3] */
	adcq	%rdx,%r8
	adcq	$0,%r9
	xorq	%r10,%r10

	/* round 3: acc += a * b[3] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10

	/* reduction step 3 */
	movq	%r11,%rbp
	shlq	$32,%r11
	mulq	%r15
	shrq	$32,%rbp
	addq	%r11,%r12
	adcq	%rbp,%r13
	movq	%r12,%rcx		/* keep unreduced copy for the cmov below */
	adcq	%rax,%r8
	adcq	%rdx,%r9
	movq	%r13,%rbp
	adcq	$0,%r10

	/* final conditional subtraction: if acc >= p then acc -= p */
	subq	$-1,%r12		/* p[0] = 2^64 - 1 */
	movq	%r8,%rbx
	sbbq	%r14,%r13		/* p[1] */
	sbbq	$0,%r8			/* p[2] = 0 */
	movq	%r9,%rdx
	sbbq	%r15,%r9		/* p[3] */
	sbbq	$0,%r10			/* borrow => acc < p: keep original */

	cmovcq	%rcx,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rbx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rdx,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
315
316
317
318
319
320
321
322
323
.globl _ecp_nistz256_sqr_mont
.private_extern _ecp_nistz256_sqr_mont

/*
 * void ecp_nistz256_sqr_mont(uint64_t res[4], const uint64_t a[4]);
 * SysV AMD64: rdi = res, rsi = a.
 * Montgomery squaring mod p: res = a * a * 2^-256 mod p.
 * Wrapper: loads a[0..3] into rax, r14, r15, r8 as the worker expects,
 * then calls __ecp_nistz256_sqr_montq.
 */
.p2align 5
_ecp_nistz256_sqr_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	0(%rsi),%rax
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8

	call	__ecp_nistz256_sqr_montq
L$sqr_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
349
350
351
/*
 * Worker: Montgomery squaring modulo the P-256 prime.
 * In:  rsi = a, rax/r14/r15/r8 = a[0..3], rdi = res.
 * Out: res[0..3] and r12-r15 = a^2 * 2^-256 mod p.
 * Strategy: compute the off-diagonal products once, double them, add the
 * diagonal squares, then perform four shift-and-multiply reduction steps
 * (same p-specific trick as __ecp_nistz256_mul_montq) and one final
 * conditional subtraction.  Constant time.
 */
.p2align 5
__ecp_nistz256_sqr_montq:
	/* off-diagonal products a[i]*a[j], i<j */
	movq	%rax,%r13		/* r13 = a[0] */
	mulq	%r14			/* a[0]*a[1] */
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10

	mulq	%r13			/* a[0]*a[2] */
	addq	%rax,%r10
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r13			/* a[0]*a[3] */
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12

	mulq	%r14			/* a[1]*a[2] */
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%r14			/* a[1]*a[3] */
	addq	%rax,%r12
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r15			/* a[2]*a[3] */
	xorq	%r15,%r15
	addq	%rax,%r13
	movq	0(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	/* double the off-diagonal part */
	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15

	/* add the diagonal squares a[i]^2 */
	mulq	%rax
	movq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r9
	adcq	%rax,%r10
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r11
	adcq	%rax,%r12
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r13
	adcq	%rax,%r14
	movq	%r8,%rax
	adcq	%rdx,%r15

	movq	L$poly+8(%rip),%rsi	/* rsi = p[1] (note: overwrites a pointer) */
	movq	L$poly+24(%rip),%rbp	/* rbp = p[3] */

	/* reduction step 0 */
	movq	%r8,%rcx
	shlq	$32,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	/* reduction step 1 */
	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	/* reduction step 2 */
	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	/* reduction step 3 */
	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	adcq	%rax,%r10
	adcq	$0,%rdx
	xorq	%r11,%r11

	/* add reduced low half into the high half of the square */
	addq	%r8,%r12
	adcq	%r9,%r13
	movq	%r12,%r8		/* keep unreduced copy for cmov */
	adcq	%r10,%r14
	adcq	%rdx,%r15
	movq	%r13,%r9
	adcq	$0,%r11

	/* final conditional subtraction of p */
	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13
	sbbq	$0,%r14
	movq	%r15,%rcx
	sbbq	%rbp,%r15
	sbbq	$0,%r11

	cmovcq	%r8,%r12
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%rcx,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
511
512
513
.globl _ecp_nistz256_select_w5
.private_extern _ecp_nistz256_select_w5

/*
 * void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT *in_t,
 *                             int index);
 * SysV AMD64: rdi = val, rsi = in_t, edx = index.
 * Constant-time select of entry `index` (1-based; 0 selects nothing and
 * yields all-zero) from a table of 16 96-byte points: every entry is
 * scanned and masked with pcmpeqd so memory access pattern is independent
 * of the secret index.  Dispatches to the AVX2 variant when bit 5 of
 * OPENSSL_ia32cap_P[2] (AVX2) is set.
 */
.p2align 5
_ecp_nistz256_select_w5:
	leaq	_OPENSSL_ia32cap_P(%rip),%rax
	movq	8(%rax),%rax
	testl	$32,%eax		/* AVX2 available? */
	jnz	L$avx2_select_w5
	movdqa	L$One(%rip),%xmm0
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		/* xmm2..xmm7 = accumulated point (X,Y,Z) */
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7

	movdqa	%xmm0,%xmm8		/* xmm8 = running counter 1..16 */
	pshufd	$0,%xmm1,%xmm1		/* broadcast index */

	movq	$16,%rax
L$select_loop_sse_w5:

	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	pcmpeqd	%xmm1,%xmm15		/* mask = (counter == index) ? ~0 : 0 */

	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	movdqa	64(%rsi),%xmm13
	movdqa	80(%rsi),%xmm14
	leaq	96(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	pand	%xmm15,%xmm13
	por	%xmm12,%xmm5
	pand	%xmm15,%xmm14
	por	%xmm13,%xmm6
	por	%xmm14,%xmm7

	decq	%rax
	jnz	L$select_loop_sse_w5

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm7,80(%rdi)
	.byte	0xf3,0xc3		/* rep ret */
574
575
576
577
.globl _ecp_nistz256_select_w7
.private_extern _ecp_nistz256_select_w7

/*
 * void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
 *                             const P256_POINT_AFFINE *in_t, int index);
 * SysV AMD64: rdi = val, rsi = in_t, edx = index.
 * Constant-time select from a table of 64 affine points (64 bytes each),
 * same masking scheme as select_w5.  Dispatches to the AVX2 variant when
 * the AVX2 capability bit is set.
 */
.p2align 5
_ecp_nistz256_select_w7:
	leaq	_OPENSSL_ia32cap_P(%rip),%rax
	movq	8(%rax),%rax
	testl	$32,%eax		/* AVX2 available? */
	jnz	L$avx2_select_w7
	movdqa	L$One(%rip),%xmm8
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		/* xmm2..xmm5 = accumulated point (X,Y) */
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	movdqa	%xmm8,%xmm0		/* xmm0 = increment (1), xmm8 = counter */
	pshufd	$0,%xmm1,%xmm1		/* broadcast index */
	movq	$64,%rax

L$select_loop_sse_w7:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	pcmpeqd	%xmm1,%xmm15		/* mask = (counter == index) */
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	leaq	64(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	prefetcht0	255(%rsi)
	por	%xmm12,%xmm5

	decq	%rax
	jnz	L$select_loop_sse_w7

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	.byte	0xf3,0xc3		/* rep ret */
627
Robert Sloan8f860b12017-08-28 07:37:06 -0700628
629
630
/*
 * AVX2 variant of the w=5 constant-time table select.
 * Local symbol; reached only via the jnz in _ecp_nistz256_select_w5,
 * so it shares that function's contract: rdi = val, rsi = in_t (16
 * points), edx = index.  Processes two table entries per iteration
 * (counters in ymm5 and ymm10, both stepped by 2).
 */
.p2align 5
ecp_nistz256_avx2_select_w5:
L$avx2_select_w5:
	vzeroupper
	vmovdqa	L$Two(%rip),%ymm0	/* per-iteration counter increment = 2 */

	vpxor	%ymm2,%ymm2,%ymm2	/* ymm2..ymm4 = accumulated 96-byte point */
	vpxor	%ymm3,%ymm3,%ymm3
	vpxor	%ymm4,%ymm4,%ymm4

	vmovdqa	L$One(%rip),%ymm5	/* counter for even entries: 1,3,5,... */
	vmovdqa	L$Two(%rip),%ymm10	/* counter for odd entries: 2,4,6,... */

	vmovd	%edx,%xmm1
	vpermd	%ymm1,%ymm2,%ymm1	/* broadcast index across lanes */

	movq	$8,%rax
L$select_loop_avx2_w5:

	vmovdqa	0(%rsi),%ymm6
	vmovdqa	32(%rsi),%ymm7
	vmovdqa	64(%rsi),%ymm8

	vmovdqa	96(%rsi),%ymm11
	vmovdqa	128(%rsi),%ymm12
	vmovdqa	160(%rsi),%ymm13

	vpcmpeqd	%ymm1,%ymm5,%ymm9	/* mask for first entry */
	vpcmpeqd	%ymm1,%ymm10,%ymm14	/* mask for second entry */

	vpaddd	%ymm0,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm10,%ymm10
	leaq	192(%rsi),%rsi

	vpand	%ymm9,%ymm6,%ymm6
	vpand	%ymm9,%ymm7,%ymm7
	vpand	%ymm9,%ymm8,%ymm8
	vpand	%ymm14,%ymm11,%ymm11
	vpand	%ymm14,%ymm12,%ymm12
	vpand	%ymm14,%ymm13,%ymm13

	vpxor	%ymm6,%ymm2,%ymm2
	vpxor	%ymm7,%ymm3,%ymm3
	vpxor	%ymm8,%ymm4,%ymm4
	vpxor	%ymm11,%ymm2,%ymm2
	vpxor	%ymm12,%ymm3,%ymm3
	vpxor	%ymm13,%ymm4,%ymm4

	decq	%rax
	jnz	L$select_loop_avx2_w5

	vmovdqu	%ymm2,0(%rdi)
	vmovdqu	%ymm3,32(%rdi)
	vmovdqu	%ymm4,64(%rdi)
	vzeroupper			/* restore SSE state before returning to C */
	.byte	0xf3,0xc3		/* rep ret */
687
688
689
690
.globl _ecp_nistz256_avx2_select_w7
.private_extern _ecp_nistz256_avx2_select_w7

/*
 * void ecp_nistz256_avx2_select_w7(P256_POINT_AFFINE *val,
 *                                  const P256_POINT_AFFINE *in_t, int index);
 * AVX2 variant of the w=7 constant-time table select (64 affine points).
 * Processes three 64-byte entries per iteration (counters ymm4/ymm8/ymm12,
 * each stepped by 3): 21 iterations cover 63 entries, the 64th is handled
 * by the straight-line tail after the loop.
 */
.p2align 5
_ecp_nistz256_avx2_select_w7:
L$avx2_select_w7:
	vzeroupper
	vmovdqa	L$Three(%rip),%ymm0	/* per-iteration counter increment = 3 */

	vpxor	%ymm2,%ymm2,%ymm2	/* ymm2,ymm3 = accumulated 64-byte point */
	vpxor	%ymm3,%ymm3,%ymm3

	vmovdqa	L$One(%rip),%ymm4	/* counters 1,4,7,... / 2,5,8,... / 3,6,9,... */
	vmovdqa	L$Two(%rip),%ymm8
	vmovdqa	L$Three(%rip),%ymm12

	vmovd	%edx,%xmm1
	vpermd	%ymm1,%ymm2,%ymm1	/* broadcast index across lanes */


	movq	$21,%rax
L$select_loop_avx2_w7:

	vmovdqa	0(%rsi),%ymm5
	vmovdqa	32(%rsi),%ymm6

	vmovdqa	64(%rsi),%ymm9
	vmovdqa	96(%rsi),%ymm10

	vmovdqa	128(%rsi),%ymm13
	vmovdqa	160(%rsi),%ymm14

	vpcmpeqd	%ymm1,%ymm4,%ymm7
	vpcmpeqd	%ymm1,%ymm8,%ymm11
	vpcmpeqd	%ymm1,%ymm12,%ymm15

	vpaddd	%ymm0,%ymm4,%ymm4
	vpaddd	%ymm0,%ymm8,%ymm8
	vpaddd	%ymm0,%ymm12,%ymm12
	leaq	192(%rsi),%rsi

	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6
	vpand	%ymm11,%ymm9,%ymm9
	vpand	%ymm11,%ymm10,%ymm10
	vpand	%ymm15,%ymm13,%ymm13
	vpand	%ymm15,%ymm14,%ymm14

	vpxor	%ymm5,%ymm2,%ymm2
	vpxor	%ymm6,%ymm3,%ymm3
	vpxor	%ymm9,%ymm2,%ymm2
	vpxor	%ymm10,%ymm3,%ymm3
	vpxor	%ymm13,%ymm2,%ymm2
	vpxor	%ymm14,%ymm3,%ymm3

	decq	%rax
	jnz	L$select_loop_avx2_w7

	/* tail: entry 64 (counter ymm4 has advanced past the loop) */
	vmovdqa	0(%rsi),%ymm5
	vmovdqa	32(%rsi),%ymm6

	vpcmpeqd	%ymm1,%ymm4,%ymm7

	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6

	vpxor	%ymm5,%ymm2,%ymm2
	vpxor	%ymm6,%ymm3,%ymm3

	vmovdqu	%ymm2,0(%rdi)
	vmovdqu	%ymm3,32(%rdi)
	vzeroupper			/* restore SSE state before returning to C */
	.byte	0xf3,0xc3		/* rep ret */
765
766
/*
 * Worker: modular addition.
 * In:  r12,r13,r8,r9 = a, rbx = b, rdi = res, r14 = p[1], r15 = p[3].
 * Out: res[0..3] and r12,r13,r8,r9 = (a + b) mod p.
 * Add, then conditionally subtract p (cmovc keeps the already-reduced
 * value when the subtraction borrows).  Constant time.
 */
.p2align 5
__ecp_nistz256_add_toq:
	xorq	%r11,%r11		/* r11 = carry out of the 256-bit add */
	addq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax		/* keep pre-subtraction copy */
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		/* subtract p (p[0]=2^64-1, p[2]=0) */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11			/* borrow => sum < p: keep original */

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
796
797
798
/*
 * Worker: modular subtraction.
 * In:  r12,r13,r8,r9 = a, rbx = b, rdi = res, r14 = p[1], r15 = p[3].
 * Out: res[0..3] and r12,r13,r8,r9 = (a - b) mod p.
 * Subtract, then conditionally add p back when the subtraction borrowed
 * (r11 is the all-ones/zero borrow mask from sbbq r11,r11).  Constant time.
 */
.p2align 5
__ecp_nistz256_sub_fromq:
	subq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax		/* keep pre-correction copy */
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		/* r11 = -borrow */

	addq	$-1,%r12		/* add p back */
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9
	testq	%r11,%r11		/* no borrow => keep the plain difference */

	cmovzq	%rax,%r12
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
827
828
829
/*
 * Worker: modular subtraction, register-only variant.
 * In:  rax,rbp,rcx,r10 = a; r12,r13,r8,r9 = b; r14 = p[1], r15 = p[3].
 * Out: r12,r13,r8,r9 = (a - b) mod p.  Nothing is stored to memory;
 * callers write the result themselves.  Constant time.
 */
.p2align 5
__ecp_nistz256_subq:
	subq	%r12,%rax
	sbbq	%r13,%rbp
	movq	%rax,%r12		/* default result: plain difference */
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	%r11,%r11		/* r11 = -borrow */

	addq	$-1,%rax		/* corrected result: difference + p */
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10
	testq	%r11,%r11		/* borrowed => take the +p version */

	cmovnzq	%rax,%r12
	cmovnzq	%rbp,%r13
	cmovnzq	%rcx,%r8
	cmovnzq	%r10,%r9

	.byte	0xf3,0xc3		/* rep ret */
854
855
856
/*
 * Worker: modular doubling.
 * In:  r12,r13,r8,r9 = a, rdi = res, r14 = p[1], r15 = p[3].
 * Out: res[0..3] and r12,r13,r8,r9 = (2*a) mod p.
 * Double in place, then conditional subtraction of p, as in add_toq.
 * Constant time.
 */
.p2align 5
__ecp_nistz256_mul_by_2q:
	xorq	%r11,%r11		/* r11 = carry out of the doubling */
	addq	%r12,%r12
	adcq	%r13,%r13
	movq	%r12,%rax		/* keep pre-subtraction copy */
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		/* subtract p */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11			/* borrow => 2a < p: keep original */

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
886
.globl _ecp_nistz256_point_double
.private_extern _ecp_nistz256_point_double

/*
 * void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
 * SysV AMD64: rdi = r, rsi = a.  Points are Jacobian (X,Y,Z), 32 bytes
 * per coordinate, all values in Montgomery form.
 * Doubles the point using the worker helpers above; 168 bytes of stack
 * hold the intermediates (slots at 0/32/64/96/128(%rsp) — the exact
 * temp-naming follows the upstream perlasm; labels here are a best-effort
 * reading and should be confirmed against the generator).
 * L$point_double_shortcutq is entered directly by point_add when it
 * detects P == Q.
 * NOTE(review): the .byte sequences are movq GPR<->XMM transfers encoded
 * by the generator; decodes are noted inline.
 */
.p2align 5
_ecp_nistz256_point_double:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$160+8,%rsp

L$point_double_shortcutq:
	movdqu	0(%rsi),%xmm0		/* in_x */
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12		/* r12..r9 = in_y */
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	L$poly+8(%rip),%r14	/* p[1], p[3] for the workers */
	movq	L$poly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)		/* stash in_x at 96(%rsp) */
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10
	leaq	64(%rdi),%r11
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — save res ptr */
.byte	102,73,15,110,202		/* movq %r10,%xmm1 — save &res->Y */
.byte	102,73,15,110,211		/* movq %r11,%xmm2 — save &res->Z */

	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q	/* S = 2*in_y */

	movq	64+0(%rsi),%rax
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-0(%rsi),%rsi
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq	/* Zsqr = in_z^2 */

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq	/* S = S^2 */

	movq	32(%rbx),%rax
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-0(%rbx),%rsi
	leaq	32(%rbx),%rbx
.byte	102,72,15,126,215		/* movq %xmm2,%rdi — res->Z */
	call	__ecp_nistz256_mul_montq	/* tmp = in_z * in_y */
	call	__ecp_nistz256_mul_by_2q	/* res_z = 2 * tmp */

	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq		/* M = in_x + Zsqr */

	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq	/* Zsqr = in_x - Zsqr */

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
.byte	102,72,15,126,207		/* movq %xmm1,%rdi — res->Y */
	call	__ecp_nistz256_sqr_montq	/* res_y = S^2 (= 16*y^4) */

	/*
	 * Halve res_y modulo p: add p, then select the un-added value when
	 * the input was even, then shift the 5-limb value right one bit.
	 * (sqr_montq left the result in r12..r15 and p[1]/p[3] in rsi/rbp.)
	 */
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12
	movq	%r13,%r10
	adcq	%rsi,%r13
	movq	%r14,%rcx
	adcq	$0,%r14
	movq	%r15,%r8
	adcq	%rbp,%r15
	adcq	$0,%r9
	xorq	%rsi,%rsi
	testq	$1,%rax			/* was the value even? */

	cmovzq	%rax,%r12		/* even: undo the +p */
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax		/* 256-bit right shift by 1 */
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	movq	64(%rsp),%rax
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* M = M * (in_x - Zsqr) */

	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q

	leaq	32(%rsp),%rbx
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq		/* M = 3*(x^2 - z^4) */

	movq	96(%rsp),%rax
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* S = S * in_x */

	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q	/* tmp = 2*S */

	movq	0+32(%rsp),%rax
	movq	8+32(%rsp),%r14
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — res->X */
	call	__ecp_nistz256_sqr_montq	/* res_x = M^2 */

	leaq	128(%rsp),%rbx
	movq	%r14,%r8		/* sqr result limbs into sub_fromq's regs */
	movq	%r15,%r9
	movq	%rsi,%r14		/* restore p[1]/p[3] from sqr_montq */
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromq	/* res_x = M^2 - 2*S */

	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subq		/* S - res_x (regs only) */

	movq	32(%rsp),%rax
	leaq	32(%rsp),%rbx
	movq	%r12,%r14
	xorl	%ecx,%ecx
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11		/* flags here come from __ecp_nistz256_subq */
	movq	%r8,0+16(%rsp)
	leaq	0-0(%rsp),%rsi
	cmovzq	%r9,%r12
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* tmp = M * (S - res_x) */

.byte	102,72,15,126,203		/* movq %xmm1,%rbx — res->Y */
.byte	102,72,15,126,207		/* movq %xmm1,%rdi */
	call	__ecp_nistz256_sub_fromq	/* res_y = tmp - res_y */

	addq	$160+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
1089
.globl _ecp_nistz256_point_add
.private_extern _ecp_nistz256_point_add

/*
 * void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
 *                             const P256_POINT *b);
 * SysV AMD64: rdi = r, rsi = a, rdx = b.  Jacobian points, Montgomery form.
 * General point addition.  584 bytes of stack hold copies of both inputs
 * (a at 384/416/448, b at 480/512/544) and the intermediates.  XMM regs
 * carry branchless infinity masks: xmm5 = (a->Z == 0), xmm4 = (b->Z == 0),
 * used at the end to select between sum / a / b per 16-byte chunk.
 * If H == 0 and R == 0 (same x and y), the inputs are equal and control
 * transfers to L$point_double_shortcutq; if they differ only in y the
 * result is the point at infinity (all-zero output).
 * NOTE(review): temp-slot naming (U1/U2/S1/S2/H/R) follows the upstream
 * perlasm and is a best-effort reading here.
 */
.p2align 5
_ecp_nistz256_point_add:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$576+8,%rsp

	movdqu	0(%rsi),%xmm0		/* copy point a onto the stack */
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	%rsi,%rbx		/* rbx = a */
	movq	%rdx,%rsi		/* rsi = b */
	movdqa	%xmm0,384(%rsp)		/* in1_x */
	movdqa	%xmm1,384+16(%rsp)
	movdqa	%xmm2,416(%rsp)		/* in1_y */
	movdqa	%xmm3,416+16(%rsp)
	movdqa	%xmm4,448(%rsp)		/* in1_z */
	movdqa	%xmm5,448+16(%rsp)
	por	%xmm4,%xmm5		/* xmm5 = OR of in1_z halves */

	movdqu	0(%rsi),%xmm0		/* copy point b onto the stack */
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rsi),%xmm3
	movq	64+0(%rsi),%rax		/* b->Z into worker regs */
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,480(%rsp)		/* in2_x */
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,480+16(%rsp)
	movdqu	64(%rsi),%xmm0
	movdqu	80(%rsi),%xmm1
	movdqa	%xmm2,512(%rsp)		/* in2_y */
	movdqa	%xmm3,512+16(%rsp)
	por	%xmm4,%xmm5		/* xmm5 = all-lanes OR of in1_z */
	pxor	%xmm4,%xmm4
	por	%xmm0,%xmm1		/* xmm1 = OR of in2_z halves */
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — save res ptr */

	leaq	64-0(%rsi),%rsi
	movq	%rax,544+0(%rsp)	/* in2_z */
	movq	%r14,544+8(%rsp)
	movq	%r15,544+16(%rsp)
	movq	%r8,544+24(%rsp)
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq	/* Z2sqr = in2_z^2 */

	pcmpeqd	%xmm4,%xmm5		/* xmm5 = (in1_z == 0) mask */
	pshufd	$0xb1,%xmm1,%xmm4
	por	%xmm1,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	pcmpeqd	%xmm3,%xmm4		/* xmm4 = (in2_z == 0) mask */
	pshufd	$0,%xmm4,%xmm4
	movq	64+0(%rbx),%rax		/* a->Z into worker regs */
	movq	64+8(%rbx),%r14
	movq	64+16(%rbx),%r15
	movq	64+24(%rbx),%r8
.byte	102,72,15,110,203		/* movq %rbx,%xmm1 — save a ptr */

	leaq	64-0(%rbx),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq	/* Z1sqr = in1_z^2 */

	movq	544(%rsp),%rax
	leaq	544(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* S1 = Z2sqr * in2_z (= Z2^3) */

	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* S2 = Z1sqr * in1_z (= Z1^3) */

	movq	416(%rsp),%rax
	leaq	416(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* S1 *= in1_y */

	movq	512(%rsp),%rax
	leaq	512(%rsp),%rbx
	movq	0+256(%rsp),%r9
	movq	8+256(%rsp),%r10
	leaq	0+256(%rsp),%rsi
	movq	16+256(%rsp),%r11
	movq	24+256(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* S2 *= in2_y */

	leaq	224(%rsp),%rbx
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq	/* R = S2 - S1 */

	orq	%r13,%r12		/* r12 = 0 iff R == 0 */
	movdqa	%xmm4,%xmm2
	orq	%r8,%r12
	orq	%r9,%r12
	por	%xmm5,%xmm2		/* either input at infinity? */
.byte	102,73,15,110,220		/* movq %r12,%xmm3 — save (R==0) word */

	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* U1 = in1_x * Z2sqr */

	movq	480(%rsp),%rax
	leaq	480(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* U2 = in2_x * Z1sqr */

	leaq	160(%rsp),%rbx
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq	/* H = U2 - U1 */

	orq	%r13,%r12		/* r12 = 0 iff H == 0 */
	orq	%r8,%r12
	orq	%r9,%r12

.byte	0x3e				/* branch-taken hint prefix */
	jnz	L$add_proceedq		/* H != 0: generic addition */
.byte	102,73,15,126,208		/* movq %xmm2,%r8 — infinity mask */
.byte	102,73,15,126,217		/* movq %xmm3,%r9 — (R==0) word */
	testq	%r8,%r8
	jnz	L$add_proceedq		/* an input was infinity */
	testq	%r9,%r9
	jz	L$add_doubleq		/* H==0 and R==0: P == Q, double */

	/* H == 0, R != 0: P == -Q, result is the point at infinity */
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — res ptr */
	pxor	%xmm0,%xmm0
	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm0,16(%rdi)
	movdqu	%xmm0,32(%rdi)
	movdqu	%xmm0,48(%rdi)
	movdqu	%xmm0,64(%rdi)
	movdqu	%xmm0,80(%rdi)
	jmp	L$add_doneq

.p2align 5
L$add_doubleq:
.byte	102,72,15,126,206		/* movq %xmm1,%rsi — a ptr */
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — res ptr */
	addq	$416,%rsp		/* shrink frame to point_double's layout */
	jmp	L$point_double_shortcutq

.p2align 5
L$add_proceedq:
	movq	0+64(%rsp),%rax
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq	/* Rsqr = R^2 */

	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* res_z = H * in1_z */

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq	/* Hsqr = H^2 */

	movq	544(%rsp),%rax
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	0+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* res_z *= in2_z */

	movq	0(%rsp),%rax
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* Hcub = Hsqr * H */

	movq	160(%rsp),%rax
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* U2 = U1 * Hsqr */

	/*
	 * Inline mul_by_2 of the result (2*U2) with the conditional
	 * subtraction folded in, then load Rsqr for the register-only
	 * subtraction below.
	 */
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	96(%rsp),%rsi		/* rsi = Rsqr */
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax		/* a-operand for __ecp_nistz256_subq */
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq		/* res_x = Rsqr - 2*U2 (in regs) */

	leaq	128(%rsp),%rbx
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq	/* res_x -= Hcub */

	movq	192+0(%rsp),%rax
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subq		/* U2 - res_x (in regs) */

	movq	%r12,0(%rdi)		/* store it at 320(%rsp) */
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	128(%rsp),%rax
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* S2 = S1 * Hcub */

	movq	320(%rsp),%rax
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montq	/* tmp = R * (U2 - res_x) */

	leaq	256(%rsp),%rbx
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq	/* res_y = tmp - S2 */

.byte	102,72,15,126,199		/* movq %xmm0,%rdi — res ptr */

	/*
	 * Branchless output selection, per 16-byte chunk:
	 *   res = in1_z==0 ? in2 : (in2_z==0 ? in1 : computed)
	 * xmm5 = (in1_z==0) mask, xmm4 = (in2_z==0) mask.
	 */
	movdqa	%xmm5,%xmm0		/* Z coordinate */
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0		/* X coordinate */
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0		/* Y coordinate */
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

L$add_doneq:
	addq	$576+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
1486
.globl _ecp_nistz256_point_add_affine
.private_extern _ecp_nistz256_point_add_affine

//-----------------------------------------------------------------------
// ecp_nistz256_point_add_affine(res, in1, in2)
//
// In:   %rdi = res  (96 bytes written: X|Y|Z, 32 bytes / 4 limbs each)
//       %rsi = in1  (96 bytes read: projective point X|Y|Z)
//       %rdx = in2  (64 bytes read: affine point X|Y)
//
// Mixed point addition for the P-256 "nistz256" implementation.  All
// field arithmetic is Montgomery arithmetic and is delegated to the
// __ecp_nistz256_mul_montq / __ecp_nistz256_sqr_montq /
// __ecp_nistz256_sub_fromq / __ecp_nistz256_subq helpers defined
// elsewhere in this file.
// NOTE(review): the schedule labels in the comments below (U2, H, R,
// ...) follow the standard nistz256 mixed-addition formulas implied by
// the symbol name -- confirm against the C prototype / upstream perlasm.
//
// Stack frame is 480+8 bytes.  Offsets from %rsp used below:
//   0,32,64,96,128,160,192 : field-element temporaries
//   224 / 256 / 288        : computed X3 / Y3 / Z3 before the final
//                            constant-time infinity selection
//   320 / 352 / 384        : copies of in1->X / in1->Y / in1->Z
//   416 / 448              : copies of in2->X / in2->Y
// Callee-saved GPRs are preserved via push/pop; everything else,
// including %xmm0-%xmm5, is clobbered.
//-----------------------------------------------------------------------
.p2align 5
_ecp_nistz256_point_add_affine:
        pushq   %rbp                            // save callee-saved GPRs
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $480+8,%rsp                     // frame (+8 keeps calls 16-aligned)

        // Copy in1 (projective) into the frame.  in1->Z is also folded
        // into %xmm5 to build an "in1 at infinity" mask further down.
        movdqu  0(%rsi),%xmm0
        movq    %rdx,%rbx                       // %rbx = in2
        movdqu  16(%rsi),%xmm1
        movdqu  32(%rsi),%xmm2
        movdqu  48(%rsi),%xmm3
        movdqu  64(%rsi),%xmm4
        movdqu  80(%rsi),%xmm5
        movq    64+0(%rsi),%rax                 // in1->Z limbs for sqr below
        movq    64+8(%rsi),%r14
        movq    64+16(%rsi),%r15
        movq    64+24(%rsi),%r8
        movdqa  %xmm0,320(%rsp)                 // in1->X
        movdqa  %xmm1,320+16(%rsp)
        movdqa  %xmm2,352(%rsp)                 // in1->Y
        movdqa  %xmm3,352+16(%rsp)
        movdqa  %xmm4,384(%rsp)                 // in1->Z
        movdqa  %xmm5,384+16(%rsp)
        por     %xmm4,%xmm5                     // OR of all in1->Z words

        // Copy in2 (affine) into the frame; fold all of its words into
        // %xmm3 to build an "in2 is all-zero" (infinity encoding) mask.
        movdqu  0(%rbx),%xmm0
        pshufd  $0xb1,%xmm5,%xmm3
        movdqu  16(%rbx),%xmm1
        movdqu  32(%rbx),%xmm2
        por     %xmm3,%xmm5
        movdqu  48(%rbx),%xmm3
        movdqa  %xmm0,416(%rsp)                 // in2->X
        pshufd  $0x1e,%xmm5,%xmm4
        movdqa  %xmm1,416+16(%rsp)
        por     %xmm0,%xmm1
.byte 102,72,15,110,199                         // movq %rdi,%xmm0: stash res ptr
        movdqa  %xmm2,448(%rsp)                 // in2->Y
        movdqa  %xmm3,448+16(%rsp)
        por     %xmm2,%xmm3
        por     %xmm4,%xmm5                     // %xmm5 = fully folded in1->Z
        pxor    %xmm4,%xmm4
        por     %xmm1,%xmm3                     // %xmm3 = fully folded in2 words

        leaq    64-0(%rsi),%rsi                 // %rsi -> in1->Z
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq        // 32(%rsp) = Z1^2

        // Finish both infinity masks while the scalar unit sets up the
        // next multiplication (instructions are deliberately interleaved).
        pcmpeqd %xmm4,%xmm5                     // %xmm5 = (in1->Z == 0) per dword
        pshufd  $0xb1,%xmm3,%xmm4
        movq    0(%rbx),%rax                    // in2->X limb 0

        movq    %r12,%r9                        // Z1^2 limbs -> mul inputs
        por     %xmm3,%xmm4
        pshufd  $0,%xmm5,%xmm5                  // broadcast in1-infinity mask
        pshufd  $0x1e,%xmm4,%xmm3
        movq    %r13,%r10
        por     %xmm3,%xmm4
        pxor    %xmm3,%xmm3
        movq    %r14,%r11
        pcmpeqd %xmm3,%xmm4                     // %xmm4 = (in2 == 0) per dword
        pshufd  $0,%xmm4,%xmm4                  // broadcast in2-infinity mask

        leaq    32-0(%rsp),%rsi
        movq    %r15,%r12
        leaq    0(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 0(%rsp) = X2*Z1^2 (U2)

        leaq    320(%rsp),%rbx                  // X1
        leaq    64(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq        // 64(%rsp) = U2 - X1 (H)

        movq    384(%rsp),%rax                  // Z1
        leaq    384(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 32(%rsp) = Z1^3

        movq    384(%rsp),%rax                  // Z1
        leaq    384(%rsp),%rbx
        movq    0+64(%rsp),%r9
        movq    8+64(%rsp),%r10
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r11
        movq    24+64(%rsp),%r12
        leaq    288(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 288(%rsp) = H*Z1 (Z3)

        movq    448(%rsp),%rax                  // in2->Y
        leaq    448(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 32(%rsp) = Y2*Z1^3 (S2)

        leaq    352(%rsp),%rbx                  // Y1
        leaq    96(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq        // 96(%rsp) = S2 - Y1 (R)

        movq    0+64(%rsp),%rax
        movq    8+64(%rsp),%r14
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r15
        movq    24+64(%rsp),%r8
        leaq    128(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq        // 128(%rsp) = H^2

        movq    0+96(%rsp),%rax
        movq    8+96(%rsp),%r14
        leaq    0+96(%rsp),%rsi
        movq    16+96(%rsp),%r15
        movq    24+96(%rsp),%r8
        leaq    192(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq        // 192(%rsp) = R^2

        movq    128(%rsp),%rax
        leaq    128(%rsp),%rbx
        movq    0+64(%rsp),%r9
        movq    8+64(%rsp),%r10
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r11
        movq    24+64(%rsp),%r12
        leaq    160(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 160(%rsp) = H^3

        movq    320(%rsp),%rax                  // X1
        leaq    320(%rsp),%rbx
        movq    0+128(%rsp),%r9
        movq    8+128(%rsp),%r10
        leaq    0+128(%rsp),%rsi
        movq    16+128(%rsp),%r11
        movq    24+128(%rsp),%r12
        leaq    0(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 0(%rsp) = X1*H^2

        // 2*(X1*H^2) mod p, with a FULL-WIDTH carry word: the doubling
        // can overflow 256 bits, so the carry is kept in %r11 and folded
        // into the borrow of the conditional subtraction of p below.
        // NOTE(review): %r14/%r15 are expected to hold words 1 and 3 of
        // L$poly at this point (left there by the montq helpers) --
        // confirm against the helper definitions earlier in the file.
        xorq    %r11,%r11
        addq    %r12,%r12
        leaq    192(%rsp),%rsi                  // %rsi -> R^2 for the subq below
        adcq    %r13,%r13
        movq    %r12,%rax                       // keep unreduced copies...
        adcq    %r8,%r8
        adcq    %r9,%r9
        movq    %r13,%rbp
        adcq    $0,%r11                         // carry out of the doubling

        subq    $-1,%r12                        // subtract p: p[0] = 2^64-1
        movq    %r8,%rcx
        sbbq    %r14,%r13
        sbbq    $0,%r8
        movq    %r9,%r10
        sbbq    %r15,%r9
        sbbq    $0,%r11                         // borrow ==> value was < p

        cmovcq  %rax,%r12                       // on borrow keep unreduced value
        movq    0(%rsi),%rax                    // load R^2 for __ecp_nistz256_subq
        cmovcq  %rbp,%r13
        movq    8(%rsi),%rbp
        cmovcq  %rcx,%r8
        movq    16(%rsi),%rcx
        cmovcq  %r10,%r9
        movq    24(%rsi),%r10

        call    __ecp_nistz256_subq             // R^2 - 2*X1*H^2

        leaq    160(%rsp),%rbx                  // H^3
        leaq    224(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq        // 224(%rsp) = X3

        movq    0+0(%rsp),%rax                  // X1*H^2
        movq    0+8(%rsp),%rbp
        movq    0+16(%rsp),%rcx
        movq    0+24(%rsp),%r10
        leaq    64(%rsp),%rdi

        call    __ecp_nistz256_subq             // X1*H^2 - X3

        movq    %r12,0(%rdi)                    // store result at 64(%rsp)
        movq    %r13,8(%rdi)
        movq    %r8,16(%rdi)
        movq    %r9,24(%rdi)
        movq    352(%rsp),%rax                  // Y1
        leaq    352(%rsp),%rbx
        movq    0+160(%rsp),%r9
        movq    8+160(%rsp),%r10
        leaq    0+160(%rsp),%rsi
        movq    16+160(%rsp),%r11
        movq    24+160(%rsp),%r12
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 32(%rsp) = Y1*H^3

        movq    96(%rsp),%rax                   // R
        leaq    96(%rsp),%rbx
        movq    0+64(%rsp),%r9
        movq    8+64(%rsp),%r10
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r11
        movq    24+64(%rsp),%r12
        leaq    64(%rsp),%rdi
        call    __ecp_nistz256_mul_montq        // 64(%rsp) = R*(X1*H^2 - X3)

        leaq    32(%rsp),%rbx
        leaq    256(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq        // 256(%rsp) = Y3

.byte 102,72,15,126,199                         // movq %xmm0,%rdi: restore res ptr

        // Constant-time (branch-free) output selection, per coordinate:
        //   res = in1 at infinity ? in2 (with Z = 1 in Montgomery form)
        //       : in2 at infinity ? in1
        //       : computed result
        // %xmm5 = in1-infinity mask, %xmm4 = in2-infinity mask.

        movdqa  %xmm5,%xmm0                     // --- Z coordinate ---
        movdqa  %xmm5,%xmm1
        pandn   288(%rsp),%xmm0                 // computed Z3
        movdqa  %xmm5,%xmm2
        pandn   288+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    L$ONE_mont(%rip),%xmm2          // in2's implicit Z = 1 (Montgomery)
        pand    L$ONE_mont+16(%rip),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    384(%rsp),%xmm2                 // in1->Z
        pand    384+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,64(%rdi)
        movdqu  %xmm3,80(%rdi)

        movdqa  %xmm5,%xmm0                     // --- X coordinate ---
        movdqa  %xmm5,%xmm1
        pandn   224(%rsp),%xmm0                 // computed X3
        movdqa  %xmm5,%xmm2
        pandn   224+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    416(%rsp),%xmm2                 // in2->X
        pand    416+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    320(%rsp),%xmm2                 // in1->X
        pand    320+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,0(%rdi)
        movdqu  %xmm3,16(%rdi)

        movdqa  %xmm5,%xmm0                     // --- Y coordinate ---
        movdqa  %xmm5,%xmm1
        pandn   256(%rsp),%xmm0                 // computed Y3
        movdqa  %xmm5,%xmm2
        pandn   256+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    448(%rsp),%xmm2                 // in2->Y
        pand    448+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    352(%rsp),%xmm2                 // in1->Y
        pand    352+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,32(%rdi)
        movdqu  %xmm3,48(%rdi)

        addq    $480+8,%rsp                     // release frame
        popq    %r15                            // restore callee-saved GPRs
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        popq    %rbp
.byte 0xf3,0xc3                                 // rep ret
1789
1790#endif