blob: 6d21888f049edd4db98e6af6a9b45e07ed945547 [file] [log] [blame]
Robert Sloan6f79a502017-04-03 09:16:40 -07001#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
Adam Langleyfad63272015-11-12 12:15:39 -08002.text
3.extern OPENSSL_ia32cap_P
4.hidden OPENSSL_ia32cap_P
5
6
/* Constant pool for the NIST P-256 routines below.
 * .Lpoly      — the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
 *               little-endian 64-bit limbs.
 * .LOne/.LTwo/.LThree — dword broadcasts used as index counters/steps by the
 *               SSE/AVX2 table-select routines.
 * .LONE_mont  — the value 1 in Montgomery form (1*2^256 mod p).             */
.align	64
.Lpoly:
.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

.LOne:
.long	1,1,1,1,1,1,1,1
.LTwo:
.long	2,2,2,2,2,2,2,2
.LThree:
.long	3,3,3,3,3,3,3,3
.LONE_mont:
.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
21
/* void ecp_nistz256_neg(uint64_t res[4], const uint64_t a[4]);
 * SysV AMD64: rdi = res, rsi = a.
 * Computes res = -a mod p, branchlessly: r8..r11 = 0 - a (borrow tracked in
 * r13), then adds p back and uses cmovz to keep the pre-addition value when
 * no borrow occurred (i.e. when a == 0), so the result stays reduced.
 * Clobbers: rax, rcx, rdx, r8-r11, flags; saves/restores r12, r13.          */
.globl	ecp_nistz256_neg
.hidden	ecp_nistz256_neg
.type	ecp_nistz256_neg,@function
.align	32
ecp_nistz256_neg:
	pushq	%r12
	pushq	%r13

	xorq	%r8,%r8			/* acc = 0 */
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r13,%r13		/* r13 will hold the borrow-out */

	subq	0(%rsi),%r8		/* acc = 0 - a (256-bit sbb chain) */
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r8,%rax		/* snapshot pre-correction limbs */
	sbbq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi	/* rsi now points at p */
	movq	%r9,%rdx
	sbbq	$0,%r13			/* r13 = -1 if borrow (a != 0), else 0 */

	addq	0(%rsi),%r8		/* acc += p */
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13

	cmovzq	%rax,%r8		/* if a == 0, keep 0 - a (= 0) */
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		/* ret, emitted as bytes by the generator */
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
66
67
68
69
Adam Langleyfad63272015-11-12 12:15:39 -080070
71
/* void ecp_nistz256_mul_mont(uint64_t res[4], const uint64_t a[4],
 *                            const uint64_t b[4]);
 * Montgomery multiplication res = a*b*2^-256 mod p.
 * Thin ABI wrapper: saves all callee-saved GPRs, loads b into rbx and the
 * limbs of a into r9..r12 (the register contract expected by
 * __ecp_nistz256_mul_montq), then delegates to that helper.                  */
.globl	ecp_nistz256_mul_mont
.hidden	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,@function
.align	32
ecp_nistz256_mul_mont:
.Lmul_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rdx,%rbx		/* rbx = b (rdx is clobbered by mulq) */
	movq	0(%rdx),%rax		/* rax = b[0], first multiplier word */
	movq	0(%rsi),%r9		/* r9..r12 = a[0..3] */
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12

	call	__ecp_nistz256_mul_montq
.Lmul_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
101
/* Core Montgomery multiplication mod p (non-BMI2 path).
 * In:  rdi = res, rsi = a, rbx = b, rax = b[0], r9..r12 = a[0..3].
 * Out: res = a*b*2^-256 mod p, fully reduced.
 * Multiplication by each b[i] is interleaved with one Montgomery reduction
 * step.  Reduction exploits the shape of p: because p[0] = 2^64-1 and
 * p[2] = 0, acc*p folds down to shifts of acc plus acc*p[3] (via mulq %r15,
 * r15 = .Lpoly+24); r14 caches p[1] for the final subtraction.
 * Clobbers: rax, rbp, rbx, rcx, rdx, r8-r15, flags.                         */
.type	__ecp_nistz256_mul_montq,@function
.align	32
__ecp_nistz256_mul_montq:
	/* round 0: acc = a * b[0] */
	movq	%rax,%rbp		/* rbp keeps the current b[i] */
	mulq	%r9
	movq	.Lpoly+8(%rip),%r14	/* r14 = p[1] */
	movq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%r9

	mulq	%r10
	movq	.Lpoly+24(%rip),%r15	/* r15 = p[3] */
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%r11
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r12
	addq	%rax,%r11
	movq	%r8,%rax		/* rax = acc[0] for the reduction mul */
	adcq	$0,%rdx
	xorq	%r13,%r13
	movq	%rdx,%r12

	/* reduction step 0: fold acc[0]*p into the accumulator */
	movq	%r8,%rbp
	shlq	$32,%r8			/* acc[0] << 32 and >> 32 stand in for */
	mulq	%r15			/* acc[0]*(p[0],p[1]); rax:rdx = acc[0]*p[3] */
	shrq	$32,%rbp
	addq	%r8,%r9
	adcq	%rbp,%r10
	adcq	%rax,%r11
	movq	8(%rbx),%rax		/* preload b[1] */
	adcq	%rdx,%r12
	adcq	$0,%r13
	xorq	%r8,%r8

	/* round 1: acc += a * b[1] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx		/* rcx carries between partial products */

	mulq	8(%rsi)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8

	/* reduction step 1 */
	movq	%r9,%rbp
	shlq	$32,%r9
	mulq	%r15
	shrq	$32,%rbp
	addq	%r9,%r10
	adcq	%rbp,%r11
	adcq	%rax,%r12
	movq	16(%rbx),%rax		/* preload b[2] */
	adcq	%rdx,%r13
	adcq	$0,%r8
	xorq	%r9,%r9

	/* round 2: acc += a * b[2] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9

	/* reduction step 2 */
	movq	%r10,%rbp
	shlq	$32,%r10
	mulq	%r15
	shrq	$32,%rbp
	addq	%r10,%r11
	adcq	%rbp,%r12
	adcq	%rax,%r13
	movq	24(%rbx),%rax		/* preload b[3] */
	adcq	%rdx,%r8
	adcq	$0,%r9
	xorq	%r10,%r10

	/* round 3: acc += a * b[3] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10

	/* reduction step 3; keep copies for the conditional subtraction */
	movq	%r11,%rbp
	shlq	$32,%r11
	mulq	%r15
	shrq	$32,%rbp
	addq	%r11,%r12
	adcq	%rbp,%r13
	movq	%r12,%rcx		/* save pre-subtraction limbs */
	adcq	%rax,%r8
	adcq	%rdx,%r9
	movq	%r13,%rbp
	adcq	$0,%r10			/* r10 = top carry word */

	/* final branchless subtraction of p (p[0] = -1, p[2] = 0) */
	subq	$-1,%r12
	movq	%r8,%rbx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%rdx
	sbbq	%r15,%r9
	sbbq	$0,%r10			/* borrow out => result was < p */

	cmovcq	%rcx,%r12		/* undo the subtraction on borrow */
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rbx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rdx,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* ret */
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
317
318
319
320
321
322
323
324
/* void ecp_nistz256_sqr_mont(uint64_t res[4], const uint64_t a[4]);
 * Montgomery squaring res = a*a*2^-256 mod p.
 * ABI wrapper: saves callee-saved GPRs, loads a[0] into rax and a[1..3]
 * into r14/r15/r8 (the contract of __ecp_nistz256_sqr_montq), delegates.    */
.globl	ecp_nistz256_sqr_mont
.hidden	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,@function
.align	32
ecp_nistz256_sqr_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	0(%rsi),%rax		/* rax,r14,r15,r8 = a[0..3] */
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8

	call	__ecp_nistz256_sqr_montq
.Lsqr_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
351
/* Core Montgomery squaring mod p (non-BMI2 path).
 * In:  rdi = res, rsi = a, rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3].
 * Out: res = a^2 * 2^-256 mod p, fully reduced.
 * Strategy: compute the off-diagonal products a[i]*a[j] (i<j), double them
 * with one add/adc chain, add the diagonal squares, then run four Montgomery
 * reduction steps (using p's special shape, see __ecp_nistz256_mul_montq)
 * followed by a branchless conditional subtraction of p.
 * Clobbers: rax, rbp, rcx, rdx, rsi, r8-r15, flags.                         */
.type	__ecp_nistz256_sqr_montq,@function
.align	32
__ecp_nistz256_sqr_montq:
	/* off-diagonal products: r9..r14 = sum of a[i]*a[j], i<j */
	movq	%rax,%r13		/* r13 = a[0] */
	mulq	%r14			/* a[0]*a[1] */
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10

	mulq	%r13			/* a[2]*a[0] */
	addq	%rax,%r10
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r13			/* a[3]*a[0] */
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12

	mulq	%r14			/* a[2]*a[1] */
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%r14			/* a[3]*a[1] */
	addq	%rax,%r12
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r15			/* a[3]*a[2] */
	xorq	%r15,%r15
	addq	%rax,%r13
	movq	0(%rsi),%rax		/* reload a[0] for the diagonal */
	movq	%rdx,%r14
	adcq	$0,%r14

	/* double the off-diagonal sum */
	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15

	/* add the diagonal squares a[i]^2 */
	mulq	%rax
	movq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r9
	adcq	%rax,%r10
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r11
	adcq	%rax,%r12
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r13
	adcq	%rax,%r14
	movq	%r8,%rax
	adcq	%rdx,%r15

	movq	.Lpoly+8(%rip),%rsi	/* rsi = p[1] (a-pointer no longer needed) */
	movq	.Lpoly+24(%rip),%rbp	/* rbp = p[3] */

	/* four Montgomery reduction steps on r8..r11 */
	movq	%r8,%rcx
	shlq	$32,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	adcq	%rax,%r10
	adcq	$0,%rdx
	xorq	%r11,%r11

	/* fold reduced low half into the high half */
	addq	%r8,%r12
	adcq	%r9,%r13
	movq	%r12,%r8		/* save pre-subtraction limbs */
	adcq	%r10,%r14
	adcq	%rdx,%r15
	movq	%r13,%r9
	adcq	$0,%r11			/* top carry */

	/* branchless conditional subtraction of p */
	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13
	sbbq	$0,%r14
	movq	%r15,%rcx
	sbbq	%rbp,%r15
	sbbq	$0,%r11

	cmovcq	%r8,%r12		/* borrow => keep unsubtracted value */
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%rcx,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3		/* ret */
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
513
514
/* void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16],
 *                             int index);   rdi = val, rsi = table, edx = index
 * Constant-time table lookup for the window-5 precomputed table: scans all
 * 16 entries (96 bytes each) and masks in the one whose running counter
 * equals `index`, so memory access pattern is independent of the secret
 * index.  Dispatches to the AVX2 variant when bit 5 of the second
 * OPENSSL_ia32cap_P word (AVX2) is set.                                     */
.globl	ecp_nistz256_select_w5
.hidden	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,@function
.align	32
ecp_nistz256_select_w5:
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	movq	8(%rax),%rax
	testl	$32,%eax		/* AVX2 available? */
	jnz	.Lavx2_select_w5
	movdqa	.LOne(%rip),%xmm0	/* counter increment */
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		/* xmm2..xmm7 accumulate the entry */
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7

	movdqa	%xmm0,%xmm8		/* xmm8 = running index, starts at 1 */
	pshufd	$0,%xmm1,%xmm1		/* broadcast target index */

	movq	$16,%rax		/* 16 table entries */
.Lselect_loop_sse_w5:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	pcmpeqd	%xmm1,%xmm15		/* all-ones mask iff this is the entry */

	movdqa	0(%rsi),%xmm9		/* load full 96-byte entry */
	movdqa	16(%rsi),%xmm10
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	movdqa	64(%rsi),%xmm13
	movdqa	80(%rsi),%xmm14
	leaq	96(%rsi),%rsi

	pand	%xmm15,%xmm9		/* mask and accumulate */
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	pand	%xmm15,%xmm13
	por	%xmm12,%xmm5
	pand	%xmm15,%xmm14
	por	%xmm13,%xmm6
	por	%xmm14,%xmm7

	decq	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm7,80(%rdi)
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
576
577
578
/* void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
 *                             const P256_POINT_AFFINE in_t[64], int index);
 * Constant-time lookup for the window-7 affine table: scans all 64 entries
 * (64 bytes each) and masks in the matching one, same technique as
 * ecp_nistz256_select_w5.  Dispatches to AVX2 when available.               */
.globl	ecp_nistz256_select_w7
.hidden	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,@function
.align	32
ecp_nistz256_select_w7:
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	movq	8(%rax),%rax
	testl	$32,%eax		/* AVX2 available? */
	jnz	.Lavx2_select_w7
	movdqa	.LOne(%rip),%xmm8
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		/* xmm2..xmm5 accumulate the entry */
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	movdqa	%xmm8,%xmm0		/* xmm0 = increment, xmm8 = counter */
	pshufd	$0,%xmm1,%xmm1		/* broadcast target index */
	movq	$64,%rax		/* 64 table entries */

.Lselect_loop_sse_w7:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	pcmpeqd	%xmm1,%xmm15		/* all-ones mask iff this is the entry */
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	leaq	64(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	prefetcht0	255(%rsi)	/* hint: next entries will be read */
	por	%xmm12,%xmm5

	decq	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
Robert Sloan8f860b12017-08-28 07:37:06 -0700629
630
/* AVX2 variant of the window-5 constant-time select (reached via the
 * .Lavx2_select_w5 dispatch in ecp_nistz256_select_w5).  Processes two
 * 96-byte table entries per iteration with two counters (ymm5/ymm10,
 * stepped by .LTwo), accumulating the masked entry with vpxor.
 * vzeroupper on entry/exit avoids AVX->SSE transition penalties.            */
.type	ecp_nistz256_avx2_select_w5,@function
.align	32
ecp_nistz256_avx2_select_w5:
.Lavx2_select_w5:
	vzeroupper
	vmovdqa	.LTwo(%rip),%ymm0	/* counter step = 2 (two entries/iter) */

	vpxor	%ymm2,%ymm2,%ymm2	/* ymm2..ymm4 accumulate the entry */
	vpxor	%ymm3,%ymm3,%ymm3
	vpxor	%ymm4,%ymm4,%ymm4

	vmovdqa	.LOne(%rip),%ymm5	/* counter for even entries */
	vmovdqa	.LTwo(%rip),%ymm10	/* counter for odd entries */

	vmovd	%edx,%xmm1
	vpermd	%ymm1,%ymm2,%ymm1	/* broadcast target index */

	movq	$8,%rax			/* 16 entries, 2 per iteration */
.Lselect_loop_avx2_w5:
	vmovdqa	0(%rsi),%ymm6		/* entry i */
	vmovdqa	32(%rsi),%ymm7
	vmovdqa	64(%rsi),%ymm8

	vmovdqa	96(%rsi),%ymm11		/* entry i+1 */
	vmovdqa	128(%rsi),%ymm12
	vmovdqa	160(%rsi),%ymm13

	vpcmpeqd	%ymm1,%ymm5,%ymm9	/* masks for the two entries */
	vpcmpeqd	%ymm1,%ymm10,%ymm14

	vpaddd	%ymm0,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm10,%ymm10
	leaq	192(%rsi),%rsi

	vpand	%ymm9,%ymm6,%ymm6
	vpand	%ymm9,%ymm7,%ymm7
	vpand	%ymm9,%ymm8,%ymm8
	vpand	%ymm14,%ymm11,%ymm11
	vpand	%ymm14,%ymm12,%ymm12
	vpand	%ymm14,%ymm13,%ymm13

	vpxor	%ymm6,%ymm2,%ymm2	/* accumulate (masked entries are 0/value) */
	vpxor	%ymm7,%ymm3,%ymm3
	vpxor	%ymm8,%ymm4,%ymm4
	vpxor	%ymm11,%ymm2,%ymm2
	vpxor	%ymm12,%ymm3,%ymm3
	vpxor	%ymm13,%ymm4,%ymm4

	decq	%rax
	jnz	.Lselect_loop_avx2_w5

	vmovdqu	%ymm2,0(%rdi)
	vmovdqu	%ymm3,32(%rdi)
	vmovdqu	%ymm4,64(%rdi)
	vzeroupper
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
689
690
691
/* AVX2 variant of the window-7 constant-time select.  Processes three
 * 64-byte table entries per iteration with three counters (ymm4/ymm8/ymm12,
 * stepped by .LThree): 21 iterations cover 63 entries, the 64th is handled
 * by the tail after the loop.                                               */
.globl	ecp_nistz256_avx2_select_w7
.hidden	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,@function
.align	32
ecp_nistz256_avx2_select_w7:
.Lavx2_select_w7:
	vzeroupper
	vmovdqa	.LThree(%rip),%ymm0	/* counter step = 3 */

	vpxor	%ymm2,%ymm2,%ymm2	/* ymm2/ymm3 accumulate the entry */
	vpxor	%ymm3,%ymm3,%ymm3

	vmovdqa	.LOne(%rip),%ymm4	/* three interleaved counters */
	vmovdqa	.LTwo(%rip),%ymm8
	vmovdqa	.LThree(%rip),%ymm12

	vmovd	%edx,%xmm1
	vpermd	%ymm1,%ymm2,%ymm1	/* broadcast target index */

	movq	$21,%rax		/* 21*3 = 63 entries in the loop */
.Lselect_loop_avx2_w7:
	vmovdqa	0(%rsi),%ymm5		/* entry i */
	vmovdqa	32(%rsi),%ymm6

	vmovdqa	64(%rsi),%ymm9		/* entry i+1 */
	vmovdqa	96(%rsi),%ymm10

	vmovdqa	128(%rsi),%ymm13	/* entry i+2 */
	vmovdqa	160(%rsi),%ymm14

	vpcmpeqd	%ymm1,%ymm4,%ymm7
	vpcmpeqd	%ymm1,%ymm8,%ymm11
	vpcmpeqd	%ymm1,%ymm12,%ymm15

	vpaddd	%ymm0,%ymm4,%ymm4
	vpaddd	%ymm0,%ymm8,%ymm8
	vpaddd	%ymm0,%ymm12,%ymm12
	leaq	192(%rsi),%rsi

	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6
	vpand	%ymm11,%ymm9,%ymm9
	vpand	%ymm11,%ymm10,%ymm10
	vpand	%ymm15,%ymm13,%ymm13
	vpand	%ymm15,%ymm14,%ymm14

	vpxor	%ymm5,%ymm2,%ymm2
	vpxor	%ymm6,%ymm3,%ymm3
	vpxor	%ymm9,%ymm2,%ymm2
	vpxor	%ymm10,%ymm3,%ymm3
	vpxor	%ymm13,%ymm2,%ymm2
	vpxor	%ymm14,%ymm3,%ymm3

	decq	%rax
	jnz	.Lselect_loop_avx2_w7

	/* tail: 64th entry */
	vmovdqa	0(%rsi),%ymm5
	vmovdqa	32(%rsi),%ymm6

	vpcmpeqd	%ymm1,%ymm4,%ymm7

	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6

	vpxor	%ymm5,%ymm2,%ymm2
	vpxor	%ymm6,%ymm3,%ymm3

	vmovdqu	%ymm2,0(%rdi)
	vmovdqu	%ymm3,32(%rdi)
	vzeroupper
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
/* Internal helper: res = a + b mod p.
 * In:  r12,r13,r8,r9 = a; rbx = b; rdi = res; r14 = p[1], r15 = p[3]
 *      (loaded by the caller).
 * Out: reduced sum in r12,r13,r8,r9 and stored to (rdi).
 * Branchless: add, then conditionally subtract p via sbb/cmovc, tracking
 * the add's carry-out in r11 so values in [p, 2p) reduce correctly.
 * Clobbers: rax, rbp, rcx, r10, r11, flags.                                 */
.type	__ecp_nistz256_add_toq,@function
.align	32
__ecp_nistz256_add_toq:
	xorq	%r11,%r11		/* r11 = carry-out of the addition */
	addq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax		/* save pre-subtraction limbs */
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		/* subtract p (p[0] = -1, p[2] = 0) */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11			/* borrow => sum was < p */

	cmovcq	%rax,%r12		/* keep unsubtracted value on borrow */
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* ret */
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
798
/* Internal helper: res = a - b mod p.
 * In:  r12,r13,r8,r9 = a; rbx = b; rdi = res; r14 = p[1], r15 = p[3].
 * Out: reduced difference in r12,r13,r8,r9 and stored to (rdi).
 * Branchless: subtract, then conditionally add p back when the subtraction
 * borrowed (r11 = all-ones borrow mask from sbb %r11,%r11).
 * Clobbers: rax, rbp, rcx, r10, r11, flags.                                 */
.type	__ecp_nistz256_sub_fromq,@function
.align	32
__ecp_nistz256_sub_fromq:
	subq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax		/* save pre-correction limbs */
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		/* r11 = -1 on borrow, else 0 */

	addq	$-1,%r12		/* add p back (p[0] = -1, p[2] = 0) */
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9
	testq	%r11,%r11

	cmovzq	%rax,%r12		/* no borrow => keep original difference */
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* ret */
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
829
/* Internal helper: register-only modular subtraction.
 * In:  rax,rbp,rcx,r10 = a; r12,r13,r8,r9 = b; r14 = p[1], r15 = p[3].
 * Out: r12,r13,r8,r9 = a - b mod p (result left in registers; NOT stored —
 *      callers store it themselves, see ecp_nistz256_point_add).
 * Branchless: a-b, then add p and cmovnz to pick the corrected value only
 * when the subtraction borrowed.
 * Clobbers: rax, rbp, rcx, r10, r11, flags.                                 */
.type	__ecp_nistz256_subq,@function
.align	32
__ecp_nistz256_subq:
	subq	%r12,%rax		/* a - b */
	sbbq	%r13,%rbp
	movq	%rax,%r12		/* result defaults to raw difference */
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	%r11,%r11		/* r11 = -1 on borrow, else 0 */

	addq	$-1,%rax		/* difference + p (p[0] = -1, p[2] = 0) */
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10
	testq	%r11,%r11

	cmovnzq	%rax,%r12		/* borrow => use the corrected value */
	cmovnzq	%rbp,%r13
	cmovnzq	%rcx,%r8
	cmovnzq	%r10,%r9

	.byte	0xf3,0xc3		/* ret */
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
856
/* Internal helper: res = 2*a mod p.
 * In:  r12,r13,r8,r9 = a; rdi = res; r14 = p[1], r15 = p[3].
 * Out: reduced doubling in r12,r13,r8,r9 and stored to (rdi).
 * Same shape as __ecp_nistz256_add_toq but doubling in place: add the value
 * to itself, then branchless conditional subtraction of p.
 * Clobbers: rax, rbp, rcx, r10, r11, flags.                                 */
.type	__ecp_nistz256_mul_by_2q,@function
.align	32
__ecp_nistz256_mul_by_2q:
	xorq	%r11,%r11		/* r11 = carry-out of the doubling */
	addq	%r12,%r12		/* a + a */
	adcq	%r13,%r13
	movq	%r12,%rax		/* save pre-subtraction limbs */
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		/* subtract p (p[0] = -1, p[2] = 0) */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11			/* borrow => sum was < p */

	cmovcq	%rax,%r12		/* keep unsubtracted value on borrow */
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* ret */
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
/* void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
 * Jacobian point doubling r = 2*a, with coordinates in Montgomery form.
 * Point layout: X at +0, Y at +32, Z at +64 (32 bytes each).
 * Uses 168 bytes of stack as scratch field elements (S at 0, M at 32,
 * Zsqr at 64, in_x copy at 96, tmp0 at 128) and delegates all field
 * arithmetic to the __ecp_nistz256_* helpers above.
 * .Lpoint_double_shortcutq is the re-entry point used by
 * ecp_nistz256_point_add when it detects a == b (doubling case).            */
.globl	ecp_nistz256_point_double
.hidden	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,@function
.align	32
ecp_nistz256_point_double:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$160+8,%rsp		/* scratch + realignment */

.Lpoint_double_shortcutq:
	movdqu	0(%rsi),%xmm0		/* copy in_x to 96(%rsp) */
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12		/* load in_y into helper input regs */
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	.Lpoly+8(%rip),%r14	/* p[1]/p[3] for the helpers */
	movq	.Lpoly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10
	leaq	64(%rdi),%r11
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — stash res pointers */
.byte	102,73,15,110,202		/* movq %r10,%xmm1 */
.byte	102,73,15,110,211		/* movq %r11,%xmm2 */

	leaq	0(%rsp),%rdi		/* S = 2*in_y */
	call	__ecp_nistz256_mul_by_2q

	movq	64+0(%rsi),%rax		/* Zsqr = Z^2 */
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-0(%rsi),%rsi
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	0+0(%rsp),%rax		/* S = S^2 = 4*Y^2 */
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	32(%rbx),%rax		/* res_z = 2*Y*Z */
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-0(%rbx),%rsi
	leaq	32(%rbx),%rbx
.byte	102,72,15,126,215		/* movq %xmm2,%rdi — res_z pointer */
	call	__ecp_nistz256_mul_montq
	call	__ecp_nistz256_mul_by_2q

	movq	96+0(%rsp),%r12		/* M = in_x + Zsqr */
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96+0(%rsp),%r12		/* Zsqr = in_x - Zsqr */
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		/* res_y = S^2 = 16*Y^4 */
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
.byte	102,72,15,126,207		/* movq %xmm1,%rdi — res_y pointer */
	call	__ecp_nistz256_sqr_montq

	/* halve res_y mod p: add p if odd, then shift right one bit */
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12		/* value + p */
	movq	%r13,%r10
	adcq	%rsi,%r13
	movq	%r14,%rcx
	adcq	$0,%r14
	movq	%r15,%r8
	adcq	%rbp,%r15
	adcq	$0,%r9
	xorq	%rsi,%rsi
	testq	$1,%rax			/* low bit of the original value */

	cmovzq	%rax,%r12		/* even => keep original (no +p) */
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax		/* 257-bit right shift by 1 */
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	movq	64(%rsp),%rax		/* M = M * (in_x - Zsqr) */
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		/* tmp0 = 2*M */
	call	__ecp_nistz256_mul_by_2q

	leaq	32(%rsp),%rbx		/* M = 3*(in_x^2 - Z^4) */
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96(%rsp),%rax		/* S = S * in_x */
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		/* tmp0 = 2*S */
	call	__ecp_nistz256_mul_by_2q

	movq	0+32(%rsp),%rax		/* res_x = M^2 */
	movq	8+32(%rsp),%r14
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — res_x pointer */
	call	__ecp_nistz256_sqr_montq

	leaq	128(%rsp),%rbx		/* res_x = M^2 - 2*S */
	movq	%r14,%r8
	movq	%r15,%r9
	movq	%rsi,%r14
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		/* S = S - res_x (register form) */
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subq

	movq	32(%rsp),%rax		/* S = S * M, spilling limbs to 0(%rsp) */
	leaq	32(%rsp),%rbx
	movq	%r12,%r14
	xorl	%ecx,%ecx
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11
	movq	%r8,0+16(%rsp)
	leaq	0-0(%rsp),%rsi
	cmovzq	%r9,%r12
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

.byte	102,72,15,126,203		/* movq %xmm1,%rbx — res_y */
.byte	102,72,15,126,207		/* movq %xmm1,%rdi */
	call	__ecp_nistz256_sub_fromq	/* res_y = S*M - 8*Y^4 */

	addq	$160+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
/* void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
 *                             const P256_POINT *b);
 * Jacobian point addition r = a + b, coordinates in Montgomery form.
 * Uses 584 bytes of stack scratch; both input points are copied to the
 * stack (a at 384/416/448, b at 480/512/544) so the output may alias an
 * input.  SSE masks are built on the fly: xmm5 becomes the all-ones mask
 * for "b is the point at infinity" (Z_b limbs all zero) and xmm4 the mask
 * for "a is infinity"; they drive the branchless output selection at the
 * end.  If the points are equal the code tail-jumps into
 * .Lpoint_double_shortcutq; if they are opposite it stores the zero point. */
.globl	ecp_nistz256_point_add
.hidden	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,@function
.align	32
ecp_nistz256_point_add:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$576+8,%rsp

	movdqu	0(%rsi),%xmm0		/* copy point a to the stack */
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	%rsi,%rbx		/* rbx = a, rsi = b from here on */
	movq	%rdx,%rsi
	movdqa	%xmm0,384(%rsp)
	movdqa	%xmm1,384+16(%rsp)
	movdqa	%xmm2,416(%rsp)
	movdqa	%xmm3,416+16(%rsp)
	movdqa	%xmm4,448(%rsp)
	movdqa	%xmm5,448+16(%rsp)
	por	%xmm4,%xmm5		/* xmm5 = OR of a's Z limbs */

	movdqu	0(%rsi),%xmm0		/* copy point b to the stack */
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rsi),%xmm3
	movq	64+0(%rsi),%rax		/* Z_b limbs for the squaring below */
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,480(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,480+16(%rsp)
	movdqu	64(%rsi),%xmm0		/* b's Z for the infinity test */
	movdqu	80(%rsi),%xmm1
	movdqa	%xmm2,512(%rsp)
	movdqa	%xmm3,512+16(%rsp)
	por	%xmm4,%xmm5		/* xmm5: all-zero iff Z_a == 0 */
	pxor	%xmm4,%xmm4
	por	%xmm0,%xmm1		/* xmm1 = OR of b's Z limbs */
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — stash res pointer */

	leaq	64-0(%rsi),%rsi
	movq	%rax,544+0(%rsp)	/* save Z_b */
	movq	%r14,544+8(%rsp)
	movq	%r15,544+16(%rsp)
	movq	%r8,544+24(%rsp)
	leaq	96(%rsp),%rdi		/* Z2sqr = Z_b^2 */
	call	__ecp_nistz256_sqr_montq

	pcmpeqd	%xmm4,%xmm5		/* xmm5 = mask(a == infinity) */
	pshufd	$0xb1,%xmm1,%xmm4
	por	%xmm1,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	pcmpeqd	%xmm3,%xmm4		/* xmm4 = mask(b == infinity) */
	pshufd	$0,%xmm4,%xmm4
	movq	64+0(%rbx),%rax		/* Z_a limbs */
	movq	64+8(%rbx),%r14
	movq	64+16(%rbx),%r15
	movq	64+24(%rbx),%r8
.byte	102,72,15,110,203		/* movq %rbx,%xmm1 — stash a pointer */

	leaq	64-0(%rbx),%rsi
	leaq	32(%rsp),%rdi		/* Z1sqr = Z_a^2 */
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		/* S1 = Z_b * Z2sqr (= Z_b^3) */
	leaq	544(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	448(%rsp),%rax		/* S2 = Z_a * Z1sqr (= Z_a^3) */
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	416(%rsp),%rax		/* S1 = Y_a * Z_b^3 */
	leaq	416(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	512(%rsp),%rax		/* S2 = Y_b * Z_a^3 */
	leaq	512(%rsp),%rbx
	movq	0+256(%rsp),%r9
	movq	8+256(%rsp),%r10
	leaq	0+256(%rsp),%rsi
	movq	16+256(%rsp),%r11
	movq	24+256(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	224(%rsp),%rbx		/* R = S2 - S1 */
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		/* r12 != 0 iff S1 != S2 */
	movdqa	%xmm4,%xmm2
	orq	%r8,%r12
	orq	%r9,%r12
	por	%xmm5,%xmm2		/* xmm2 = mask(either input infinite) */
.byte	102,73,15,110,220		/* movq %r12,%xmm3 — save y-differ flag */

	movq	384(%rsp),%rax		/* U1 = X_a * Z2sqr */
	leaq	384(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	480(%rsp),%rax		/* U2 = X_b * Z1sqr */
	leaq	480(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	160(%rsp),%rbx		/* H = U2 - U1 */
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		/* H == 0 => same x coordinate */
	orq	%r8,%r12
	orq	%r9,%r12

.byte	0x3e				/* branch-taken hint prefix */
	jnz	.Ladd_proceedq		/* distinct x => generic addition */
.byte	102,73,15,126,208		/* movq %xmm2,%r8 — infinity mask */
.byte	102,73,15,126,217		/* movq %xmm3,%r9 — y-differ flag */
	testq	%r8,%r8
	jnz	.Ladd_proceedq		/* an input was infinity: masks handle it */
	testq	%r9,%r9
	jz	.Ladd_doubleq		/* same x, same y => doubling */

	/* same x, opposite y => result is the point at infinity */
.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
	pxor	%xmm0,%xmm0
	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm0,16(%rdi)
	movdqu	%xmm0,32(%rdi)
	movdqu	%xmm0,48(%rdi)
	movdqu	%xmm0,64(%rdi)
	movdqu	%xmm0,80(%rdi)
	jmp	.Ladd_doneq

.align	32
.Ladd_doubleq:
.byte	102,72,15,126,206		/* movq %xmm1,%rsi — original a */
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — res */
	addq	$416,%rsp		/* shrink frame to point_double's size */
	jmp	.Lpoint_double_shortcutq

.align	32
.Ladd_proceedq:
	movq	0+64(%rsp),%rax		/* Rsqr = R^2 */
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	448(%rsp),%rax		/* res_z = Z_a * H */
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0+0(%rsp),%rax		/* Hsqr = H^2 */
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		/* res_z = res_z * Z_b */
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	0+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0(%rsp),%rax		/* Hcub = H * H^2 */
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	160(%rsp),%rax		/* U2 = U1 * H^2 */
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* inline mul_by_2: Hsqr = 2*U2 mod p (result kept in registers) */
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	96(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		/* conditional subtraction of p */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax		/* load Rsqr for the subtraction */
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq	/* res_x = R^2 - 2*U2 ... */

	leaq	128(%rsp),%rbx		/* res_x -= Hcub */
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	192+0(%rsp),%rax	/* res_y = U2 - res_x (register form) */
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)		/* store register result of subq */
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	128(%rsp),%rax		/* S2 = S1 * Hcub */
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	320(%rsp),%rax		/* res_y = res_y * R */
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	256(%rsp),%rbx		/* res_y -= S2 */
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199		/* movq %xmm0,%rdi — res pointer */

	/* Branchless output select per coordinate:
	 * res = a infinite ? b : (b infinite ? a : computed)
	 * xmm5 = mask(a==inf), xmm4 = mask(b==inf).  Z first, then X, Y.      */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0		/* computed res_z if a finite */
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2		/* Z_b if a infinite */
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2		/* Z_a if b infinite */
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0		/* same selection for X */
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0		/* same selection for Y */
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

.Ladd_doneq:
	addq	$576+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* ret */
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
#-----------------------------------------------------------------------
# void ecp_nistz256_point_add_affine(P256_POINT *out,          /* %rdi */
#                                    const P256_POINT *in1,    /* %rsi */
#                                    const P256_POINT_AFFINE *in2 /* %rdx */)
#
# NOTE(review): machine-generated output of perlasm (ecp_nistz256-x86_64.pl);
# do not hand-edit instructions — fix the generator instead. ABI: SysV AMD64.
# Presumably adds a Jacobian P-256 point (96 bytes at %rsi: X,Y,Z) and an
# affine point (64 bytes at %rdx: X,Y), writing a Jacobian result (96 bytes)
# to %rdi — TODO confirm against the perlasm source.
# Constant-time: all conditional selection is done with cmovcq and
# pand/pandn/por masks; there are no data-dependent branches.
# Clobbers: rax,rbx,rcx,rbp,r8-r15, xmm0-xmm5, flags (callee-saved regs
# are saved/restored below). Stack frame: 6 pushes + 488 bytes of locals.
#-----------------------------------------------------------------------
1488.globl	ecp_nistz256_point_add_affine
1489.hidden ecp_nistz256_point_add_affine
1490.type	ecp_nistz256_point_add_affine,@function
1491.align	32
1492ecp_nistz256_point_add_affine:
# Prologue: preserve all callee-saved GPRs, then carve out the local frame.
1493	pushq	%rbp
1494	pushq	%rbx
1495	pushq	%r12
1496	pushq	%r13
1497	pushq	%r14
1498	pushq	%r15
1499	subq	$480+8,%rsp
1500
# Copy in1 (96 bytes: X at +0, Y at +32, Z at +64) into stack slots
# 320..415; keep Z1's four limbs in rax,r14,r15,r8 for the first squaring.
# %rdx (in2) is moved to %rbx because mul_mont's caller protocol uses
# %rbx as the second-operand pointer and %rdx is clobbered by mulq.
1501	movdqu	0(%rsi),%xmm0
1502	movq	%rdx,%rbx
1503	movdqu	16(%rsi),%xmm1
1504	movdqu	32(%rsi),%xmm2
1505	movdqu	48(%rsi),%xmm3
1506	movdqu	64(%rsi),%xmm4
1507	movdqu	80(%rsi),%xmm5
1508	movq	64+0(%rsi),%rax
1509	movq	64+8(%rsi),%r14
1510	movq	64+16(%rsi),%r15
1511	movq	64+24(%rsi),%r8
1512	movdqa	%xmm0,320(%rsp)
1513	movdqa	%xmm1,320+16(%rsp)
Adam Langleyfad63272015-11-12 12:15:39 -08001514	movdqa	%xmm2,352(%rsp)
1515	movdqa	%xmm3,352+16(%rsp)
Adam Langleyfad63272015-11-12 12:15:39 -08001516	movdqa	%xmm4,384(%rsp)
1517	movdqa	%xmm5,384+16(%rsp)
# Begin building the in1-at-infinity mask: OR together all 32 bytes of Z1
# (xmm4|xmm5), to be reduced to an all-zeros test via pshufd/por/pcmpeqd.
Steven Valdez909b19f2016-11-21 15:35:44 -05001518	por	%xmm4,%xmm5
Adam Langleyfad63272015-11-12 12:15:39 -08001519
# Copy in2 (affine: X at +0, Y at +32, 64 bytes total) into stack slots
# 416..479, interleaved with the infinity-mask folding for scheduling.
1520	movdqu	0(%rbx),%xmm0
Steven Valdez909b19f2016-11-21 15:35:44 -05001521	pshufd	$0xb1,%xmm5,%xmm3
Adam Langleyfad63272015-11-12 12:15:39 -08001522	movdqu	16(%rbx),%xmm1
1523	movdqu	32(%rbx),%xmm2
1524	por	%xmm3,%xmm5
1525	movdqu	48(%rbx),%xmm3
1526	movdqa	%xmm0,416(%rsp)
David Benjamin4969cc92016-04-22 15:02:23 -04001527	pshufd	$0x1e,%xmm5,%xmm4
Adam Langleyfad63272015-11-12 12:15:39 -08001528	movdqa	%xmm1,416+16(%rsp)
1529	por	%xmm0,%xmm1
# .byte 102,72,15,110,199 encodes movq %rdi,%xmm7 (REX.W MOVD): stash the
# output pointer in an xmm reg so %rdi is free for helper-call destinations.
1530.byte	102,72,15,110,199
1531	movdqa	%xmm2,448(%rsp)
1532	movdqa	%xmm3,448+16(%rsp)
# Fold all of in2's X|Y bytes into xmm3 (in2 == infinity iff encoded as
# all-zero words — presumably; confirm against the C-side convention).
1533	por	%xmm2,%xmm3
1534	por	%xmm4,%xmm5
1535	pxor	%xmm4,%xmm4
1536	por	%xmm1,%xmm3
1537
# Z1^2 -> 32(%rsp).  (%rsi still points at in1; +64 is Z1.)
1538	leaq	64-0(%rsi),%rsi
1539	leaq	32(%rsp),%rdi
1540	call	__ecp_nistz256_sqr_montq
1541
# Finish both infinity masks while the squaring result (r12,r13,r14,r15)
# is moved into mul_mont's input regs (r9..r12):
#   xmm5 = broadcast(Z1 == 0)   — in1-infinity mask
#   xmm4 = broadcast(X2|Y2 == 0) — in2-infinity mask
1542	pcmpeqd	%xmm4,%xmm5
David Benjamin4969cc92016-04-22 15:02:23 -04001543	pshufd	$0xb1,%xmm3,%xmm4
Adam Langleyfad63272015-11-12 12:15:39 -08001544	movq	0(%rbx),%rax
1545
1546	movq	%r12,%r9
1547	por	%xmm3,%xmm4
1548	pshufd	$0,%xmm5,%xmm5
David Benjamin4969cc92016-04-22 15:02:23 -04001549	pshufd	$0x1e,%xmm4,%xmm3
Adam Langleyfad63272015-11-12 12:15:39 -08001550	movq	%r13,%r10
1551	por	%xmm3,%xmm4
1552	pxor	%xmm3,%xmm3
1553	movq	%r14,%r11
1554	pcmpeqd	%xmm3,%xmm4
1555	pshufd	$0,%xmm4,%xmm4
1556
# X2 * Z1^2 -> 0(%rsp)   (U2, per the standard add-affine sequence — TODO
# confirm variable naming against the perlasm source).
1557	leaq	32-0(%rsp),%rsi
1558	movq	%r15,%r12
1559	leaq	0(%rsp),%rdi
1560	call	__ecp_nistz256_mul_montq
1561
# U2 - X1 -> 64(%rsp)   (H)
1562	leaq	320(%rsp),%rbx
1563	leaq	64(%rsp),%rdi
1564	call	__ecp_nistz256_sub_fromq
1565
# Z1 * Z1^2 -> 32(%rsp)  (Z1^3)
1566	movq	384(%rsp),%rax
1567	leaq	384(%rsp),%rbx
1568	movq	0+32(%rsp),%r9
1569	movq	8+32(%rsp),%r10
1570	leaq	0+32(%rsp),%rsi
1571	movq	16+32(%rsp),%r11
1572	movq	24+32(%rsp),%r12
1573	leaq	32(%rsp),%rdi
1574	call	__ecp_nistz256_mul_montq
1575
# Z1 * H -> 288(%rsp)   (Z3, the result Z coordinate before selection)
1576	movq	384(%rsp),%rax
1577	leaq	384(%rsp),%rbx
1578	movq	0+64(%rsp),%r9
1579	movq	8+64(%rsp),%r10
1580	leaq	0+64(%rsp),%rsi
1581	movq	16+64(%rsp),%r11
1582	movq	24+64(%rsp),%r12
1583	leaq	288(%rsp),%rdi
1584	call	__ecp_nistz256_mul_montq
1585
# Y2 * Z1^3 -> 32(%rsp)  (S2)
1586	movq	448(%rsp),%rax
1587	leaq	448(%rsp),%rbx
1588	movq	0+32(%rsp),%r9
1589	movq	8+32(%rsp),%r10
1590	leaq	0+32(%rsp),%rsi
1591	movq	16+32(%rsp),%r11
1592	movq	24+32(%rsp),%r12
1593	leaq	32(%rsp),%rdi
1594	call	__ecp_nistz256_mul_montq
1595
# S2 - Y1 -> 96(%rsp)   (R)
1596	leaq	352(%rsp),%rbx
1597	leaq	96(%rsp),%rdi
1598	call	__ecp_nistz256_sub_fromq
1599
# H^2 -> 128(%rsp)
1600	movq	0+64(%rsp),%rax
1601	movq	8+64(%rsp),%r14
1602	leaq	0+64(%rsp),%rsi
1603	movq	16+64(%rsp),%r15
1604	movq	24+64(%rsp),%r8
1605	leaq	128(%rsp),%rdi
1606	call	__ecp_nistz256_sqr_montq
1607
# R^2 -> 192(%rsp)
1608	movq	0+96(%rsp),%rax
1609	movq	8+96(%rsp),%r14
1610	leaq	0+96(%rsp),%rsi
1611	movq	16+96(%rsp),%r15
1612	movq	24+96(%rsp),%r8
1613	leaq	192(%rsp),%rdi
1614	call	__ecp_nistz256_sqr_montq
1615
# H * H^2 -> 160(%rsp)  (H^3)
1616	movq	128(%rsp),%rax
1617	leaq	128(%rsp),%rbx
1618	movq	0+64(%rsp),%r9
1619	movq	8+64(%rsp),%r10
1620	leaq	0+64(%rsp),%rsi
1621	movq	16+64(%rsp),%r11
1622	movq	24+64(%rsp),%r12
1623	leaq	160(%rsp),%rdi
1624	call	__ecp_nistz256_mul_montq
1625
# X1 * H^2 -> 0(%rsp)   (U1*H^2), result limbs also live in r12,r13,r8,r9.
1626	movq	320(%rsp),%rax
1627	leaq	320(%rsp),%rbx
1628	movq	0+128(%rsp),%r9
1629	movq	8+128(%rsp),%r10
1630	leaq	0+128(%rsp),%rsi
1631	movq	16+128(%rsp),%r11
1632	movq	24+128(%rsp),%r12
1633	leaq	0(%rsp),%rdi
1634	call	__ecp_nistz256_mul_montq
1635
1636
1637
1638
# Double (r12,r13,r8,r9) mod p inline: shift left with carry into r11,
# then conditionally subtract the modulus.  r14/r15 presumably hold the
# middle/top modulus words loaded by the helper above — TODO confirm.
# The add/sub/cmovc chain is carry-flag-driven and constant-time; note
# the loads of 192(%rsp) (R^2) into rax,rbp,rcx,r10 are interleaved purely
# for scheduling and feed the upcoming __ecp_nistz256_subq.
Steven Valdez909b19f2016-11-21 15:35:44 -05001639	xorq	%r11,%r11
Adam Langleyfad63272015-11-12 12:15:39 -08001640	addq	%r12,%r12
1641	leaq	192(%rsp),%rsi
1642	adcq	%r13,%r13
1643	movq	%r12,%rax
1644	adcq	%r8,%r8
1645	adcq	%r9,%r9
1646	movq	%r13,%rbp
Steven Valdez909b19f2016-11-21 15:35:44 -05001647	adcq	$0,%r11
Adam Langleyfad63272015-11-12 12:15:39 -08001648
1649	subq	$-1,%r12
1650	movq	%r8,%rcx
1651	sbbq	%r14,%r13
1652	sbbq	$0,%r8
1653	movq	%r9,%r10
1654	sbbq	%r15,%r9
Steven Valdez909b19f2016-11-21 15:35:44 -05001655	sbbq	$0,%r11
Adam Langleyfad63272015-11-12 12:15:39 -08001656
# Carry set => the subtraction underflowed => keep the pre-subtraction
# value (constant-time select via cmovc, not a branch).
Steven Valdez909b19f2016-11-21 15:35:44 -05001657	cmovcq	%rax,%r12
Adam Langleyfad63272015-11-12 12:15:39 -08001658	movq	0(%rsi),%rax
Steven Valdez909b19f2016-11-21 15:35:44 -05001659	cmovcq	%rbp,%r13
Adam Langleyfad63272015-11-12 12:15:39 -08001660	movq	8(%rsi),%rbp
Steven Valdez909b19f2016-11-21 15:35:44 -05001661	cmovcq	%rcx,%r8
Adam Langleyfad63272015-11-12 12:15:39 -08001662	movq	16(%rsi),%rcx
Steven Valdez909b19f2016-11-21 15:35:44 -05001663	cmovcq	%r10,%r9
Adam Langleyfad63272015-11-12 12:15:39 -08001664	movq	24(%rsi),%r10
1665
# R^2 - 2*U1*H^2  (modular subtraction; operands set up above)
1666	call	__ecp_nistz256_subq
1667
# ... - H^3 -> 224(%rsp)  (X3 before selection)
1668	leaq	160(%rsp),%rbx
1669	leaq	224(%rsp),%rdi
1670	call	__ecp_nistz256_sub_fromq
1671
# U1*H^2 - X3 (operands from 0(%rsp)); result kept in r12,r13,r8,r9 and
# stored to 64(%rsp) via %rdi below.
1672	movq	0+0(%rsp),%rax
1673	movq	0+8(%rsp),%rbp
1674	movq	0+16(%rsp),%rcx
1675	movq	0+24(%rsp),%r10
1676	leaq	64(%rsp),%rdi
1677
1678	call	__ecp_nistz256_subq
1679
1680	movq	%r12,0(%rdi)
1681	movq	%r13,8(%rdi)
1682	movq	%r8,16(%rdi)
1683	movq	%r9,24(%rdi)
# Y1 * H^3 -> 32(%rsp)
1684	movq	352(%rsp),%rax
1685	leaq	352(%rsp),%rbx
1686	movq	0+160(%rsp),%r9
1687	movq	8+160(%rsp),%r10
1688	leaq	0+160(%rsp),%rsi
1689	movq	16+160(%rsp),%r11
1690	movq	24+160(%rsp),%r12
1691	leaq	32(%rsp),%rdi
1692	call	__ecp_nistz256_mul_montq
1693
# R * (U1*H^2 - X3) -> 64(%rsp)
1694	movq	96(%rsp),%rax
1695	leaq	96(%rsp),%rbx
1696	movq	0+64(%rsp),%r9
1697	movq	8+64(%rsp),%r10
1698	leaq	0+64(%rsp),%rsi
1699	movq	16+64(%rsp),%r11
1700	movq	24+64(%rsp),%r12
1701	leaq	64(%rsp),%rdi
1702	call	__ecp_nistz256_mul_montq
1703
# ... - Y1*H^3 -> 256(%rsp)  (Y3 before selection)
1704	leaq	32(%rsp),%rbx
1705	leaq	256(%rsp),%rdi
1706	call	__ecp_nistz256_sub_fromq
1707
# .byte 102,72,15,126,199 encodes movq %xmm7,%rdi: recover the saved
# output pointer for the final stores.
1708.byte	102,72,15,126,199
1709
# Final constant-time selection, per 16-byte half of each coordinate:
#   tmp = in1_infty ? in2_value : computed_value   (xmm5 mask)
#   out = in2_infty ? in1_value : tmp              (xmm4 mask)
# Z coordinate: computed Z3 at 288(%rsp); if in1 is infinity the result Z
# is ONE_mont (in2 made Jacobian); if in2 is infinity, keep Z1 (384(%rsp)).
1710	movdqa	%xmm5,%xmm0
1711	movdqa	%xmm5,%xmm1
1712	pandn	288(%rsp),%xmm0
1713	movdqa	%xmm5,%xmm2
1714	pandn	288+16(%rsp),%xmm1
1715	movdqa	%xmm5,%xmm3
1716	pand	.LONE_mont(%rip),%xmm2
1717	pand	.LONE_mont+16(%rip),%xmm3
1718	por	%xmm0,%xmm2
1719	por	%xmm1,%xmm3
1720
1721	movdqa	%xmm4,%xmm0
1722	movdqa	%xmm4,%xmm1
1723	pandn	%xmm2,%xmm0
1724	movdqa	%xmm4,%xmm2
1725	pandn	%xmm3,%xmm1
1726	movdqa	%xmm4,%xmm3
1727	pand	384(%rsp),%xmm2
1728	pand	384+16(%rsp),%xmm3
1729	por	%xmm0,%xmm2
1730	por	%xmm1,%xmm3
1731	movdqu	%xmm2,64(%rdi)
1732	movdqu	%xmm3,80(%rdi)
1733
# X coordinate: computed X3 at 224(%rsp), in2.X at 416(%rsp),
# in1.X at 320(%rsp).
1734	movdqa	%xmm5,%xmm0
1735	movdqa	%xmm5,%xmm1
1736	pandn	224(%rsp),%xmm0
1737	movdqa	%xmm5,%xmm2
1738	pandn	224+16(%rsp),%xmm1
1739	movdqa	%xmm5,%xmm3
1740	pand	416(%rsp),%xmm2
1741	pand	416+16(%rsp),%xmm3
1742	por	%xmm0,%xmm2
1743	por	%xmm1,%xmm3
1744
1745	movdqa	%xmm4,%xmm0
1746	movdqa	%xmm4,%xmm1
1747	pandn	%xmm2,%xmm0
1748	movdqa	%xmm4,%xmm2
1749	pandn	%xmm3,%xmm1
1750	movdqa	%xmm4,%xmm3
1751	pand	320(%rsp),%xmm2
1752	pand	320+16(%rsp),%xmm3
1753	por	%xmm0,%xmm2
1754	por	%xmm1,%xmm3
1755	movdqu	%xmm2,0(%rdi)
1756	movdqu	%xmm3,16(%rdi)
1757
# Y coordinate: computed Y3 at 256(%rsp), in2.Y at 448(%rsp),
# in1.Y at 352(%rsp).
1758	movdqa	%xmm5,%xmm0
1759	movdqa	%xmm5,%xmm1
1760	pandn	256(%rsp),%xmm0
1761	movdqa	%xmm5,%xmm2
1762	pandn	256+16(%rsp),%xmm1
1763	movdqa	%xmm5,%xmm3
1764	pand	448(%rsp),%xmm2
1765	pand	448+16(%rsp),%xmm3
1766	por	%xmm0,%xmm2
1767	por	%xmm1,%xmm3
1768
1769	movdqa	%xmm4,%xmm0
1770	movdqa	%xmm4,%xmm1
1771	pandn	%xmm2,%xmm0
1772	movdqa	%xmm4,%xmm2
1773	pandn	%xmm3,%xmm1
1774	movdqa	%xmm4,%xmm3
1775	pand	352(%rsp),%xmm2
1776	pand	352+16(%rsp),%xmm3
1777	por	%xmm0,%xmm2
1778	por	%xmm1,%xmm3
1779	movdqu	%xmm2,32(%rdi)
1780	movdqu	%xmm3,48(%rdi)
1781
# Epilogue: release the frame, restore callee-saved regs in reverse
# push order.  .byte 0xf3,0xc3 is "rep ret" (AMD branch-predictor idiom).
1782	addq	$480+8,%rsp
1783	popq	%r15
1784	popq	%r14
1785	popq	%r13
1786	popq	%r12
1787	popq	%rbx
1788	popq	%rbp
1789	.byte	0xf3,0xc3
1790.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1791#endif