/* p256-x86_64 (ecp_nistz256) assembly — Mach-O/x86_64 flavor.
   NOTE(review): this file was recovered from a blame-annotated viewer dump
   (blob 2c58d48c1463c675ebca7b54e9620cc05f172398); line-ownership metadata
   has been stripped. */
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

/* Constant pool for NIST P-256 (ecp_nistz256).
   L$poly is the prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, stored as four
   little-endian 64-bit limbs (least-significant limb first). */

.p2align 6
L$poly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

/* Per-lane dword constants used by the SSE table-select loops below. */
L$One:
.long 1,1,1,1,1,1,1,1
L$Two:
.long 2,2,2,2,2,2,2,2
L$Three:
.long 3,3,3,3,3,3,3,3
/* 1 in the Montgomery domain (R = 2^256): value equals 2^256 mod p. */
L$ONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
/*-----------------------------------------------------------------------
 * void ecp_nistz256_neg(uint64_t res[4], const uint64_t a[4])
 * SysV AMD64: rdi = res, rsi = a.
 * Computes res = (0 - a), then adds p back and keeps the un-corrected
 * value only if the subtraction produced no borrow (r13 is the borrow
 * flag) — i.e. res = -a mod p, branch-free / constant time.
 *---------------------------------------------------------------------*/
.globl _ecp_nistz256_neg
.private_extern _ecp_nistz256_neg

.p2align 5
_ecp_nistz256_neg:
	pushq	%r12
	pushq	%r13

	xorq	%r8,%r8			/* acc = 0 */
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r13,%r13		/* borrow accumulator */

	subq	0(%rsi),%r8		/* acc = 0 - a */
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r8,%rax		/* snapshot pre-correction limbs */
	sbbq	24(%rsi),%r11
	leaq	L$poly(%rip),%rsi
	movq	%r9,%rdx
	sbbq	$0,%r13			/* r13 = -borrow (0 or all-ones) */

	addq	0(%rsi),%r8		/* acc += p */
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13		/* no borrow => keep un-corrected value */

	cmovzq	%rax,%r8
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * void ecp_nistz256_mul_mont(uint64_t res[4], const uint64_t a[4],
 *                            const uint64_t b[4])
 * SysV AMD64: rdi = res, rsi = a, rdx = b.
 * Thin wrapper: saves callee-saved registers, moves b into rbx (rdx is
 * clobbered by mulq inside the worker), pre-loads b[0] and a[0..3] into
 * the registers __ecp_nistz256_mul_montq expects, then tail-calls it.
 *---------------------------------------------------------------------*/
.globl _ecp_nistz256_mul_mont
.private_extern _ecp_nistz256_mul_mont

.p2align 5
_ecp_nistz256_mul_mont:
L$mul_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rdx,%rbx		/* rbx = b (worker reads b[1..3] from it) */
	movq	0(%rdx),%rax		/* rax = b[0] */
	movq	0(%rsi),%r9		/* r9..r12 = a[0..3] */
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12

	call	__ecp_nistz256_mul_montq
L$mul_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * __ecp_nistz256_mul_montq — Montgomery multiplication mod p (P-256).
 * In:  rax = b[0], rbx -> b, rsi -> a, r9..r12 = a[0..3], rdi -> res.
 * Out: res = a*b*2^-256 mod p, also left in r12,r13,r8,r9 order varies
 *      per round; result limbs written through rdi.
 * Clobbers: rax,rbp,rcx,rdx,rbx(read),r8-r15, flags.
 *
 * Structure: four rounds of (multiply by next b-limb, accumulate) each
 * interleaved with one Montgomery reduction step.  Reduction exploits the
 * special form of p: folding limb t0 requires only t0<<32, t0>>32 and
 * t0 * poly[3] (r15 = L$poly+24); poly[1] (r14 = L$poly+8) is used in the
 * final conditional subtraction.  All flow is branch-free.
 *---------------------------------------------------------------------*/
.p2align 5
__ecp_nistz256_mul_montq:

	/* ---- round 0: acc = a[0..3] * b[0] ---- */
	movq	%rax,%rbp		/* rbp = b[0], kept across mulqs */
	mulq	%r9
	movq	L$poly+8(%rip),%r14
	movq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%r9

	mulq	%r10
	movq	L$poly+24(%rip),%r15
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%r11
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r12
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	xorq	%r13,%r13
	movq	%rdx,%r12

	/* reduction step 0: fold acc[0] (r8) into the accumulator */
	movq	%r8,%rbp
	shlq	$32,%r8
	mulq	%r15			/* rdx:rax = acc[0] * poly[3] */
	shrq	$32,%rbp
	addq	%r8,%r9
	adcq	%rbp,%r10
	adcq	%rax,%r11
	movq	8(%rbx),%rax		/* rax = b[1] for next round */
	adcq	%rdx,%r12
	adcq	$0,%r13
	xorq	%r8,%r8

	/* ---- round 1: acc += a[0..3] * b[1] ---- */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8

	/* reduction step 1: fold acc[0] (now r9) */
	movq	%r9,%rbp
	shlq	$32,%r9
	mulq	%r15
	shrq	$32,%rbp
	addq	%r9,%r10
	adcq	%rbp,%r11
	adcq	%rax,%r12
	movq	16(%rbx),%rax		/* rax = b[2] */
	adcq	%rdx,%r13
	adcq	$0,%r8
	xorq	%r9,%r9

	/* ---- round 2: acc += a[0..3] * b[2] ---- */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9

	/* reduction step 2: fold acc[0] (now r10) */
	movq	%r10,%rbp
	shlq	$32,%r10
	mulq	%r15
	shrq	$32,%rbp
	addq	%r10,%r11
	adcq	%rbp,%r12
	adcq	%rax,%r13
	movq	24(%rbx),%rax		/* rax = b[3] */
	adcq	%rdx,%r8
	adcq	$0,%r9
	xorq	%r10,%r10

	/* ---- round 3: acc += a[0..3] * b[3] ---- */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10

	/* reduction step 3: fold acc[0] (now r11); snapshot limbs for cmov */
	movq	%r11,%rbp
	shlq	$32,%r11
	mulq	%r15
	shrq	$32,%rbp
	addq	%r11,%r12
	adcq	%rbp,%r13
	movq	%r12,%rcx
	adcq	%rax,%r8
	adcq	%rdx,%r9
	movq	%r13,%rbp
	adcq	$0,%r10

	/* final conditional subtraction of p (branch-free): compute acc - p,
	   keep it unless the subtraction borrowed (CF set). */
	subq	$-1,%r12		/* - poly[0] == -0xff..ff == +(-1) */
	movq	%r8,%rbx
	sbbq	%r14,%r13		/* - poly[1] */
	sbbq	$0,%r8			/* - poly[2] (zero) */
	movq	%r9,%rdx
	sbbq	%r15,%r9		/* - poly[3] */
	sbbq	$0,%r10			/* top carry word */

	cmovcq	%rcx,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rbx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rdx,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * void ecp_nistz256_sqr_mont(uint64_t res[4], const uint64_t a[4])
 * SysV AMD64: rdi = res, rsi = a.
 * Wrapper: saves callee-saved registers, loads a[0..3] into the registers
 * __ecp_nistz256_sqr_montq expects (rax,r14,r15,r8), then calls it.
 *---------------------------------------------------------------------*/
.globl _ecp_nistz256_sqr_mont
.private_extern _ecp_nistz256_sqr_mont

.p2align 5
_ecp_nistz256_sqr_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	0(%rsi),%rax		/* a[0] */
	movq	8(%rsi),%r14		/* a[1] */
	movq	16(%rsi),%r15		/* a[2] */
	movq	24(%rsi),%r8		/* a[3] */

	call	__ecp_nistz256_sqr_montq
L$sqr_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * __ecp_nistz256_sqr_montq — Montgomery squaring mod p (P-256).
 * In:  rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3], rsi -> a, rdi -> res.
 * Out: res = a^2 * 2^-256 mod p, written through rdi; result limbs also
 *      left in r12..r15.  NOTE: on return rsi = poly[1] and rbp = poly[3]
 *      (callers such as point_double rely on this).
 * Plan: compute the six cross products a[i]*a[j] (i<j), double them, add
 * the four squares a[i]^2, then run four shift-based reduction steps and
 * one branch-free conditional subtraction of p.
 *---------------------------------------------------------------------*/
.p2align 5
__ecp_nistz256_sqr_montq:
	/* cross products: (r9..r14) = sum of a[i]*a[j], i<j */
	movq	%rax,%r13		/* r13 = a[0] */
	mulq	%r14			/* a[0]*a[1] */
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10

	mulq	%r13			/* a[2]*a[0] */
	addq	%rax,%r10
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r13			/* a[3]*a[0] */
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12

	mulq	%r14			/* a[2]*a[1] */
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%r14			/* a[3]*a[1] */
	addq	%rax,%r12
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r15			/* a[3]*a[2] */
	xorq	%r15,%r15
	addq	%rax,%r13
	movq	0(%rsi),%rax		/* reload a[0] for the squares */
	movq	%rdx,%r14
	adcq	$0,%r14

	/* double the cross products */
	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15

	/* add the squares a[i]^2 into even/odd slots */
	mulq	%rax
	movq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r9
	adcq	%rax,%r10
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r11
	adcq	%rax,%r12
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r13
	adcq	%rax,%r14
	movq	%r8,%rax
	adcq	%rdx,%r15

	movq	L$poly+8(%rip),%rsi	/* rsi = poly[1] (left live for caller) */
	movq	L$poly+24(%rip),%rbp	/* rbp = poly[3] (left live for caller) */

	/* reduction step 0: fold r8 */
	movq	%r8,%rcx
	shlq	$32,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	/* reduction step 1: fold r9 */
	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	/* reduction step 2: fold r10 */
	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	/* reduction step 3: fold r11 */
	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	adcq	%rax,%r10
	adcq	$0,%rdx
	xorq	%r11,%r11

	/* combine reduced low half with the high half of the square */
	addq	%r8,%r12
	adcq	%r9,%r13
	movq	%r12,%r8
	adcq	%r10,%r14
	adcq	%rdx,%r15
	movq	%r13,%r9
	adcq	$0,%r11

	/* branch-free conditional subtraction of p */
	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13		/* rsi = poly[1] */
	sbbq	$0,%r14
	movq	%r15,%rcx
	sbbq	%rbp,%r15		/* rbp = poly[3] */
	sbbq	$0,%r11

	cmovcq	%r8,%r12
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%rcx,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * void ecp_nistz256_select_w5(res, table, index)
 * SysV AMD64: rdi = res (96 bytes), rsi = table, edx = index.
 * Constant-time table lookup: scans all 16 entries of 96 bytes each and
 * masks in the one whose running counter equals the broadcast index.
 * Every entry is always read, so the access pattern is independent of
 * the (secret) index.
 *---------------------------------------------------------------------*/
.globl _ecp_nistz256_select_w5
.private_extern _ecp_nistz256_select_w5

.p2align 5
_ecp_nistz256_select_w5:
	movdqa	L$One(%rip),%xmm0	/* per-lane increment */
	movd	%edx,%xmm1		/* index */

	pxor	%xmm2,%xmm2		/* xmm2..xmm7 accumulate the result */
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7

	movdqa	%xmm0,%xmm8		/* running counter, starts at 1 */
	pshufd	$0,%xmm1,%xmm1		/* broadcast index to all lanes */

	movq	$16,%rax		/* 16 table entries */
L$select_loop_sse_w5:

	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	pcmpeqd	%xmm1,%xmm15		/* all-ones mask iff counter == index */

	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	movdqa	64(%rsi),%xmm13
	movdqa	80(%rsi),%xmm14
	leaq	96(%rsi),%rsi

	pand	%xmm15,%xmm9		/* keep entry only under the mask */
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	pand	%xmm15,%xmm13
	por	%xmm12,%xmm5
	pand	%xmm15,%xmm14
	por	%xmm13,%xmm6
	por	%xmm14,%xmm7

	decq	%rax
	jnz	L$select_loop_sse_w5

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm7,80(%rdi)
	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * void ecp_nistz256_select_w7(res, table, index)
 * SysV AMD64: rdi = res (64 bytes), rsi = table, edx = index.
 * Constant-time lookup over 64 entries of 64 bytes each; same masking
 * technique as select_w5 (every entry read regardless of index).
 *---------------------------------------------------------------------*/
.globl _ecp_nistz256_select_w7
.private_extern _ecp_nistz256_select_w7

.p2align 5
_ecp_nistz256_select_w7:
	movdqa	L$One(%rip),%xmm8	/* counter, starts at 1 */
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		/* result accumulators */
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	movdqa	%xmm8,%xmm0		/* per-lane increment = 1 */
	pshufd	$0,%xmm1,%xmm1		/* broadcast index */
	movq	$64,%rax		/* 64 table entries */

L$select_loop_sse_w7:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	pcmpeqd	%xmm1,%xmm15		/* mask iff counter == index */
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	leaq	64(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	prefetcht0	255(%rsi)	/* prefetch ahead in the table */
	por	%xmm12,%xmm5

	decq	%rax
	jnz	L$select_loop_sse_w7

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	.byte	0xf3,0xc3		/* rep ret */
/* AVX2 variant is not provided in this build: the body is a deliberate
   trap (0x0f,0x0b = ud2) so any caller reaching it faults immediately. */
.globl _ecp_nistz256_avx2_select_w7
.private_extern _ecp_nistz256_avx2_select_w7

.p2align 5
_ecp_nistz256_avx2_select_w7:
.byte	0x0f,0x0b			/* ud2 */
	.byte	0xf3,0xc3		/* rep ret (unreachable) */
/*-----------------------------------------------------------------------
 * __ecp_nistz256_add_toq — modular add: res = (acc + [rbx]) mod p.
 * In:  r12,r13,r8,r9 = acc limbs; rbx -> addend; rdi -> res;
 *      r14 = poly[1], r15 = poly[3] (caller pre-loads them).
 * r11 collects the carry; branch-free conditional subtraction of p.
 *---------------------------------------------------------------------*/
.p2align 5
__ecp_nistz256_add_toq:
	xorq	%r11,%r11
	addq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax		/* snapshot un-reduced limbs */
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	adcq	$0,%r11			/* r11 = carry out of the add */

	subq	$-1,%r12		/* tentatively subtract p */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11			/* borrow here => sum was < p */

	cmovcq	%rax,%r12		/* keep un-reduced value on borrow */
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * __ecp_nistz256_sub_fromq — modular subtract: res = (acc - [rbx]) mod p.
 * In:  r12,r13,r8,r9 = acc; rbx -> subtrahend; rdi -> res;
 *      r14 = poly[1], r15 = poly[3].
 * If the raw subtraction borrows (r11 becomes all-ones via sbb r11,r11),
 * p is added back; selection is branch-free via cmovz on r11.
 *---------------------------------------------------------------------*/
.p2align 5
__ecp_nistz256_sub_fromq:
	subq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax		/* snapshot raw difference */
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		/* r11 = 0 or ~0 (borrow mask) */

	addq	$-1,%r12		/* tentatively add p back */
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9
	testq	%r11,%r11		/* no borrow => keep raw difference */

	cmovzq	%rax,%r12
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * __ecp_nistz256_subq — register-only modular subtract.
 * In:  rax,rbp,rcx,r10 = minuend; r12,r13,r8,r9 = subtrahend;
 *      r14 = poly[1], r15 = poly[3].
 * Out: r12,r13,r8,r9 = (minuend - subtrahend) mod p.  NOTE: result is
 *      left in registers only — nothing is stored (callers store it).
 *---------------------------------------------------------------------*/
.p2align 5
__ecp_nistz256_subq:
	subq	%r12,%rax
	sbbq	%r13,%rbp
	movq	%rax,%r12		/* raw difference -> result regs */
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	%r11,%r11		/* borrow mask */

	addq	$-1,%rax		/* tentatively add p back */
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10
	testq	%r11,%r11		/* borrowed => take the corrected value */

	cmovnzq	%rax,%r12
	cmovnzq	%rbp,%r13
	cmovnzq	%rcx,%r8
	cmovnzq	%r10,%r9

	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * __ecp_nistz256_mul_by_2q — res = 2*acc mod p.
 * In:  r12,r13,r8,r9 = acc; rdi -> res; r14 = poly[1], r15 = poly[3].
 * Doubling by self-add, then the same branch-free conditional
 * subtraction of p used by __ecp_nistz256_add_toq.
 *---------------------------------------------------------------------*/
.p2align 5
__ecp_nistz256_mul_by_2q:
	xorq	%r11,%r11
	addq	%r12,%r12		/* acc <<= 1 */
	adcq	%r13,%r13
	movq	%r12,%rax		/* snapshot un-reduced limbs */
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11			/* carry out */

	subq	$-1,%r12		/* tentatively subtract p */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12		/* borrow => value was already < p */
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * void ecp_nistz256_point_double(P256_POINT *out, const P256_POINT *in)
 * SysV AMD64: rdi = out, rsi = in (Jacobian X|Y|Z, 32 bytes each).
 * Doubles a point using the field helpers above.  168 bytes of stack
 * hold temporaries at fixed offsets (0, 32, 64, 96, 128).
 * L$point_double_shortcutq is jumped to from point_add when the two
 * inputs are equal (after it rewinds its larger frame by $416).
 *---------------------------------------------------------------------*/
.globl _ecp_nistz256_point_double
.private_extern _ecp_nistz256_point_double

.p2align 5
_ecp_nistz256_point_double:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$160+8,%rsp

L$point_double_shortcutq:
	movdqu	0(%rsi),%xmm0		/* in->X */
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12		/* in->Y into r12,r13,r8,r9 */
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	L$poly+8(%rip),%r14	/* poly limbs for the helpers */
	movq	L$poly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)		/* stash X at rsp+96 */
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10		/* &out->Y, &out->Z */
	leaq	64(%rdi),%r11
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — stash out ptr */
.byte	102,73,15,110,202		/* movq %r10,%xmm1 */
.byte	102,73,15,110,211		/* movq %r11,%xmm2 */

	leaq	0(%rsp),%rdi		/* tmp0 = 2*Y */
	call	__ecp_nistz256_mul_by_2q

	movq	64+0(%rsi),%rax		/* load in->Z */
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-0(%rsi),%rsi
	leaq	64(%rsp),%rdi		/* tmp64 = Z^2 */
	call	__ecp_nistz256_sqr_montq

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi		/* tmp0 = (2Y)^2 */
	call	__ecp_nistz256_sqr_montq

	movq	32(%rbx),%rax		/* Y * Z ... */
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-0(%rbx),%rsi
	leaq	32(%rbx),%rbx
.byte	102,72,15,126,215		/* movq %xmm2,%rdi — rdi = &out->Z */
	call	__ecp_nistz256_mul_montq
	call	__ecp_nistz256_mul_by_2q	/* out->Z = 2*Y*Z */

	movq	96+0(%rsp),%r12		/* X + Z^2 -> rsp+32 */
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96+0(%rsp),%r12		/* X - Z^2 -> rsp+64 */
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
.byte	102,72,15,126,207		/* movq %xmm1,%rdi — rdi = &out->Y */
	call	__ecp_nistz256_sqr_montq	/* ((2Y)^2)^2; result in r12..r15 */
	/* Halve r12..r15 mod p.  sqr_montq left rsi=poly[1], rbp=poly[3].
	   Compute value+p with carry into r9, keep the original if the low
	   bit (rax&1) is clear, then shift the 5-limb value right by one. */
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12		/* += poly[0] */
	movq	%r13,%r10
	adcq	%rsi,%r13		/* += poly[1] */
	movq	%r14,%rcx
	adcq	$0,%r14			/* += poly[2] (zero) */
	movq	%r15,%r8
	adcq	%rbp,%r15		/* += poly[3] */
	adcq	$0,%r9			/* carry limb */
	xorq	%rsi,%rsi		/* zero for the cmov below */
	testq	$1,%rax			/* even => keep un-augmented value */

	cmovzq	%rax,%r12
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax		/* shift right by one across limbs */
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	movq	64(%rsp),%rax		/* M = (X-Z^2)*(X+Z^2) -> rsp+32 */
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		/* 2*M -> rsp+128 */
	call	__ecp_nistz256_mul_by_2q

	leaq	32(%rsp),%rbx		/* 3*M -> rsp+32 */
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96(%rsp),%rax		/* S = X*(2Y)^2 -> rsp+0 */
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		/* 2*S -> rsp+128 */
	call	__ecp_nistz256_mul_by_2q

	movq	0+32(%rsp),%rax
	movq	8+32(%rsp),%r14
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — rdi = out */
	call	__ecp_nistz256_sqr_montq	/* M^2 */

	leaq	128(%rsp),%rbx		/* out->X = M^2 - 2*S */
	movq	%r14,%r8		/* move result into sub_fromq's regs; */
	movq	%r15,%r9
	movq	%rsi,%r14		/* restore poly[1]/poly[3] from rsi/rbp */
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		/* S - out->X (result in regs) */
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subq

	movq	32(%rsp),%rax		/* (S - X) * M ... */
	leaq	32(%rsp),%rbx
	movq	%r12,%r14		/* xorl sets ZF=1, so the cmovz below */
	xorl	%ecx,%ecx		/* are unconditional register copies */
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11		/* always taken (ZF=1) */
	movq	%r8,0+16(%rsp)
	leaq	0-0(%rsp),%rsi
	cmovzq	%r9,%r12		/* always taken (ZF=1) */
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

.byte	102,72,15,126,203		/* movq %xmm1,%rbx — &out->Y */
.byte	102,72,15,126,207		/* movq %xmm1,%rdi */
	call	__ecp_nistz256_sub_fromq	/* out->Y = (S-X)*M - Y^4/2 term */

	addq	$160+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
/*-----------------------------------------------------------------------
 * void ecp_nistz256_point_add(P256_POINT *out,
 *                             const P256_POINT *a, const P256_POINT *b)
 * SysV AMD64: rdi = out, rsi = a, rdx = b (Jacobian X|Y|Z).
 * General point addition.  584-byte frame caches both inputs at fixed
 * offsets (a at 384/416/448, b at 480/512/544) and holds intermediates.
 * xmm5/xmm4 are built into all-zero/all-ones masks recording whether
 * a resp. b has Z == 0 (point at infinity); they drive the branch-free
 * result selection at the end.  If U1==U2 and S1==S2 (same point), it
 * rewinds the frame by $416 and jumps into point_double's shortcut.
 *---------------------------------------------------------------------*/
.globl _ecp_nistz256_point_add
.private_extern _ecp_nistz256_point_add

.p2align 5
_ecp_nistz256_point_add:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$576+8,%rsp

	movdqu	0(%rsi),%xmm0		/* cache point a */
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	%rsi,%rbx		/* rbx = a, rsi = b from here on */
	movq	%rdx,%rsi
	movdqa	%xmm0,384(%rsp)		/* a->X */
	movdqa	%xmm1,384+16(%rsp)
	movdqa	%xmm2,416(%rsp)		/* a->Y */
	movdqa	%xmm3,416+16(%rsp)
	movdqa	%xmm4,448(%rsp)		/* a->Z */
	movdqa	%xmm5,448+16(%rsp)
	por	%xmm4,%xmm5		/* start building "a->Z == 0?" mask */

	movdqu	0(%rsi),%xmm0		/* cache point b */
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rsi),%xmm3
	movq	64+0(%rsi),%rax		/* b->Z limbs for the squaring */
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,480(%rsp)		/* b->X */
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,480+16(%rsp)
	movdqu	64(%rsi),%xmm0		/* b->Z for the "b == inf?" mask */
	movdqu	80(%rsi),%xmm1
	movdqa	%xmm2,512(%rsp)		/* b->Y */
	movdqa	%xmm3,512+16(%rsp)
	por	%xmm4,%xmm5		/* xmm5 = OR of all a->Z words */
	pxor	%xmm4,%xmm4
	por	%xmm0,%xmm1		/* fold b->Z words */
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — stash out ptr */

	leaq	64-0(%rsi),%rsi
	movq	%rax,544+0(%rsp)	/* b->Z cached at 544 */
	movq	%r14,544+8(%rsp)
	movq	%r15,544+16(%rsp)
	movq	%r8,544+24(%rsp)
	leaq	96(%rsp),%rdi		/* Z2^2 -> rsp+96 */
	call	__ecp_nistz256_sqr_montq

	pcmpeqd	%xmm4,%xmm5		/* xmm5 = all-ones iff a->Z == 0 */
	pshufd	$0xb1,%xmm1,%xmm4
	por	%xmm1,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	pcmpeqd	%xmm3,%xmm4		/* xmm4 = all-ones iff b->Z == 0 */
	pshufd	$0,%xmm4,%xmm4
	movq	64+0(%rbx),%rax		/* a->Z */
	movq	64+8(%rbx),%r14
	movq	64+16(%rbx),%r15
	movq	64+24(%rbx),%r8
.byte	102,72,15,110,203		/* movq %rbx,%xmm1 — stash a ptr */

	leaq	64-0(%rbx),%rsi
	leaq	32(%rsp),%rdi		/* Z1^2 -> rsp+32 */
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		/* S1 = Z2^3 ... -> rsp+224 */
	leaq	544(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	448(%rsp),%rax		/* S2 = Z1^3 ... -> rsp+256 */
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	416(%rsp),%rax		/* S1 *= a->Y */
	leaq	416(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	512(%rsp),%rax		/* S2 *= b->Y */
	leaq	512(%rsp),%rbx
	movq	0+256(%rsp),%r9
	movq	8+256(%rsp),%r10
	leaq	0+256(%rsp),%rsi
	movq	16+256(%rsp),%r11
	movq	24+256(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	224(%rsp),%rbx		/* R = S2 - S1 -> rsp+64 */
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		/* r12 = OR of R limbs (R == 0?) */
	movdqa	%xmm4,%xmm2
	orq	%r8,%r12
	orq	%r9,%r12
	por	%xmm5,%xmm2		/* xmm2 = "either input infinite" */
.byte	102,73,15,110,220		/* movq %r12,%xmm3 — save R==0 test */

	movq	384(%rsp),%rax		/* U1 = a->X * Z2^2 -> rsp+160 */
	leaq	384(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	480(%rsp),%rax		/* U2 = b->X * Z1^2 -> rsp+192 */
	leaq	480(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	160(%rsp),%rbx		/* H = U2 - U1 -> rsp+0 */
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		/* H == 0 ? */
	orq	%r8,%r12
	orq	%r9,%r12

.byte	0x3e				/* DS prefix (branch hint padding) */
	jnz	L$add_proceedq		/* H != 0: distinct x, normal add */
.byte	102,73,15,126,208		/* movq %xmm2,%r8 — infinity mask */
.byte	102,73,15,126,217		/* movq %xmm3,%r9 — R==0 test */
	testq	%r8,%r8
	jnz	L$add_proceedq		/* an input was infinite */
	testq	%r9,%r9
	jz	L$add_doubleq		/* H==0 and R==0: same point */

	/* H == 0, R != 0: a == -b, result is the point at infinity */
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — out ptr */
	pxor	%xmm0,%xmm0
	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm0,16(%rdi)
	movdqu	%xmm0,32(%rdi)
	movdqu	%xmm0,48(%rdi)
	movdqu	%xmm0,64(%rdi)
	movdqu	%xmm0,80(%rdi)
	jmp	L$add_doneq

.p2align 5
L$add_doubleq:
.byte	102,72,15,126,206		/* movq %xmm1,%rsi — a ptr */
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — out ptr */
	addq	$416,%rsp		/* shrink frame to point_double's size */
	jmp	L$point_double_shortcutq

.p2align 5
L$add_proceedq:
	movq	0+64(%rsp),%rax		/* R^2 -> rsp+96 */
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	448(%rsp),%rax		/* a->Z * H -> rsp+352 */
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0+0(%rsp),%rax		/* H^2 -> rsp+32 */
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		/* out->Z = Z1*H*Z2 -> rsp+352 */
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	0+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0(%rsp),%rax		/* H^3 = H * H^2 -> rsp+128 */
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	160(%rsp),%rax		/* U1*H^2 -> rsp+192 */
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* inline mul_by_2: 2*U1*H^2, result left in r12,r13,r8,r9 */
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	96(%rsp),%rsi		/* rsi = &R^2 for the subq below */
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		/* conditional subtraction of p */
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax		/* load R^2 as subq's minuend */
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq	/* R^2 - 2*U1*H^2 (in regs) */

	leaq	128(%rsp),%rbx		/* out->X = ... - H^3 -> rsp+288 */
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	192+0(%rsp),%rax	/* U1*H^2 - out->X -> rsp+320 */
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)		/* subq leaves result in regs; store */
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	128(%rsp),%rax		/* S1*H^3 -> rsp+256 */
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	320(%rsp),%rax		/* R*(U1*H^2 - X) -> rsp+320 */
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	256(%rsp),%rbx		/* out->Y = ... - S1*H^3 -> rsp+320 */
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199		/* movq %xmm0,%rdi — out ptr */

	/* Branch-free result selection, per 32-byte coordinate:
	   xmm5 = "a infinite" mask, xmm4 = "b infinite" mask.
	   result = b-infinite ? a : (a-infinite ? b : computed). */
	movdqa	%xmm5,%xmm0		/* Z coordinate */
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0		/* computed Z if a finite */
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2		/* b->Z if a infinite */
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2		/* a->Z if b infinite */
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0		/* X coordinate */
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0		/* Y coordinate */
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

L$add_doneq:
	addq	$576+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		/* rep ret */
1349.globl _ecp_nistz256_point_add_affine
1350.private_extern _ecp_nistz256_point_add_affine
1351
1352.p2align 5
1353_ecp_nistz256_point_add_affine:
1354 pushq %rbp
1355 pushq %rbx
1356 pushq %r12
1357 pushq %r13
1358 pushq %r14
1359 pushq %r15
1360 subq $480+8,%rsp
1361
1362 movdqu 0(%rsi),%xmm0
1363 movq %rdx,%rbx
1364 movdqu 16(%rsi),%xmm1
1365 movdqu 32(%rsi),%xmm2
1366 movdqu 48(%rsi),%xmm3
1367 movdqu 64(%rsi),%xmm4
1368 movdqu 80(%rsi),%xmm5
1369 movq 64+0(%rsi),%rax
1370 movq 64+8(%rsi),%r14
1371 movq 64+16(%rsi),%r15
1372 movq 64+24(%rsi),%r8
1373 movdqa %xmm0,320(%rsp)
1374 movdqa %xmm1,320+16(%rsp)
Adam Langleyfad63272015-11-12 12:15:39 -08001375 movdqa %xmm2,352(%rsp)
1376 movdqa %xmm3,352+16(%rsp)
Adam Langleyfad63272015-11-12 12:15:39 -08001377 movdqa %xmm4,384(%rsp)
1378 movdqa %xmm5,384+16(%rsp)
Steven Valdez909b19f2016-11-21 15:35:44 -05001379 por %xmm4,%xmm5
Adam Langleyfad63272015-11-12 12:15:39 -08001380
1381 movdqu 0(%rbx),%xmm0
Steven Valdez909b19f2016-11-21 15:35:44 -05001382 pshufd $0xb1,%xmm5,%xmm3
Adam Langleyfad63272015-11-12 12:15:39 -08001383 movdqu 16(%rbx),%xmm1
1384 movdqu 32(%rbx),%xmm2
1385 por %xmm3,%xmm5
1386 movdqu 48(%rbx),%xmm3
1387 movdqa %xmm0,416(%rsp)
David Benjamin4969cc92016-04-22 15:02:23 -04001388 pshufd $0x1e,%xmm5,%xmm4
Adam Langleyfad63272015-11-12 12:15:39 -08001389 movdqa %xmm1,416+16(%rsp)
1390 por %xmm0,%xmm1
1391.byte 102,72,15,110,199
1392 movdqa %xmm2,448(%rsp)
1393 movdqa %xmm3,448+16(%rsp)
1394 por %xmm2,%xmm3
1395 por %xmm4,%xmm5
1396 pxor %xmm4,%xmm4
1397 por %xmm1,%xmm3
1398
1399 leaq 64-0(%rsi),%rsi
1400 leaq 32(%rsp),%rdi
1401 call __ecp_nistz256_sqr_montq
1402
1403 pcmpeqd %xmm4,%xmm5
David Benjamin4969cc92016-04-22 15:02:23 -04001404 pshufd $0xb1,%xmm3,%xmm4
Adam Langleyfad63272015-11-12 12:15:39 -08001405 movq 0(%rbx),%rax
1406
1407 movq %r12,%r9
1408 por %xmm3,%xmm4
1409 pshufd $0,%xmm5,%xmm5
David Benjamin4969cc92016-04-22 15:02:23 -04001410 pshufd $0x1e,%xmm4,%xmm3
Adam Langleyfad63272015-11-12 12:15:39 -08001411 movq %r13,%r10
1412 por %xmm3,%xmm4
1413 pxor %xmm3,%xmm3
1414 movq %r14,%r11
1415 pcmpeqd %xmm3,%xmm4
1416 pshufd $0,%xmm4,%xmm4
1417
1418 leaq 32-0(%rsp),%rsi
1419 movq %r15,%r12
1420 leaq 0(%rsp),%rdi
1421 call __ecp_nistz256_mul_montq
1422
1423 leaq 320(%rsp),%rbx
1424 leaq 64(%rsp),%rdi
1425 call __ecp_nistz256_sub_fromq
1426
1427 movq 384(%rsp),%rax
1428 leaq 384(%rsp),%rbx
1429 movq 0+32(%rsp),%r9
1430 movq 8+32(%rsp),%r10
1431 leaq 0+32(%rsp),%rsi
1432 movq 16+32(%rsp),%r11
1433 movq 24+32(%rsp),%r12
1434 leaq 32(%rsp),%rdi
1435 call __ecp_nistz256_mul_montq
1436
1437 movq 384(%rsp),%rax
1438 leaq 384(%rsp),%rbx
1439 movq 0+64(%rsp),%r9
1440 movq 8+64(%rsp),%r10
1441 leaq 0+64(%rsp),%rsi
1442 movq 16+64(%rsp),%r11
1443 movq 24+64(%rsp),%r12
1444 leaq 288(%rsp),%rdi
1445 call __ecp_nistz256_mul_montq
1446
1447 movq 448(%rsp),%rax
1448 leaq 448(%rsp),%rbx
1449 movq 0+32(%rsp),%r9
1450 movq 8+32(%rsp),%r10
1451 leaq 0+32(%rsp),%rsi
1452 movq 16+32(%rsp),%r11
1453 movq 24+32(%rsp),%r12
1454 leaq 32(%rsp),%rdi
1455 call __ecp_nistz256_mul_montq
1456
1457 leaq 352(%rsp),%rbx
1458 leaq 96(%rsp),%rdi
1459 call __ecp_nistz256_sub_fromq
1460
1461 movq 0+64(%rsp),%rax
1462 movq 8+64(%rsp),%r14
1463 leaq 0+64(%rsp),%rsi
1464 movq 16+64(%rsp),%r15
1465 movq 24+64(%rsp),%r8
1466 leaq 128(%rsp),%rdi
1467 call __ecp_nistz256_sqr_montq
1468
1469 movq 0+96(%rsp),%rax
1470 movq 8+96(%rsp),%r14
1471 leaq 0+96(%rsp),%rsi
1472 movq 16+96(%rsp),%r15
1473 movq 24+96(%rsp),%r8
1474 leaq 192(%rsp),%rdi
1475 call __ecp_nistz256_sqr_montq
1476
1477 movq 128(%rsp),%rax
1478 leaq 128(%rsp),%rbx
1479 movq 0+64(%rsp),%r9
1480 movq 8+64(%rsp),%r10
1481 leaq 0+64(%rsp),%rsi
1482 movq 16+64(%rsp),%r11
1483 movq 24+64(%rsp),%r12
1484 leaq 160(%rsp),%rdi
1485 call __ecp_nistz256_mul_montq
1486
1487 movq 320(%rsp),%rax
1488 leaq 320(%rsp),%rbx
1489 movq 0+128(%rsp),%r9
1490 movq 8+128(%rsp),%r10
1491 leaq 0+128(%rsp),%rsi
1492 movq 16+128(%rsp),%r11
1493 movq 24+128(%rsp),%r12
1494 leaq 0(%rsp),%rdi
1495 call __ecp_nistz256_mul_montq
1496
1497
1498
1499
Steven Valdez909b19f2016-11-21 15:35:44 -05001500 xorq %r11,%r11
Adam Langleyfad63272015-11-12 12:15:39 -08001501 addq %r12,%r12
1502 leaq 192(%rsp),%rsi
1503 adcq %r13,%r13
1504 movq %r12,%rax
1505 adcq %r8,%r8
1506 adcq %r9,%r9
1507 movq %r13,%rbp
Steven Valdez909b19f2016-11-21 15:35:44 -05001508 adcq $0,%r11
Adam Langleyfad63272015-11-12 12:15:39 -08001509
1510 subq $-1,%r12
1511 movq %r8,%rcx
1512 sbbq %r14,%r13
1513 sbbq $0,%r8
1514 movq %r9,%r10
1515 sbbq %r15,%r9
Steven Valdez909b19f2016-11-21 15:35:44 -05001516 sbbq $0,%r11
Adam Langleyfad63272015-11-12 12:15:39 -08001517
Steven Valdez909b19f2016-11-21 15:35:44 -05001518 cmovcq %rax,%r12
Adam Langleyfad63272015-11-12 12:15:39 -08001519 movq 0(%rsi),%rax
Steven Valdez909b19f2016-11-21 15:35:44 -05001520 cmovcq %rbp,%r13
Adam Langleyfad63272015-11-12 12:15:39 -08001521 movq 8(%rsi),%rbp
Steven Valdez909b19f2016-11-21 15:35:44 -05001522 cmovcq %rcx,%r8
Adam Langleyfad63272015-11-12 12:15:39 -08001523 movq 16(%rsi),%rcx
Steven Valdez909b19f2016-11-21 15:35:44 -05001524 cmovcq %r10,%r9
Adam Langleyfad63272015-11-12 12:15:39 -08001525 movq 24(%rsi),%r10
1526
1527 call __ecp_nistz256_subq
1528
1529 leaq 160(%rsp),%rbx
1530 leaq 224(%rsp),%rdi
1531 call __ecp_nistz256_sub_fromq
1532
1533 movq 0+0(%rsp),%rax
1534 movq 0+8(%rsp),%rbp
1535 movq 0+16(%rsp),%rcx
1536 movq 0+24(%rsp),%r10
1537 leaq 64(%rsp),%rdi
1538
1539 call __ecp_nistz256_subq
1540
1541 movq %r12,0(%rdi)
1542 movq %r13,8(%rdi)
1543 movq %r8,16(%rdi)
1544 movq %r9,24(%rdi)
1545 movq 352(%rsp),%rax
1546 leaq 352(%rsp),%rbx
1547 movq 0+160(%rsp),%r9
1548 movq 8+160(%rsp),%r10
1549 leaq 0+160(%rsp),%rsi
1550 movq 16+160(%rsp),%r11
1551 movq 24+160(%rsp),%r12
1552 leaq 32(%rsp),%rdi
1553 call __ecp_nistz256_mul_montq
1554
1555 movq 96(%rsp),%rax
1556 leaq 96(%rsp),%rbx
1557 movq 0+64(%rsp),%r9
1558 movq 8+64(%rsp),%r10
1559 leaq 0+64(%rsp),%rsi
1560 movq 16+64(%rsp),%r11
1561 movq 24+64(%rsp),%r12
1562 leaq 64(%rsp),%rdi
1563 call __ecp_nistz256_mul_montq
1564
1565 leaq 32(%rsp),%rbx
1566 leaq 256(%rsp),%rdi
1567 call __ecp_nistz256_sub_fromq
1568
1569.byte 102,72,15,126,199
1570
1571 movdqa %xmm5,%xmm0
1572 movdqa %xmm5,%xmm1
1573 pandn 288(%rsp),%xmm0
1574 movdqa %xmm5,%xmm2
1575 pandn 288+16(%rsp),%xmm1
1576 movdqa %xmm5,%xmm3
1577 pand L$ONE_mont(%rip),%xmm2
1578 pand L$ONE_mont+16(%rip),%xmm3
1579 por %xmm0,%xmm2
1580 por %xmm1,%xmm3
1581
1582 movdqa %xmm4,%xmm0
1583 movdqa %xmm4,%xmm1
1584 pandn %xmm2,%xmm0
1585 movdqa %xmm4,%xmm2
1586 pandn %xmm3,%xmm1
1587 movdqa %xmm4,%xmm3
1588 pand 384(%rsp),%xmm2
1589 pand 384+16(%rsp),%xmm3
1590 por %xmm0,%xmm2
1591 por %xmm1,%xmm3
1592 movdqu %xmm2,64(%rdi)
1593 movdqu %xmm3,80(%rdi)
1594
1595 movdqa %xmm5,%xmm0
1596 movdqa %xmm5,%xmm1
1597 pandn 224(%rsp),%xmm0
1598 movdqa %xmm5,%xmm2
1599 pandn 224+16(%rsp),%xmm1
1600 movdqa %xmm5,%xmm3
1601 pand 416(%rsp),%xmm2
1602 pand 416+16(%rsp),%xmm3
1603 por %xmm0,%xmm2
1604 por %xmm1,%xmm3
1605
1606 movdqa %xmm4,%xmm0
1607 movdqa %xmm4,%xmm1
1608 pandn %xmm2,%xmm0
1609 movdqa %xmm4,%xmm2
1610 pandn %xmm3,%xmm1
1611 movdqa %xmm4,%xmm3
1612 pand 320(%rsp),%xmm2
1613 pand 320+16(%rsp),%xmm3
1614 por %xmm0,%xmm2
1615 por %xmm1,%xmm3
1616 movdqu %xmm2,0(%rdi)
1617 movdqu %xmm3,16(%rdi)
1618
1619 movdqa %xmm5,%xmm0
1620 movdqa %xmm5,%xmm1
1621 pandn 256(%rsp),%xmm0
1622 movdqa %xmm5,%xmm2
1623 pandn 256+16(%rsp),%xmm1
1624 movdqa %xmm5,%xmm3
1625 pand 448(%rsp),%xmm2
1626 pand 448+16(%rsp),%xmm3
1627 por %xmm0,%xmm2
1628 por %xmm1,%xmm3
1629
1630 movdqa %xmm4,%xmm0
1631 movdqa %xmm4,%xmm1
1632 pandn %xmm2,%xmm0
1633 movdqa %xmm4,%xmm2
1634 pandn %xmm3,%xmm1
1635 movdqa %xmm4,%xmm3
1636 pand 352(%rsp),%xmm2
1637 pand 352+16(%rsp),%xmm3
1638 por %xmm0,%xmm2
1639 por %xmm1,%xmm3
1640 movdqu %xmm2,32(%rdi)
1641 movdqu %xmm3,48(%rdi)
1642
1643 addq $480+8,%rsp
1644 popq %r15
1645 popq %r14
1646 popq %r13
1647 popq %r12
1648 popq %rbx
1649 popq %rbp
1650 .byte 0xf3,0xc3
1651
1652#endif