blob: be3d13a651bbc679ae16fb5d090acadd3ff060c4 [file] [log] [blame]
Robert Sloan6f79a502017-04-03 09:16:40 -07001#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002.text
3
4
5
# _bn_mul_mont: entry point of the Montgomery multiplication code in this
# file.  It selects one of three code paths and implements the generic
# one-word-per-step path itself.
#
# Registers on entry, as they are used by the code below:
#   rdi  result vector; num 64-bit words are written to it
#   rsi  first operand vector (called ap in the comments below)
#   rdx  second operand vector (bp)
#   rcx  modulus vector (np)
#   r8   pointer to one word (n0); loaded once and multiplied into the low
#        word of every row to form the per-row multiplier m
#   r9d  word count (num), zero-extended to 64 bits
# Returns 1 in rax.  rbx, rbp, r12-r15 and rsp are saved and restored.
# NOTE(review): presumably the C-side prototype is
#   bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the header.
6.globl _bn_mul_mont
7.private_extern _bn_mul_mont
8
9.p2align 4
10_bn_mul_mont:
Robert Sloana94fe052017-02-21 08:49:28 -080011
# Zero-extend num and remember the entry rsp in rax; it is stored in the
# frame later and used by the epilogue to find the pushed registers.
12 movl %r9d,%r9d
13 movq %rsp,%rax
14
# Path selection:
#   num not a multiple of 4, or num < 8      -> generic path (L$mul_enter)
#   bp != ap                                 -> 4x path (L$mul4x_enter)
#   bp == ap and num a multiple of 8         -> squaring path (L$sqr8x_enter)
#   bp == ap otherwise                       -> 4x path
Adam Langleyd9e397b2015-01-22 14:27:53 -080015 testl $3,%r9d
16 jnz L$mul_enter
17 cmpl $8,%r9d
18 jb L$mul_enter
19 cmpq %rsi,%rdx
20 jne L$mul4x_enter
21 testl $7,%r9d
22 jz L$sqr8x_enter
23 jmp L$mul4x_enter
24
25.p2align 4
26L$mul_enter:
# Save the six callee-saved registers directly below the entry rsp.
27 pushq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -080028
Adam Langleyd9e397b2015-01-22 14:27:53 -080029 pushq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -080030
Adam Langleyd9e397b2015-01-22 14:27:53 -080031 pushq %r12
Robert Sloana94fe052017-02-21 08:49:28 -080032
Adam Langleyd9e397b2015-01-22 14:27:53 -080033 pushq %r13
Robert Sloana94fe052017-02-21 08:49:28 -080034
Adam Langleyd9e397b2015-01-22 14:27:53 -080035 pushq %r14
Robert Sloana94fe052017-02-21 08:49:28 -080036
Adam Langleyd9e397b2015-01-22 14:27:53 -080037 pushq %r15
38
Adam Langleyd9e397b2015-01-22 14:27:53 -080039
# Frame target: r10 = (rsp - 16 - 8*num) rounded down to a multiple of
# 1024.  r9 is negated only for the address computation and then restored.
# The frame holds the temporary vector tp[0..num] plus the saved rsp.
Robert Sloana94fe052017-02-21 08:49:28 -080040 negq %r9
41 movq %rsp,%r11
42 leaq -16(%rsp,%r9,8),%r10
43 negq %r9
44 andq $-1024,%r10
45
46
47
48
49
50
51
52
53
# Move rsp down to r10 in steps of at most 4096 bytes, reading one word at
# every step.  NOTE(review): presumably this is a stack probe so that guard
# pages are touched in order -- the code itself only shows the loads.
54 subq %r10,%r11
55 andq $-4096,%r11
56 leaq (%r10,%r11,1),%rsp
57 movq (%rsp),%r11
58 cmpq %r10,%rsp
59 ja L$mul_page_walk
60 jmp L$mul_page_walk_done
61
62.p2align 4
63L$mul_page_walk:
64 leaq -4096(%rsp),%rsp
65 movq (%rsp),%r11
66 cmpq %r10,%rsp
67 ja L$mul_page_walk
68L$mul_page_walk_done:
69
# Store the entry rsp in the frame at word index num+1.
70 movq %rax,8(%rsp,%r9,8)
71
Adam Langleyd9e397b2015-01-22 14:27:53 -080072L$mul_body:
# Register roles from here on:
#   r12 = bp, r8 = n0 value, rbx = bp[i], rbp = m (row multiplier),
#   r14 = i (row index), r15 = j (column index),
#   r10/r11/r13 = running partial sums and carries, rsp = tp.
73 movq %rdx,%r12
74 movq (%r8),%r8
75 movq (%r12),%rbx
76 movq (%rsi),%rax
77
78 xorq %r14,%r14
79 xorq %r15,%r15
80
# Row 0, column 0: r10 = low(ap[0]*bp[0]); m = r10*n0 (low 64 bits);
# then np[0]*m is added to r10.  Only the carry out of that addition is
# kept (in r13); the low word itself is not stored.
81 movq %r8,%rbp
82 mulq %rbx
83 movq %rax,%r10
84 movq (%rcx),%rax
85
86 imulq %r10,%rbp
87 movq %rdx,%r11
88
89 mulq %rbp
90 addq %rax,%r10
91 movq 8(%rsi),%rax
92 adcq $0,%rdx
93 movq %rdx,%r13
94
95 leaq 1(%r15),%r15
96 jmp L$1st_enter
97
98.p2align 4
# Row 0, remaining columns: accumulate ap[j]*bp[0] + np[j]*m and store each
# finished word one slot lower in tp (the word for column j goes to tp[j-1]).
99L$1st:
100 addq %rax,%r13
101 movq (%rsi,%r15,8),%rax
102 adcq $0,%rdx
103 addq %r11,%r13
104 movq %r10,%r11
105 adcq $0,%rdx
106 movq %r13,-16(%rsp,%r15,8)
107 movq %rdx,%r13
108
109L$1st_enter:
110 mulq %rbx
111 addq %rax,%r11
112 movq (%rcx,%r15,8),%rax
113 adcq $0,%rdx
114 leaq 1(%r15),%r15
115 movq %rdx,%r10
116
117 mulq %rbp
118 cmpq %r9,%r15
119 jne L$1st
120
# Row 0 tail: finish the last column, then write the two top words
# tp[num-1] and tp[num] (the final carry).  rax is reloaded with ap[0]
# for the next row.
121 addq %rax,%r13
122 movq (%rsi),%rax
123 adcq $0,%rdx
124 addq %r11,%r13
125 adcq $0,%rdx
126 movq %r13,-16(%rsp,%r15,8)
127 movq %rdx,%r13
128 movq %r10,%r11
129
130 xorq %rdx,%rdx
131 addq %r11,%r13
132 adcq $0,%rdx
133 movq %r13,-8(%rsp,%r9,8)
134 movq %rdx,(%rsp,%r9,8)
135
136 leaq 1(%r14),%r14
137 jmp L$outer
138.p2align 4
# Rows 1..num-1: same computation as row 0, except that the previous
# contents of tp are added in as well (r10 carries the tp word).
139L$outer:
140 movq (%r12,%r14,8),%rbx
141 xorq %r15,%r15
142 movq %r8,%rbp
143 movq (%rsp),%r10
144 mulq %rbx
145 addq %rax,%r10
146 movq (%rcx),%rax
147 adcq $0,%rdx
148
149 imulq %r10,%rbp
150 movq %rdx,%r11
151
152 mulq %rbp
153 addq %rax,%r10
154 movq 8(%rsi),%rax
155 adcq $0,%rdx
156 movq 8(%rsp),%r10
157 movq %rdx,%r13
158
159 leaq 1(%r15),%r15
160 jmp L$inner_enter
161
162.p2align 4
163L$inner:
164 addq %rax,%r13
165 movq (%rsi,%r15,8),%rax
166 adcq $0,%rdx
167 addq %r10,%r13
168 movq (%rsp,%r15,8),%r10
169 adcq $0,%rdx
170 movq %r13,-16(%rsp,%r15,8)
171 movq %rdx,%r13
172
173L$inner_enter:
174 mulq %rbx
175 addq %rax,%r11
176 movq (%rcx,%r15,8),%rax
177 adcq $0,%rdx
178 addq %r11,%r10
179 movq %rdx,%r11
180 adcq $0,%r11
181 leaq 1(%r15),%r15
182
183 mulq %rbp
184 cmpq %r9,%r15
185 jne L$inner
186
# Row tail: last column plus the old top word tp[num], then store the new
# tp[num-1] and tp[num].
187 addq %rax,%r13
188 movq (%rsi),%rax
189 adcq $0,%rdx
190 addq %r10,%r13
191 movq (%rsp,%r15,8),%r10
192 adcq $0,%rdx
193 movq %r13,-16(%rsp,%r15,8)
194 movq %rdx,%r13
195
196 xorq %rdx,%rdx
197 addq %r11,%r13
198 adcq $0,%rdx
199 addq %r10,%r13
200 adcq $0,%rdx
201 movq %r13,-8(%rsp,%r9,8)
202 movq %rdx,(%rsp,%r9,8)
203
204 leaq 1(%r14),%r14
205 cmpq %r9,%r14
206 jb L$outer
207
# Final reduction, part 1: rp[j] = tp[j] - np[j] with borrow, j = 0..num-1.
# The xorq clears CF for the first sbbq; leaq and decq do not modify CF,
# so the borrow survives across loop iterations.  rsi now points at tp.
208 xorq %r14,%r14
209 movq (%rsp),%rax
210 leaq (%rsp),%rsi
211 movq %r9,%r15
212 jmp L$sub
213.p2align 4
214L$sub: sbbq (%rcx,%r14,8),%rax
215 movq %rax,(%rdi,%r14,8)
216 movq 8(%rsi,%r14,8),%rax
217 leaq 1(%r14),%r14
218 decq %r15
219 jnz L$sub
220
# Final reduction, part 2: rax = tp[num] - borrow.  The and/not/and/or
# sequence then builds the copy source without a branch:
#   rsi = (tp & rax) | (rp & ~rax)
# NOTE(review): this is only a valid pointer select when rax is 0 or all
# ones, which assumes tp[num] is 0 or 1 here -- confirm.
221 sbbq $0,%rax
222 xorq %r14,%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800223 andq %rax,%rsi
224 notq %rax
225 movq %rdi,%rcx
226 andq %rax,%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800227 movq %r9,%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800228 orq %rcx,%rsi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800229.p2align 4
# Copy num words from the selected source to rp.  Each tp[j] is overwritten
# with the loop index j on the way (presumably to scrub the intermediate
# value from the stack).
230L$copy:
Robert Sloana94fe052017-02-21 08:49:28 -0800231 movq (%rsi,%r14,8),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800232 movq %r14,(%rsp,%r14,8)
Robert Sloana94fe052017-02-21 08:49:28 -0800233 movq %rax,(%rdi,%r14,8)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800234 leaq 1(%r14),%r14
235 subq $1,%r15
236 jnz L$copy
237
# Epilogue: reload the entry rsp from the frame, set the return value to 1,
# restore the pushed registers from just below the entry rsp and return.
238 movq 8(%rsp,%r9,8),%rsi
Robert Sloana94fe052017-02-21 08:49:28 -0800239
Adam Langleyd9e397b2015-01-22 14:27:53 -0800240 movq $1,%rax
Robert Sloana94fe052017-02-21 08:49:28 -0800241 movq -48(%rsi),%r15
242
243 movq -40(%rsi),%r14
244
245 movq -32(%rsi),%r13
246
247 movq -24(%rsi),%r12
248
249 movq -16(%rsi),%rbp
250
251 movq -8(%rsi),%rbx
252
253 leaq (%rsi),%rsp
254
Adam Langleyd9e397b2015-01-22 14:27:53 -0800255L$mul_epilogue:
# Bytes F3 C3 encode "rep ret".
256 .byte 0xf3,0xc3
257
258
Robert Sloana94fe052017-02-21 08:49:28 -0800259
# bn_mul4x_mont: Montgomery multiplication with the column loops unrolled
# four words per iteration.  Reached through L$mul4x_enter from the
# dispatcher in _bn_mul_mont, which only routes num >= 8 with num a
# multiple of 4 to this path.
#
# Registers on entry, as they are used by the code below:
#   rdi  result vector (saved in the frame; rdi is reused as a scratch
#        accumulator in the multiply loops and reloaded before the
#        final subtraction)
#   rsi  first operand vector (ap)
#   rdx  second operand vector (bp)
#   rcx  modulus vector (np)
#   r8   pointer to the n0 word
#   r9d  word count (num)
# Returns 1 in rax.  rbx, rbp, r12-r15 and rsp are saved and restored.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800260.p2align 4
261bn_mul4x_mont:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800262
263 movl %r9d,%r9d
Robert Sloana94fe052017-02-21 08:49:28 -0800264 movq %rsp,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800265
# Shared entry: rax already holds the entry rsp and r9 the zero-extended
# word count when control arrives here from _bn_mul_mont.
Robert Sloana94fe052017-02-21 08:49:28 -0800266L$mul4x_enter:
267 pushq %rbx
268
269 pushq %rbp
270
271 pushq %r12
272
273 pushq %r13
274
275 pushq %r14
276
277 pushq %r15
278
279
# Frame target: r10 = (rsp - 32 - 8*num) rounded down to a multiple of 1024.
280 negq %r9
281 movq %rsp,%r11
282 leaq -32(%rsp,%r9,8),%r10
283 negq %r9
284 andq $-1024,%r10
285
# Lower rsp to r10 in steps of at most 4096 bytes, reading one word per
# step.  NOTE(review): presumably a stack probe for guard pages -- confirm.
286 subq %r10,%r11
287 andq $-4096,%r11
288 leaq (%r10,%r11,1),%rsp
289 movq (%rsp),%r11
290 cmpq %r10,%rsp
291 ja L$mul4x_page_walk
292 jmp L$mul4x_page_walk_done
293
294L$mul4x_page_walk:
295 leaq -4096(%rsp),%rsp
296 movq (%rsp),%r11
297 cmpq %r10,%rsp
298 ja L$mul4x_page_walk
299L$mul4x_page_walk_done:
300
# Entry rsp goes to frame word num+1, the result pointer to word num+2.
301 movq %rax,8(%rsp,%r9,8)
302
Adam Langleyd9e397b2015-01-22 14:27:53 -0800303L$mul4x_body:
304 movq %rdi,16(%rsp,%r9,8)
# Register roles: r12 = bp, r8 = n0 value, rbx = bp[i], rbp = m,
# r14 = i, r15 = j, r10/r11 = ap[j]*bp[i] chain, rdi/r13 = np[j]*m chain.
305 movq %rdx,%r12
306 movq (%r8),%r8
307 movq (%r12),%rbx
308 movq (%rsi),%rax
309
310 xorq %r14,%r14
311 xorq %r15,%r15
312
# Row 0, columns 0 and 1.  m = low64(ap[0]*bp[0] * n0); the low word of
# column 0 is not stored, the word of column 1 is written to tp[0].
313 movq %r8,%rbp
314 mulq %rbx
315 movq %rax,%r10
316 movq (%rcx),%rax
317
318 imulq %r10,%rbp
319 movq %rdx,%r11
320
321 mulq %rbp
322 addq %rax,%r10
323 movq 8(%rsi),%rax
324 adcq $0,%rdx
325 movq %rdx,%rdi
326
327 mulq %rbx
328 addq %rax,%r11
329 movq 8(%rcx),%rax
330 adcq $0,%rdx
331 movq %rdx,%r10
332
333 mulq %rbp
334 addq %rax,%rdi
335 movq 16(%rsi),%rax
336 adcq $0,%rdx
337 addq %r11,%rdi
338 leaq 4(%r15),%r15
339 adcq $0,%rdx
340 movq %rdi,(%rsp)
341 movq %rdx,%r13
342 jmp L$1st4x
343.p2align 4
# Row 0, main loop: four columns per iteration.  r15 runs ahead of the
# columns being processed, hence the negative displacements on the loads
# and stores.
344L$1st4x:
345 mulq %rbx
346 addq %rax,%r10
347 movq -16(%rcx,%r15,8),%rax
348 adcq $0,%rdx
349 movq %rdx,%r11
350
351 mulq %rbp
352 addq %rax,%r13
353 movq -8(%rsi,%r15,8),%rax
354 adcq $0,%rdx
355 addq %r10,%r13
356 adcq $0,%rdx
357 movq %r13,-24(%rsp,%r15,8)
358 movq %rdx,%rdi
359
360 mulq %rbx
361 addq %rax,%r11
362 movq -8(%rcx,%r15,8),%rax
363 adcq $0,%rdx
364 movq %rdx,%r10
365
366 mulq %rbp
367 addq %rax,%rdi
368 movq (%rsi,%r15,8),%rax
369 adcq $0,%rdx
370 addq %r11,%rdi
371 adcq $0,%rdx
372 movq %rdi,-16(%rsp,%r15,8)
373 movq %rdx,%r13
374
375 mulq %rbx
376 addq %rax,%r10
377 movq (%rcx,%r15,8),%rax
378 adcq $0,%rdx
379 movq %rdx,%r11
380
381 mulq %rbp
382 addq %rax,%r13
383 movq 8(%rsi,%r15,8),%rax
384 adcq $0,%rdx
385 addq %r10,%r13
386 adcq $0,%rdx
387 movq %r13,-8(%rsp,%r15,8)
388 movq %rdx,%rdi
389
390 mulq %rbx
391 addq %rax,%r11
392 movq 8(%rcx,%r15,8),%rax
393 adcq $0,%rdx
394 leaq 4(%r15),%r15
395 movq %rdx,%r10
396
397 mulq %rbp
398 addq %rax,%rdi
399 movq -16(%rsi,%r15,8),%rax
400 adcq $0,%rdx
401 addq %r11,%rdi
402 adcq $0,%rdx
403 movq %rdi,-32(%rsp,%r15,8)
404 movq %rdx,%r13
405 cmpq %r9,%r15
406 jb L$1st4x
407
# Row 0 tail: last two columns, then the two top words of tp.
# rax is reloaded with ap[0] for the next row.
408 mulq %rbx
409 addq %rax,%r10
410 movq -16(%rcx,%r15,8),%rax
411 adcq $0,%rdx
412 movq %rdx,%r11
413
414 mulq %rbp
415 addq %rax,%r13
416 movq -8(%rsi,%r15,8),%rax
417 adcq $0,%rdx
418 addq %r10,%r13
419 adcq $0,%rdx
420 movq %r13,-24(%rsp,%r15,8)
421 movq %rdx,%rdi
422
423 mulq %rbx
424 addq %rax,%r11
425 movq -8(%rcx,%r15,8),%rax
426 adcq $0,%rdx
427 movq %rdx,%r10
428
429 mulq %rbp
430 addq %rax,%rdi
431 movq (%rsi),%rax
432 adcq $0,%rdx
433 addq %r11,%rdi
434 adcq $0,%rdx
435 movq %rdi,-16(%rsp,%r15,8)
436 movq %rdx,%r13
437
438 xorq %rdi,%rdi
439 addq %r10,%r13
440 adcq $0,%rdi
441 movq %r13,-8(%rsp,%r15,8)
442 movq %rdi,(%rsp,%r15,8)
443
444 leaq 1(%r14),%r14
445.p2align 2
# Rows 1..num-1: same structure as row 0, with the previous tp words added
# into the ap[j]*bp[i] chain before the np[j]*m chain is folded in.
446L$outer4x:
447 movq (%r12,%r14,8),%rbx
448 xorq %r15,%r15
449 movq (%rsp),%r10
450 movq %r8,%rbp
451 mulq %rbx
452 addq %rax,%r10
453 movq (%rcx),%rax
454 adcq $0,%rdx
455
456 imulq %r10,%rbp
457 movq %rdx,%r11
458
459 mulq %rbp
460 addq %rax,%r10
461 movq 8(%rsi),%rax
462 adcq $0,%rdx
463 movq %rdx,%rdi
464
465 mulq %rbx
466 addq %rax,%r11
467 movq 8(%rcx),%rax
468 adcq $0,%rdx
469 addq 8(%rsp),%r11
470 adcq $0,%rdx
471 movq %rdx,%r10
472
473 mulq %rbp
474 addq %rax,%rdi
475 movq 16(%rsi),%rax
476 adcq $0,%rdx
477 addq %r11,%rdi
478 leaq 4(%r15),%r15
479 adcq $0,%rdx
480 movq %rdi,(%rsp)
481 movq %rdx,%r13
482 jmp L$inner4x
483.p2align 4
484L$inner4x:
485 mulq %rbx
486 addq %rax,%r10
487 movq -16(%rcx,%r15,8),%rax
488 adcq $0,%rdx
489 addq -16(%rsp,%r15,8),%r10
490 adcq $0,%rdx
491 movq %rdx,%r11
492
493 mulq %rbp
494 addq %rax,%r13
495 movq -8(%rsi,%r15,8),%rax
496 adcq $0,%rdx
497 addq %r10,%r13
498 adcq $0,%rdx
499 movq %r13,-24(%rsp,%r15,8)
500 movq %rdx,%rdi
501
502 mulq %rbx
503 addq %rax,%r11
504 movq -8(%rcx,%r15,8),%rax
505 adcq $0,%rdx
506 addq -8(%rsp,%r15,8),%r11
507 adcq $0,%rdx
508 movq %rdx,%r10
509
510 mulq %rbp
511 addq %rax,%rdi
512 movq (%rsi,%r15,8),%rax
513 adcq $0,%rdx
514 addq %r11,%rdi
515 adcq $0,%rdx
516 movq %rdi,-16(%rsp,%r15,8)
517 movq %rdx,%r13
518
519 mulq %rbx
520 addq %rax,%r10
521 movq (%rcx,%r15,8),%rax
522 adcq $0,%rdx
523 addq (%rsp,%r15,8),%r10
524 adcq $0,%rdx
525 movq %rdx,%r11
526
527 mulq %rbp
528 addq %rax,%r13
529 movq 8(%rsi,%r15,8),%rax
530 adcq $0,%rdx
531 addq %r10,%r13
532 adcq $0,%rdx
533 movq %r13,-8(%rsp,%r15,8)
534 movq %rdx,%rdi
535
536 mulq %rbx
537 addq %rax,%r11
538 movq 8(%rcx,%r15,8),%rax
539 adcq $0,%rdx
540 addq 8(%rsp,%r15,8),%r11
541 adcq $0,%rdx
542 leaq 4(%r15),%r15
543 movq %rdx,%r10
544
545 mulq %rbp
546 addq %rax,%rdi
547 movq -16(%rsi,%r15,8),%rax
548 adcq $0,%rdx
549 addq %r11,%rdi
550 adcq $0,%rdx
551 movq %rdi,-32(%rsp,%r15,8)
552 movq %rdx,%r13
553 cmpq %r9,%r15
554 jb L$inner4x
555
# Row tail: last two columns; the row index r14 is advanced here, in the
# middle of the tail, and the old top word tp[num] is added at the end.
556 mulq %rbx
557 addq %rax,%r10
558 movq -16(%rcx,%r15,8),%rax
559 adcq $0,%rdx
560 addq -16(%rsp,%r15,8),%r10
561 adcq $0,%rdx
562 movq %rdx,%r11
563
564 mulq %rbp
565 addq %rax,%r13
566 movq -8(%rsi,%r15,8),%rax
567 adcq $0,%rdx
568 addq %r10,%r13
569 adcq $0,%rdx
570 movq %r13,-24(%rsp,%r15,8)
571 movq %rdx,%rdi
572
573 mulq %rbx
574 addq %rax,%r11
575 movq -8(%rcx,%r15,8),%rax
576 adcq $0,%rdx
577 addq -8(%rsp,%r15,8),%r11
578 adcq $0,%rdx
579 leaq 1(%r14),%r14
580 movq %rdx,%r10
581
582 mulq %rbp
583 addq %rax,%rdi
584 movq (%rsi),%rax
585 adcq $0,%rdx
586 addq %r11,%rdi
587 adcq $0,%rdx
588 movq %rdi,-16(%rsp,%r15,8)
589 movq %rdx,%r13
590
591 xorq %rdi,%rdi
592 addq %r10,%r13
593 adcq $0,%rdi
594 addq (%rsp,%r9,8),%r13
595 adcq $0,%rdi
596 movq %r13,-8(%rsp,%r15,8)
597 movq %rdi,(%rsp,%r15,8)
598
599 cmpq %r9,%r14
600 jb L$outer4x
# Final reduction, part 1: reload the result pointer, then compute
# rp[j] = tp[j] - np[j] with borrow, four words per loop iteration.
# r15 = (num-4)/4 is the loop count; xmm0 is zeroed here for the scrub in
# the copy loop.  The first subq starts the borrow chain; the mov, lea and
# dec instructions in between do not modify CF.
601 movq 16(%rsp,%r9,8),%rdi
Robert Sloana94fe052017-02-21 08:49:28 -0800602 leaq -4(%r9),%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800603 movq 0(%rsp),%rax
Robert Sloana94fe052017-02-21 08:49:28 -0800604 pxor %xmm0,%xmm0
Adam Langleyd9e397b2015-01-22 14:27:53 -0800605 movq 8(%rsp),%rdx
Robert Sloana94fe052017-02-21 08:49:28 -0800606 shrq $2,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800607 leaq (%rsp),%rsi
608 xorq %r14,%r14
609
610 subq 0(%rcx),%rax
611 movq 16(%rsi),%rbx
612 movq 24(%rsi),%rbp
613 sbbq 8(%rcx),%rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800614 jmp L$sub4x
615.p2align 4
616L$sub4x:
617 movq %rax,0(%rdi,%r14,8)
618 movq %rdx,8(%rdi,%r14,8)
619 sbbq 16(%rcx,%r14,8),%rbx
620 movq 32(%rsi,%r14,8),%rax
621 movq 40(%rsi,%r14,8),%rdx
622 sbbq 24(%rcx,%r14,8),%rbp
623 movq %rbx,16(%rdi,%r14,8)
624 movq %rbp,24(%rdi,%r14,8)
625 sbbq 32(%rcx,%r14,8),%rax
626 movq 48(%rsi,%r14,8),%rbx
627 movq 56(%rsi,%r14,8),%rbp
628 sbbq 40(%rcx,%r14,8),%rdx
629 leaq 4(%r14),%r14
630 decq %r15
631 jnz L$sub4x
632
# Last four words of the subtraction; rax is then loaded with tp[num].
633 movq %rax,0(%rdi,%r14,8)
634 movq 32(%rsi,%r14,8),%rax
635 sbbq 16(%rcx,%r14,8),%rbx
636 movq %rdx,8(%rdi,%r14,8)
637 sbbq 24(%rcx,%r14,8),%rbp
638 movq %rbx,16(%rdi,%r14,8)
639
# Final reduction, part 2: rax = tp[num] - borrow, then a branch-free
# source select: rsi = (tp & rax) | (rp & ~rax).
# NOTE(review): assumes rax is 0 or all ones at this point -- confirm.
640 sbbq $0,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800641 movq %rbp,24(%rdi,%r14,8)
642 xorq %r14,%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800643 andq %rax,%rsi
644 notq %rax
645 movq %rdi,%rcx
646 andq %rax,%rcx
647 leaq -4(%r9),%r15
648 orq %rcx,%rsi
649 shrq $2,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800650
# Copy 8*num bytes from the selected source to rp in 16-byte pieces and
# overwrite tp with zeros as it goes.  r14 is a byte offset here (scale 1).
# Layout: 16 bytes before the loop, 32 bytes per iteration for (num-4)/4
# iterations, 16 bytes after the loop.  The movdqa stores require the
# frame at rsp to be 16-byte aligned (it is aligned to 1024 above).
Robert Sloana94fe052017-02-21 08:49:28 -0800651 movdqu (%rsi),%xmm1
652 movdqa %xmm0,(%rsp)
653 movdqu %xmm1,(%rdi)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800654 jmp L$copy4x
655.p2align 4
656L$copy4x:
Robert Sloana94fe052017-02-21 08:49:28 -0800657 movdqu 16(%rsi,%r14,1),%xmm2
658 movdqu 32(%rsi,%r14,1),%xmm1
659 movdqa %xmm0,16(%rsp,%r14,1)
660 movdqu %xmm2,16(%rdi,%r14,1)
661 movdqa %xmm0,32(%rsp,%r14,1)
662 movdqu %xmm1,32(%rdi,%r14,1)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800663 leaq 32(%r14),%r14
664 decq %r15
665 jnz L$copy4x
666
Robert Sloana94fe052017-02-21 08:49:28 -0800667 movdqu 16(%rsi,%r14,1),%xmm2
668 movdqa %xmm0,16(%rsp,%r14,1)
669 movdqu %xmm2,16(%rdi,%r14,1)
# Epilogue: reload the entry rsp from the frame, return 1, restore the
# pushed registers from just below the entry rsp.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800670 movq 8(%rsp,%r9,8),%rsi
Robert Sloana94fe052017-02-21 08:49:28 -0800671
Adam Langleyd9e397b2015-01-22 14:27:53 -0800672 movq $1,%rax
Robert Sloana94fe052017-02-21 08:49:28 -0800673 movq -48(%rsi),%r15
674
675 movq -40(%rsi),%r14
676
677 movq -32(%rsi),%r13
678
679 movq -24(%rsi),%r12
680
681 movq -16(%rsi),%rbp
682
683 movq -8(%rsi),%rbx
684
685 leaq (%rsi),%rsp
686
Adam Langleyd9e397b2015-01-22 14:27:53 -0800687L$mul4x_epilogue:
# Bytes F3 C3 encode "rep ret".
688 .byte 0xf3,0xc3
689
690
691
692
Robert Sloana94fe052017-02-21 08:49:28 -0800693
# bn_sqr8x_mont: squaring path.  Reached through L$sqr8x_enter from the
# dispatcher in _bn_mul_mont when both operand pointers are equal and num
# is a multiple of 8.  Sets up a stack frame, calls _bn_sqr8x_internal
# (defined outside this view) and then performs the final conditional
# subtraction and copy into the result vector.
#
# Registers on entry, as they are used by the code below:
#   rdi  result vector (parked in xmm1 across the call)
#   rsi  operand vector; only its address is used here, for frame placement
#   rcx  modulus vector (parked in xmm2 across the call)
#   r8   pointer to the n0 word (the value is stored at 32(%rsp))
#   r9d  word count (num)
# Returns 1 in rax.  rbx, rbp, r12-r15 and rsp are saved and restored.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800694.p2align 5
695bn_sqr8x_mont:
Robert Sloana94fe052017-02-21 08:49:28 -0800696
Adam Langleyd9e397b2015-01-22 14:27:53 -0800697 movq %rsp,%rax
Robert Sloana94fe052017-02-21 08:49:28 -0800698
# Shared entry: rax already holds the entry rsp when control arrives here
# from _bn_mul_mont.
699L$sqr8x_enter:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800700 pushq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800701
Adam Langleyd9e397b2015-01-22 14:27:53 -0800702 pushq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800703
Adam Langleyd9e397b2015-01-22 14:27:53 -0800704 pushq %r12
Robert Sloana94fe052017-02-21 08:49:28 -0800705
Adam Langleyd9e397b2015-01-22 14:27:53 -0800706 pushq %r13
Robert Sloana94fe052017-02-21 08:49:28 -0800707
Adam Langleyd9e397b2015-01-22 14:27:53 -0800708 pushq %r14
Robert Sloana94fe052017-02-21 08:49:28 -0800709
Adam Langleyd9e397b2015-01-22 14:27:53 -0800710 pushq %r15
711
Robert Sloana94fe052017-02-21 08:49:28 -0800712L$sqr8x_prologue:
713
# Sizes: r9 = -(8*num) bytes, r10 = 32*num.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800714 movl %r9d,%r10d
715 shll $3,%r9d
716 shlq $3+2,%r10
717 negq %r9
718
719
720
721
722
723
# Frame placement.  The frame is 64 + 16*num bytes below rsp, lowered
# further by an amount derived from the distance between that address and
# the operand pointer rsi modulo 4096.  Two cases are handled, selected by
# comparing that distance with 32*num.
# NOTE(review): presumably this avoids the frame and the operand aliasing
# at 4096-byte granularity -- the intent is not stated in this file.
David Benjamin4969cc92016-04-22 15:02:23 -0400724 leaq -64(%rsp,%r9,2),%r11
Robert Sloana94fe052017-02-21 08:49:28 -0800725 movq %rsp,%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800726 movq (%r8),%r8
727 subq %rsi,%r11
728 andq $4095,%r11
729 cmpq %r11,%r10
730 jb L$sqr8x_sp_alt
Robert Sloana94fe052017-02-21 08:49:28 -0800731 subq %r11,%rbp
732 leaq -64(%rbp,%r9,2),%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800733 jmp L$sqr8x_sp_done
734
735.p2align 5
# Alternative placement: the extra adjustment is
# max(0, distance - (4096 - 64 - 16*num)); cmovc clamps it to zero when the
# subtraction borrows.
736L$sqr8x_sp_alt:
David Benjamin4969cc92016-04-22 15:02:23 -0400737 leaq 4096-64(,%r9,2),%r10
Robert Sloana94fe052017-02-21 08:49:28 -0800738 leaq -64(%rbp,%r9,2),%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800739 subq %r10,%r11
740 movq $0,%r10
741 cmovcq %r10,%r11
Robert Sloana94fe052017-02-21 08:49:28 -0800742 subq %r11,%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800743L$sqr8x_sp_done:
# Align the frame to 64 bytes, then lower rsp to it in steps of at most
# 4096 bytes, reading one word per step.
# NOTE(review): presumably a stack probe for guard pages -- confirm.
Robert Sloana94fe052017-02-21 08:49:28 -0800744 andq $-64,%rbp
745 movq %rsp,%r11
746 subq %rbp,%r11
747 andq $-4096,%r11
748 leaq (%r11,%rbp,1),%rsp
749 movq (%rsp),%r10
750 cmpq %rbp,%rsp
751 ja L$sqr8x_page_walk
752 jmp L$sqr8x_page_walk_done
753
754.p2align 4
755L$sqr8x_page_walk:
756 leaq -4096(%rsp),%rsp
757 movq (%rsp),%r10
758 cmpq %rbp,%rsp
759 ja L$sqr8x_page_walk
760L$sqr8x_page_walk_done:
761
# r10 = -(8*num), r9 = 8*num.  Frame slots: 32(%rsp) = n0 value,
# 40(%rsp) = entry rsp.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800762 movq %r9,%r10
763 negq %r9
764
Adam Langleyd9e397b2015-01-22 14:27:53 -0800765 movq %r8,32(%rsp)
766 movq %rax,40(%rsp)
Robert Sloana94fe052017-02-21 08:49:28 -0800767
Adam Langleyd9e397b2015-01-22 14:27:53 -0800768L$sqr8x_body:
769
# The .byte sequences below are hand-encoded SSE2 moves:
#   102,72,15,110,209  = movq %rcx,%xmm2   (modulus pointer)
#   102,72,15,110,207  = movq %rdi,%xmm1   (result pointer)
#   102,73,15,110,218  = movq %r10,%xmm3   (-(8*num))
# xmm0 is zeroed before the call.
David Benjamin4969cc92016-04-22 15:02:23 -0400770.byte 102,72,15,110,209
Adam Langleyd9e397b2015-01-22 14:27:53 -0800771 pxor %xmm0,%xmm0
772.byte 102,72,15,110,207
773.byte 102,73,15,110,218
774 call _bn_sqr8x_internal
775
David Benjamin4969cc92016-04-22 15:02:23 -0400776
777
778
# Post-call setup for the subtraction.  The code below relies on values
# left in rax, rbp, rdi and r9 by the callee.
# NOTE(review): the loops here count up toward zero (incq/jnz and
# addq $32/jnz), so r9 is expected to be negative on return, apparently
# -(8*num); rbp is used as the modulus pointer and rax as the top carry.
# Confirm against _bn_sqr8x_internal.
# 102,72,15,126,207 = movq %xmm1,%rdi (restores the result pointer).
779 leaq (%rdi,%r9,1),%rbx
780 movq %r9,%rcx
781 movq %r9,%rdx
782.byte 102,72,15,126,207
783 sarq $3+2,%rcx
784 jmp L$sqr8x_sub
Adam Langleyd9e397b2015-01-22 14:27:53 -0800785
786.p2align 5
# Subtract the words at rbp from the words at rbx, four per iteration, and
# store the differences to the result vector at rdi.  lea, mov and incq do
# not modify CF, so the borrow is carried across iterations.
# NOTE(review): the chain starts from the CF left by the sarq above, which
# is 0 only when the low five bits of r9 are zero -- confirm.
David Benjamin4969cc92016-04-22 15:02:23 -0400787L$sqr8x_sub:
788 movq 0(%rbx),%r12
789 movq 8(%rbx),%r13
790 movq 16(%rbx),%r14
791 movq 24(%rbx),%r15
792 leaq 32(%rbx),%rbx
793 sbbq 0(%rbp),%r12
794 sbbq 8(%rbp),%r13
795 sbbq 16(%rbp),%r14
796 sbbq 24(%rbp),%r15
797 leaq 32(%rbp),%rbp
798 movq %r12,0(%rdi)
799 movq %r13,8(%rdi)
800 movq %r14,16(%rdi)
801 movq %r15,24(%rdi)
802 leaq 32(%rdi),%rdi
803 incq %rcx
804 jnz L$sqr8x_sub
805
# rax = rax - borrow becomes the select mask; rbx and rdi are moved back
# by adding r9.  102,72,15,110,200 = movq %rax,%xmm1; pshufd $0 then
# replicates the low 32 bits of the mask into all four lanes of xmm1.
# rsi = entry rsp, for the epilogue.
806 sbbq $0,%rax
807 leaq (%rbx,%r9,1),%rbx
808 leaq (%rdi,%r9,1),%rdi
809
810.byte 102,72,15,110,200
811 pxor %xmm0,%xmm0
812 pshufd $0,%xmm1,%xmm1
813 movq 40(%rsp),%rsi
Robert Sloana94fe052017-02-21 08:49:28 -0800814
David Benjamin4969cc92016-04-22 15:02:23 -0400815 jmp L$sqr8x_cond_copy
816
817.p2align 5
# Branch-free conditional copy, 32 bytes per iteration:
#   result = (temp & mask) | (result & inverted mask)
# where pcmpeqd against the zeroed xmm0 yields the inverted mask.  The
# temporary words just read, and the words at the same position offset by
# rdx, are overwritten with zeros.  movdqa on rbx requires 16-byte
# alignment of the temporary area.
818L$sqr8x_cond_copy:
819 movdqa 0(%rbx),%xmm2
820 movdqa 16(%rbx),%xmm3
821 leaq 32(%rbx),%rbx
822 movdqu 0(%rdi),%xmm4
823 movdqu 16(%rdi),%xmm5
824 leaq 32(%rdi),%rdi
825 movdqa %xmm0,-32(%rbx)
826 movdqa %xmm0,-16(%rbx)
827 movdqa %xmm0,-32(%rbx,%rdx,1)
828 movdqa %xmm0,-16(%rbx,%rdx,1)
829 pcmpeqd %xmm1,%xmm0
830 pand %xmm1,%xmm2
831 pand %xmm1,%xmm3
832 pand %xmm0,%xmm4
833 pand %xmm0,%xmm5
834 pxor %xmm0,%xmm0
835 por %xmm2,%xmm4
836 por %xmm3,%xmm5
837 movdqu %xmm4,-32(%rdi)
838 movdqu %xmm5,-16(%rdi)
839 addq $32,%r9
840 jnz L$sqr8x_cond_copy
Adam Langleyd9e397b2015-01-22 14:27:53 -0800841
# Epilogue: return 1 and restore the pushed registers from just below the
# entry rsp held in rsi.
842 movq $1,%rax
843 movq -48(%rsi),%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800844
Adam Langleyd9e397b2015-01-22 14:27:53 -0800845 movq -40(%rsi),%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800846
Adam Langleyd9e397b2015-01-22 14:27:53 -0800847 movq -32(%rsi),%r13
Robert Sloana94fe052017-02-21 08:49:28 -0800848
Adam Langleyd9e397b2015-01-22 14:27:53 -0800849 movq -24(%rsi),%r12
Robert Sloana94fe052017-02-21 08:49:28 -0800850
Adam Langleyd9e397b2015-01-22 14:27:53 -0800851 movq -16(%rsi),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800852
Adam Langleyd9e397b2015-01-22 14:27:53 -0800853 movq -8(%rsi),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800854
Adam Langleyd9e397b2015-01-22 14:27:53 -0800855 leaq (%rsi),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800856
Adam Langleyd9e397b2015-01-22 14:27:53 -0800857L$sqr8x_epilogue:
# Bytes F3 C3 encode "rep ret".
858 .byte 0xf3,0xc3
859
Robert Sloana94fe052017-02-21 08:49:28 -0800860
# Identification string embedded in the text section.  The bytes are the
# NUL-terminated ASCII text
#   "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
# followed by padding to a 16-byte boundary.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800861.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
862.p2align 4
863#endif