/*
 * x86-64 Montgomery multiplication routines (GNU as, AT&T syntax, run
 * through the C preprocessor).
 *
 * Text recovered from a repository blame view of blob
 * b32e2f0ef4cc13d79d9adb2932e117c0b018a4b8; the per-line author/commit/
 * timestamp/line-number residue has been stripped so the file assembles.
 *
 * The whole file is compiled only for x86-64 builds that have not
 * defined OPENSSL_NO_ASM.
 */
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

/*
 * OPENSSL_ia32cap_P is declared (hidden visibility) but none of the
 * routines visible in this file reference it.
 */
.extern	OPENSSL_ia32cap_P
.hidden	OPENSSL_ia32cap_P

/*
 * bn_mul_mont -- global entry point and generic (one word per step)
 * Montgomery multiplication loop.
 *
 * Register roles on entry, as the code below uses them (the C prototype
 * is not visible in this file -- confirm names against the header):
 *   %rdi  pointer the result words are stored to
 *   %rsi  first operand array
 *   %rdx  second operand array
 *   %rcx  array that is multiplied by the per-row factor and finally
 *         subtracted (presumably the modulus)
 *   %r8   pointer to one 64-bit word (presumably the Montgomery n0
 *         constant); only that single word is loaded
 *   %r9d  number of 64-bit words; zero-extended to 64 bits on entry
 * Returns 1 in %rax.  %rbx, %rbp, %r12-%r15 are saved and restored.
 *
 * Dispatch on the word count:
 *   count not a multiple of 4, or count < 8      -> generic loop here
 *   %rdx != %rsi                                 -> .Lmul4x_enter
 *   %rdx == %rsi and count a multiple of 8       -> .Lsqr8x_enter
 *   otherwise                                    -> .Lmul4x_enter
 *
 * Stack frame of the generic path (count = %r9, frame base = %rsp,
 * which is 1024-byte aligned):
 *   (%rsp) .. (%rsp,%r9,8)   count+1 temporary words
 *   8(%rsp,%r9,8)            caller's %rsp (value of %rax at entry)
 */
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		/* zero-extend the 32-bit word count */
	movq	%rsp,%rax		/* %rax keeps the caller's %rsp */
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* %r10 = (%rsp - 16 - 8*count) rounded down to a 1 KiB boundary. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/*
	 * Move %rsp down to %r10 in steps of at most 4096 bytes, reading
	 * one word at every step so each page between the old and the new
	 * stack pointer is touched in order.
	 */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* stash the caller's %rsp above the temporaries */
	/* DWARF expression: CFA = *(%rsp + 8 + %r9*8) + 8 */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	movq	%rdx,%r12		/* mulq clobbers %rdx, keep the second operand pointer in %r12 */
	movq	(%r8),%r8		/* %r8 = the single word %r8 pointed to */
	movq	(%r12),%rbx		/* %rbx = word 0 of the second operand */
	movq	(%rsi),%rax

	xorq	%r14,%r14		/* %r14 = outer index */
	xorq	%r15,%r15		/* %r15 = inner index */

	/*
	 * First row.  %rbp = %r8 * low64((%rsi)[0] * %rbx); every step then
	 * accumulates (%rsi)[j]*%rbx + (%rcx)[j]*%rbp with carries and
	 * stores the result one word lower in the temporary array.
	 */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		/* low word is discarded; only the carry is kept */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	/* Row tail: last product, then the two top words of the row. */
	addq	%rax,%r13
	movq	(%rsi),%rax		/* preload (%rsi)[0] for the next row */
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	/* top carry word */

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	/*
	 * Rows 1 .. count-1: same as the first row, but every step also
	 * adds the word already held in the temporary array.
	 */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		/* low word is discarded; only the carry is kept */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax		/* preload (%rsi)[0] for the next row */
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	/* previous top carry word */
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/*
	 * Subtract the (%rcx) array from the temporaries, writing the
	 * difference to (%rdi).  The xorq clears CF for the first sbbq;
	 * movq/leaq/decq/jnz leave CF untouched, so the borrow propagates
	 * through the whole loop.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:
	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/*
	 * %rax = top carry word minus the final borrow.  It is used as a
	 * bit mask to pick the copy source without branching:
	 *   %rsi = (temporaries & %rax) | (%rdi & ~%rax)
	 */
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	/*
	 * Copy the selected source to (%rdi) and overwrite every temporary
	 * word with the loop index, so the intermediate value does not stay
	 * on the stack.
	 */
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	/* %rsi = caller's %rsp saved in the prologue */
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return value */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		/* encoding of "rep ret" */
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont -- Montgomery multiplication, inner loops unrolled to
 * four words per iteration.
 *
 * Not a global symbol: within this file it is reached only through the
 * jump to .Lmul4x_enter in bn_mul_mont, with %rax already holding the
 * caller's %rsp and %r9 the zero-extended word count.  The dispatcher
 * sends here only counts that are multiples of 4 and at least 8.
 *
 * Register roles match the dispatcher's: %rdi result pointer, %rsi and
 * %rdx operand arrays, %rcx the array multiplied by the per-row factor
 * and finally subtracted (presumably the modulus), %r8 pointer to one
 * 64-bit word, %r9 word count.  Returns 1 in %rax.
 *
 * Stack frame (count = %r9, frame base = %rsp, 1024-byte aligned):
 *   (%rsp) .. (%rsp,%r9,8)   count+1 temporary words
 *   8(%rsp,%r9,8)            caller's %rsp
 *   16(%rsp,%r9,8)           saved %rdi (%rdi is reused as scratch)
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		/* zero-extend the 32-bit word count */
	movq	%rsp,%rax		/* %rax keeps the caller's %rsp */
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* %r10 = (%rsp - 32 - 8*count) rounded down to a 1 KiB boundary. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/*
	 * Move %rsp down to %r10 in steps of at most 4096 bytes, reading
	 * one word at every step so each page is touched in order.
	 */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* stash the caller's %rsp above the temporaries */
	/* DWARF expression: CFA = *(%rsp + 8 + %r9*8) + 8 */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	/* save the result pointer; %rdi is scratch below */
	movq	%rdx,%r12		/* mulq clobbers %rdx */
	movq	(%r8),%r8		/* %r8 = the single word %r8 pointed to */
	movq	(%r12),%rbx		/* %rbx = word 0 of the second operand */
	movq	(%rsi),%rax

	xorq	%r14,%r14		/* %r14 = outer index */
	xorq	%r15,%r15		/* %r15 = inner index */

	/*
	 * First row.  %rbp = %r8 * low64((%rsi)[0] * %rbx); words 0 and 1
	 * are handled here, the rest in .L1st4x.
	 */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		/* low word is discarded; only the carry is kept */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	/* Four words per iteration; %r10/%r11 and %r13/%rdi alternate as carry registers. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* Row tail: the last two words, then the top carry word. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		/* preload (%rsi)[0] for the next row */
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	/* top carry word */

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	/*
	 * Rows 1 .. count-1: same as the first row, but every step also
	 * adds the word already held in the temporary array.
	 */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		/* low word is discarded; only the carry is kept */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	/* Row tail: the last two words, then the top carry word. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		/* advance the outer index (flags untouched) */
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		/* preload (%rsi)[0] for the next row */
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	/* add the previous top carry word */
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x

	/*
	 * Subtract the (%rcx) array from the temporaries into (%rdi), four
	 * words per iteration with the loads running ahead of the stores.
	 * %r15 = (count - 4) / 4 loop iterations; the last four words are
	 * finished after the loop.  subq starts the borrow chain; the
	 * movq/leaq/decq/jnz in between do not modify CF.
	 */
	movq	16(%rsp,%r9,8),%rdi	/* restore the result pointer */
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0		/* zero, used below to wipe the temporaries */
	movq	8(%rsp),%rdx
	shrq	$2,%r15
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	/* top carry word */
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	/*
	 * %rax = top carry word minus the final borrow.  It is used as a
	 * bit mask to pick the copy source without branching:
	 *   %rsi = (temporaries & %rax) | (%rdi & ~%rax)
	 */
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-4(%r9),%r15
	orq	%rcx,%rsi
	shrq	$2,%r15

	/*
	 * Copy the selected source to (%rdi) 16 bytes at a time and zero
	 * the temporaries behind it.  The loop moves 32 bytes per
	 * iteration; the first and last 16 bytes are handled outside it.
	 */
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi	/* %rsi = caller's %rsp saved in the prologue */
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			/* return value */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		/* encoding of "rep ret" */
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
/*
 * bn_sqr8x_internal does the actual work for bn_sqr8x_mont.  It is
 * defined in another file; its register contract is not visible here.
 */
.extern	bn_sqr8x_internal
.hidden	bn_sqr8x_internal

/*
 * bn_sqr8x_mont -- wrapper around bn_sqr8x_internal: builds the stack
 * frame, calls the helper, then performs the final subtraction and a
 * branch-free conditional copy into the result buffer.
 *
 * Not a global symbol: within this file it is reached only through the
 * jump to .Lsqr8x_enter in bn_mul_mont (taken when %rdx == %rsi and the
 * word count is a multiple of 8), with %rax already holding the
 * caller's %rsp.
 *
 * Inputs as used below: %rdi result pointer, %rsi operand pointer,
 * %rcx pointer parked in %xmm2 for the helper (presumably the modulus),
 * %r8 pointer to one 64-bit word, %r9d word count.  Returns 1 in %rax.
 *
 * Frame slots written here (frame base = %rsp, 64-byte aligned):
 *   32(%rsp)   the word loaded from (%r8)
 *   40(%rsp)   caller's %rsp
 */
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax		/* %rax keeps the caller's %rsp */
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			/* %r9  = count * 8 (bytes) */
	shlq	$3+2,%r10		/* %r10 = count * 32 */
	negq	%r9			/* %r9  = -(count * 8) */

	/*
	 * Choose the frame base %rbp.  The frame needs 64 + 16*count bytes
	 * (-64 + 2*%r9 below %rsp); its exact position additionally depends
	 * on the low 12 bits of (candidate frame address - %rsi).
	 * NOTE(review): presumably this keeps the frame from aliasing the
	 * input operand modulo 4096 -- confirm against the generator script.
	 */
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		/* %r8 = the single word %r8 pointed to */
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10			/* movq (not xor) so CF from the subq survives for cmovc */
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		/* 64-byte align the frame base */

	/*
	 * Move %rsp down to %rbp in steps of at most 4096 bytes, reading
	 * one word at every step so each page is touched in order.
	 */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		/* %r10 = -(count * 8) */
	negq	%r9			/* %r9  =   count * 8  */

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)		/* caller's %rsp */
	/* DWARF expression: CFA = *(%rsp + 40) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

	/* Park values in SSE registers across the call (hand-encoded movq). */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* movq %rdi,%xmm1 */
.byte	102,73,15,110,218		/* movq %r10,%xmm3 */
	call	bn_sqr8x_internal

	/*
	 * NOTE(review): everything below relies on the values the helper
	 * leaves in %rdi, %rbp, %rax and %r9; that contract is defined with
	 * bn_sqr8x_internal and cannot be verified from this file.  The
	 * loop counters only terminate if %r9 is a negative multiple of 32
	 * here.
	 */
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi -- result pointer parked before the call */
	sarq	$3+2,%rcx		/* %rcx = %r9 / 32: one step per 4 words; CF = last bit shifted out */
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	/*
	 * (%rdi)[i] = (%rbx)[i] - (%rbp)[i] with borrow, four words per
	 * iteration.  leaq/movq/incq/jnz leave CF untouched, so the borrow
	 * is carried across iterations.
	 */
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			/* fold the final borrow into %rax to form the select mask */
	leaq	(%rbx,%r9,1),%rbx	/* rewind both pointers by adding %r9 */
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200		/* movq %rax,%xmm1 */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		/* broadcast the low 32 bits of the mask to all lanes */
	movq	40(%rsp),%rsi		/* %rsi = caller's %rsp saved in the prologue */
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	/*
	 * Per 32 bytes: (%rdi) = ((%rbx) & mask) | ((%rdi) & ~mask), where
	 * ~mask is produced by comparing the zeroed %xmm0 with the mask.
	 * The source words at (%rbx), and the matching words %rdx bytes
	 * away from them, are overwritten with zeros as they are consumed.
	 */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			/* return value */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		/* encoding of "rep ret" */
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * NUL-terminated ASCII identification string embedded in .text:
 * "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
 */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif