// Source blob: 1f673ef81658c3a8280e0d0c60c7dce9d669e826
//
// x86-64 assembly in AT&T syntax, run through the C preprocessor.
// The whole file is compiled only for x86-64 builds that have not
// disabled assembly; the matching endif is the last line of the file.
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

// Capability vector symbol, declared here with hidden visibility.
.extern	OPENSSL_ia32cap_P
.hidden	OPENSSL_ia32cap_P

//-----------------------------------------------------------------------
// bn_mul_mont: word-serial Montgomery multiplication, plus the
// dispatcher that selects the 4-way and 8-way specialised routines.
//
// Presumed C prototype (taken from the bignum headers, confirm there):
//   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                   const BN_ULONG *np, const BN_ULONG *n0, int num);
//
// Register roles as used by the code below (System V argument order):
//   %rdi  result array, written by the final subtract and copy
//   %rsi  first operand array
//   %rdx  second operand array, one word consumed per outer pass
//   %rcx  modulus array
//   %r8   pointer to one word; the word itself is loaded into %r8
//   %r9d  number of 64-bit words in each array
// Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are saved and restored.
//
// Scratch frame of the generic path (after the stack has been lowered):
//   0(%rsp) .. 8*num-8(%rsp)   running result tp[0..num-1]
//   (%rsp,%r9,8)               top carry word tp[num]
//   8(%rsp,%r9,8)              stack pointer value captured on entry
//-----------------------------------------------------------------------
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
// Zero-extend the 32-bit word count and remember the entry stack pointer.
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
// Dispatch:
//   num not a multiple of 4, or num < 8      -> generic path below
//   %rdx != %rsi (two distinct operands)     -> .Lmul4x_enter
//   same operand and num a multiple of 8     -> .Lsqr8x_enter
//   same operand otherwise                   -> .Lmul4x_enter
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

// %r10 = (%rsp - 16 - 8*num) rounded down to a 1024-byte boundary.
// That is the final frame base: num result words plus two extra slots.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

// Lower %rsp to %r10 in steps of at most one 4096-byte page, reading
// one word at every step so that each page between the old and the new
// stack pointer is touched in order.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

// Park the entry stack pointer just above the tp[num] slot.
	movq	%rax,8(%rsp,%r9,8)
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
// %r12 = second operand pointer (since %rdx is clobbered by mulq),
// %r8 = the word that %r8 pointed to, %rbx = current multiplier word.
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

// %r14 = outer index i, %r15 = inner index j.
	xorq	%r14,%r14
	xorq	%r15,%r15

// First pass (i = 0): there is no previous tp to accumulate.
// %rbp = low64(ap[0]*bp[0]) * n0, the per-pass reduction multiplier m.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

// np[0]*m + low word: only the carry out of this sum is kept (%r13).
	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
// Combine np[j]*m (in %rdx:%rax) with the ap[j]*bp[0] chain (%r11)
// and store the result one word lower than it was produced.
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
// ap[j]*bp[0] added into the %r11 chain; carry moves to %r10.
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

// Tail of the first pass; %rax is reloaded with ap[0] for the next pass.
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

// Merge the two carry chains into tp[num-1] and the top carry tp[num].
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
// Passes i = 1 .. num-1: same as the first pass, but every column also
// accumulates the tp word left by the previous pass.
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

// %rbp = low64(ap[0]*bp[i] + tp[0]) * n0
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
// %r10 holds tp[j]; add ap[j]*bp[i] plus the incoming carry to it.
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

// Tail of the pass; here %r15 == num, so the load fetches tp[num].
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

// Final step: write tp - modulus into the result array, tracking the
// borrow.  The xorq clears CF; mov, lea and dec leave CF untouched, so
// the borrow survives from one sbbq to the next.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:
	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

// %rax = tp[num] - borrow: all ones when the subtraction underflowed
// (tp was already reduced), zero otherwise.  Use it as a mask to pick
// the copy source without branching:
//   %rsi = mask ? tp : result array
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
// Copy the selected words into the result and overwrite each scratch
// word with its own index so the intermediate value does not linger.
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

// Reload the entry stack pointer and restore the callee-saved registers
// from just below it.
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
// Encoded "rep ret".
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
//-----------------------------------------------------------------------
// bn_mul4x_mont: Montgomery multiplication with the inner loops
// unrolled four ways.  The symbol is not exported; the visible way in
// is a jump to .Lmul4x_enter with %rax already holding the entry stack
// pointer and %r9 already zero-extended.
//
// Register roles on entry:
//   %rdi  result array          %rsi  first operand array
//   %rdx  second operand array  %rcx  modulus array
//   %r8   pointer to one word   %r9   word count
// The loop structure (do-while loops stepping by 4, and a subtract loop
// that runs (num-4)/4 times) relies on num being a multiple of 4 and
// greater than 4.
// Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are saved and restored.
//
// Scratch frame (after the stack has been lowered):
//   0(%rsp) .. 8*num-8(%rsp)   running result tp[0..num-1]
//   (%rsp,%r9,8)               top carry word tp[num]
//   8(%rsp,%r9,8)              stack pointer value captured on entry
//   16(%rsp,%r9,8)             saved %rdi (it doubles as an accumulator)
//-----------------------------------------------------------------------
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

// %r10 = (%rsp - 32 - 8*num) rounded down to a 1024-byte boundary.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

// Lower %rsp to %r10 at most one 4096-byte page at a time, reading a
// word at each step so every intervening page is touched in order.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
// Save the result pointer; %rdi is reused as a carry accumulator.
	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

// %r14 = outer index i, %r15 = inner index j.
	xorq	%r14,%r14
	xorq	%r15,%r15

// First pass (i = 0).  %rbp = low64(ap[0]*bp[0]) * n0.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
// Four columns per iteration.  Each column is a mulq by %rbx (operand
// word times bp[0]) followed by a mulq by %rbp (modulus word times m).
// The operand chain alternates between %r10 and %r11, the modulus
// chain between %r13 and %rdi.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

// Last two columns of the first pass; %rax is reloaded with ap[0].
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

// Merge both carry chains into tp[num-1] and the top carry tp[num].
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.align	4
.Louter4x:
// Passes i = 1 .. num-1: as the first pass, with the previous tp words
// folded into the operand chain.
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

// %rbp = low64(ap[0]*bp[i] + tp[0]) * n0
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

// Last two columns of this pass; the outer index is bumped here and
// %rax is reloaded with ap[0] for the next pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

// Merge the carry chains and the previous top carry tp[num].
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
// Final step: result = tp - modulus, four words per iteration.
// %rdi gets the saved result pointer back, %r15 = (num-4)/4 is the loop
// count, and %xmm0 is zeroed for the scratch wipe further down.  The
// flag-changing shrq and xorq come before subq, which starts the borrow
// chain; everything between the sbbq instructions preserves CF.
	movq	16(%rsp,%r9,8),%rdi
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r15
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

// Last four words; the load at offset 32 fetches the top carry tp[num].
	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

// %rax = tp[num] - borrow: all ones keeps tp, zero keeps the difference
// already written to the result.  %rsi = mask ? tp : result array.
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-4(%r9),%r15
	orq	%rcx,%rsi
	shrq	$2,%r15

// Copy the selected value into the result 16 bytes at a time and zero
// the scratch area behind it.  %r14 is a byte offset in this loop.
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
// Reload the entry stack pointer and restore the callee-saved registers.
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
// Encoded "rep ret".
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
// Squaring core, defined in another object file.
.extern	bn_sqr8x_internal
.hidden	bn_sqr8x_internal

//-----------------------------------------------------------------------
// bn_sqr8x_mont: wrapper that builds a scratch frame, calls
// bn_sqr8x_internal, then does the final conditional subtraction and
// wipes the scratch area.  The symbol is not exported; the visible way
// in is a jump to .Lsqr8x_enter with %rax holding the entry stack
// pointer.
//
// Register roles on entry:
//   %rdi  result array          %rsi  input array
//   %rcx  modulus array         %r8   pointer to one word
//   %r9d  word count
// Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are saved and restored.
//
// Frame slots used here:
//   32(%rsp)  the word loaded from (%r8)
//   40(%rsp)  stack pointer value captured on entry
//
// NOTE(review): what bn_sqr8x_internal leaves in %rax, %rbp, %rdi and
// %r9 cannot be seen from this file.  The code after the call counts
// %rcx and %r9 upward to zero, so %r9 is presumably -8*num on return,
// and %rbp is presumably the modulus pointer.  Confirm against that
// routine before changing anything below the call.
//-----------------------------------------------------------------------
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

// %r10 = 32*num, %r9 = -8*num (size of the input in bytes, negated).
	movl	%r9d,%r10d
	shll	$3,%r9d
	shlq	$3+2,%r10
	negq	%r9

// Choose the frame base in %rbp.  The frame holds 16*num + 64 bytes.
// %r11 = (prospective base - %rsi) mod 4096, the page-offset distance
// between the frame and the input array; the base is then moved down
// by an amount derived from it (apparently so that the frame and the
// input do not share page offsets, which SOURCE does not state).
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
// Distance was larger than 32*num: shift down by
// max(0, distance - (4096 - frame size)) instead.  The zero is loaded
// with movq, not xor, because cmovc consumes the carry from subq.
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
// Align the base to 64 bytes, then lower %rsp to it at most one
// 4096-byte page at a time, reading a word at each step.
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

// %r10 = -8*num, %r9 = 8*num.
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

// Hand-encoded moves that stash values in SSE registers across the call:
//   movq %rcx,%xmm2   (modulus pointer)
.byte	102,72,15,110,209
	pxor	%xmm0,%xmm0
//   movq %rdi,%xmm1   (result pointer)
.byte	102,72,15,110,207
//   movq %r10,%xmm3   (-8*num)
.byte	102,73,15,110,218
	call	bn_sqr8x_internal

// %rbx = %rdi + %r9, %rcx = %r9 / 32 (iteration counter), %rdx = %r9.
// The .byte sequence is "movq %xmm1,%rdi", bringing the result pointer
// back.  sarq shifts out zero bits when %r9 is a multiple of 64, so CF
// is clear when the borrow chain starts.
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
// result[] = (%rbx)[] - (%rbp)[], four words per iteration.  mov, lea
// and inc leave CF untouched, so the borrow carries across iterations.
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

// Fold the final borrow into %rax to form the selection mask, and step
// %rbx and %rdi back to where the subtract loop started.
	sbbq	$0,%rax
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

// "movq %rax,%xmm1", then broadcast its low dword to all four lanes.
.byte	102,72,15,110,200
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
// Per 32 bytes: result = (scratch AND mask) OR (result AND NOT mask),
// with no branch on the mask.  The scratch words just read, and the
// words at the same position offset by %rdx, are zeroed on the way.
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
// %xmm0 is zero here, so the compare yields the complement of the mask.
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

// %rsi holds the entry stack pointer; restore callee-saved registers.
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
// Encoded "rep ret".
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
// NUL-terminated ASCII attribution string:
// "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif