// Scraped from blob 1ec58ca072a872e53ae849c84caa40fc55ad8a10.
// Assemble the contents only for x86-64 builds that have not defined
// OPENSSL_NO_ASM.
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

// Declared as an external symbol with hidden visibility.
.extern	OPENSSL_ia32cap_P
.hidden	OPENSSL_ia32cap_P

//----------------------------------------------------------------------
// bn_mul_mont_gather5
//
// Word-serial Montgomery multiplication whose multiplier words are
// fetched from a table through a masked read of a whole table row.
//
// ABI:  System V AMD64, AT&T syntax.
// In (roles as used by the code below):
//   %rdi      destination, num 64-bit words
//   %rsi      multiplicand words
//   %rdx      table base; one 256-byte row is read per multiplier word
//   %rcx      modulus words
//   %r8       pointer to one 64-bit word, loaded once and multiplied
//             into the low accumulator word to get the reduction factor
//   %r9d      num (word count)
//   8(%rsp)   32-bit table index (first stack argument)
// Out:  %rax = 1; result stored at (%rdi).
// Saves/restores %rbx, %rbp, %r12-%r15.  Uses %xmm0-%xmm5.
//
// Stack frame built below (T = new %rsp):
//   T[0 .. num-1]        running accumulator words
//   T[num]               top carry word
//   T[num+1]             caller's %rsp at entry
//   (T+8*num+24)&-16     sixteen 16-byte selection masks (256 bytes)
//
// When num is a multiple of 8 control transfers to .Lmul4x_enter, which
// lives in bn_mul4x_mont_gather5.
//----------------------------------------------------------------------
.globl	bn_mul_mont_gather5
.hidden	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	movl	%r9d,%r9d		// zero-extend num into %r9
	movq	%rsp,%rax		// entry %rsp; CFA is tracked via %rax
.cfi_def_cfa_register	%rax
	testl	$7,%r9d
	jnz	.Lmul_enter		// num % 8 != 0: generic loop below
	jmp	.Lmul4x_enter		// num % 8 == 0: 4-way unrolled path

.align	16
.Lmul_enter:
	movd	8(%rsp),%xmm5		// table index, read before the pushes move %rsp
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	// %r10 = (%rsp - 8*num - 280) rounded down to a 1024-byte boundary:
	// the address of the new frame.
	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	// Move %rsp to the frame in 4096-byte steps, loading one word at
	// each step, so every page between the old and the new stack
	// pointer is touched from high addresses to low.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10	// .Linc is defined outside this function
	movq	%rax,8(%rsp,%r9,8)	// T[num+1] = entry %rsp
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

	leaq	128(%rdx),%r12		// row pointer, biased by +128
	movdqa	0(%r10),%xmm0		// initial counter vector
	movdqa	16(%r10),%xmm1		// counter step (kept in %xmm4)
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10		// 112(%r10) = 16-byte aligned mask area

	// Build 16 masks.  The index is broadcast to all lanes of %xmm5;
	// each successive counter vector is the previous one plus the
	// step, and pcmpeqd turns it into an all-ones/all-zeros lane mask.
	// Masks are stored at 112(%r10) .. 352(%r10).  The lone 0x67 bytes
	// are address-size prefixes on register-only instructions
	// (presumably code-alignment padding).
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	// Gather the first multiplier word.  All sixteen 16-byte columns of
	// the row are read and ANDed with their masks, so the addresses
	// touched do not depend on the index.  The last four masks are
	// still live in %xmm0-%xmm3 and are applied directly.
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	// swap 64-bit halves ...
	por	%xmm1,%xmm0		// ... and fold them together
	leaq	256(%r12),%r12		// advance to the next row
.byte	102,72,15,126,195		// encoding of: movq %xmm0,%rbx

	movq	(%r8),%r8		// %r8 now holds the word itself
	movq	(%rsi),%rax

	xorq	%r14,%r14		// outer index i = 0
	xorq	%r15,%r15		// inner index j = 0

	// First pass: accumulator = (a * b0 + m * n) >> 64, where
	// m = low64(a[0] * b0) * (%r8 word).
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		// %rbp = m
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		// low word sum is discarded; only the carry is used
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	// T[num-1]
	movq	%rdx,(%rsp,%r9,8)	// T[num] = top carry

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	// Gather the next multiplier word with the masks saved above.
	// -128(%rdx) addresses the same mask area as 112(%r10) did.
	leaq	24+128(%rsp,%r9,8),%rdx
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195		// encoding of: movq %xmm0,%rbx

	// Accumulator = (accumulator + a * b_i + m * n) >> 64.
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// %rbp = m for this iteration
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		// low word sum is discarded; only the carry is used
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10	// previous top carry T[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	// Store accumulator - modulus to the destination.  The xorq clears
	// CF for the first sbbq; inside the loop movq/leaq leave flags
	// alone and decq does not modify CF, so the borrow carries across
	// iterations.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	// %rax = T[num] - borrow, used as a select mask (expected to be 0
	// or all-ones).  Branch-free pointer select:
	//   mask set   -> %rsi = accumulator (unsubtracted value)
	//   mask clear -> %rsi = destination (difference already there)
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	// overwrite T[i] with the loop index
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	// entry %rsp saved in T[num+1]
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			// return value

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		// encoding of: rep ret
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
//----------------------------------------------------------------------
// bn_mul4x_mont_gather5
//
// Frame set-up and tear-down around mul4x_internal.  No .globl
// directive for this symbol is visible here; bn_mul_mont_gather5 enters
// at .Lmul4x_enter with %rax already holding the entry %rsp.
//
// In:  same register and stack arguments as bn_mul_mont_gather5
//      (%rdi, %rsi, %rdx, %rcx, %r8, %r9d, 8(%rsp)); they are passed
//      through to mul4x_internal, with %r9 converted to a byte count.
// Out: %rax = 1.
// Saves/restores %rbx, %rbp, %r12-%r15.  The entry %rsp is kept at
// 40(%rsp) of the new frame.
//----------------------------------------------------------------------
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
.byte	0x67				// address-size prefix on the next instruction
	movq	%rsp,%rax		// entry %rsp; CFA is tracked via %rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmul4x_prologue:

.byte	0x67
	shll	$3,%r9d			// %r9 = num * 8 (bytes)
	leaq	(%r9,%r9,2),%r10	// %r10 = 3 * bytes
	negq	%r9			// %r9 = -bytes

	// Choose the frame address.  The candidate is
	// %rsp - 320 - 2*bytes; it is then lowered by an amount derived
	// from (candidate - %rdi) mod 4096, so the final placement depends
	// on the frame's page offset relative to the destination buffer.
	// NOTE(review): the rationale is not stated in this file; it looks
	// like an anti-aliasing measure -- confirm against the generator.
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt		// 3*bytes < offset (unsigned)
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10			// mov, not xor: CF from subq feeds cmovc
	cmovcq	%r10,%r11		// clamp to 0 when the subtraction borrowed
	subq	%r11,%rbp
.Lmul4xsp_done:
	andq	$-64,%rbp		// 64-byte align the frame

	// Walk %rsp down to %rbp in 4096-byte steps, loading one word at
	// each step so every intervening page is touched.
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	negq	%r9			// back to +bytes

	movq	%rax,40(%rsp)		// save entry %rsp in the frame
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi		// entry %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			// return value

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		// encoding of: rep ret
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

//----------------------------------------------------------------------
// mul4x_internal
//
// Four-words-per-iteration Montgomery multiplication with masked table
// gather.  Internal helper with a private register contract; it does
// not return by itself but tail-jumps to .Lsqr4x_sub_entry, which is
// defined outside this view (presumably the final conditional
// subtraction -- confirm there).
//
// In (as read by the code below):
//   %rax      caller's entry %rsp; 8(%rax) holds the 32-bit table index
//   %rdx      table base; one 256-byte row per multiplier word
//   %rsi      multiplicand words
//   %rcx      modulus words
//   %r8       pointer to the word used to derive the reduction factor
//   %r9       operand length in bytes (positive)
//   %rdi      destination pointer (only saved here, at 56+8(%rsp))
//   %rsp      caller's frame minus 8 (return address), hence the "+8"
//             in the frame offsets
// Frame slots used: 16+8(%rsp) end-of-table pointer, 56+8(%rsp)
// destination, 64+8(%rsp).. accumulator, followed (at accumulator end
// + 16) by the sixteen 16-byte selection masks.
// Clobbers: %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %rbp, %r8-%r15,
//           %xmm0-%xmm5, flags.
//----------------------------------------------------------------------
.type	mul4x_internal,@function
.align	32
mul4x_internal:
	shlq	$5,%r9			// bytes * 32 = 256 bytes per word
	movd	8(%rax),%xmm5		// table index
	leaq	.Linc(%rip),%rax	// .Linc is defined outside this function
	leaq	128(%rdx,%r9,1),%r13	// biased row pointer value that ends the outer loop
	shrq	$5,%r9			// restore byte count
	movdqa	0(%rax),%xmm0		// initial counter vector
	movdqa	16(%rax),%xmm1		// counter step (kept in %xmm4)
	leaq	88-112(%rsp,%r9,1),%r10	// 112(%r10) = mask area
	leaq	128(%rdx),%r12		// row pointer, biased by +128

	// Build 16 masks: broadcast the index, then compare it against 16
	// successive counter vectors (each the previous plus the step).
	// Masks go to 112(%r10) .. 352(%r10).  The 0x67 bytes are
	// address-size prefixes on register-only instructions (presumably
	// alignment padding).
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	// Gather the first multiplier word: every 16-byte column of the row
	// is read and masked, then the partial results are ORed together.
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	// swap 64-bit halves ...
	por	%xmm1,%xmm0		// ... and fold them together
	leaq	256(%r12),%r12		// advance to the next row
.byte	102,72,15,126,195		// encoding of: movq %xmm0,%rbx

	movq	%r13,16+8(%rsp)		// row-pointer value that ends the outer loop
	movq	%rdi,56+8(%rsp)		// destination; %rdi is reused as scratch

	movq	(%r8),%r8		// %r8 now holds the word itself
	movq	(%rsi),%rax
	leaq	(%rsi,%r9,1),%rsi	// %rsi = end of multiplicand
	negq	%r9			// %r9 = -bytes; (%rsi,%r9) = first word

	// First pass: accumulator = (a * b0 + m * n) >> 64, with
	// m = low64(a[0] * b0) * (%r8 word).  %r14 walks the accumulator,
	// %rcx walks the modulus, %r15 is a negative byte offset from the
	// end of the multiplicand that counts up to 0.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		// %rbp = m
	leaq	64+8(%rsp),%r14		// accumulator base
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		// low word sum is discarded; only the carry is used
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15		// lea: flags from addq survive for adcq
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	// Tail of the first pass: last two words, then the top carry.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	// reload first multiplicand word
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx	// rewind modulus pointer

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi			// %rdi = top carry
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
.Louter4x:
	// Gather the next multiplier word.  On entry %r14 points just past
	// the accumulator; the masks start 16 bytes beyond that.
	leaq	16+128(%r14),%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		// encoding of: movq %xmm0,%rbx

	// Accumulator = (accumulator + a * b_i + m * n) >> 64.
	movq	(%r14,%r9,1),%r10	// accumulator word 0
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// %rbp = m for this iteration
	movq	%rdx,%r11
	movq	%rdi,(%r14)		// park previous top carry past the accumulator

	leaq	(%r14,%r9,1),%r14	// rewind accumulator pointer

	mulq	%rbp
	addq	%rax,%r10		// low word sum is discarded; only the carry is used
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	// Tail of the inner pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		// operands swapped for the last product:
	movq	-8(%rcx),%rbp		// %rax = m, %rbp = last modulus word
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	// reload first multiplicand word
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx	// rewind modulus pointer

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13		// add the parked previous top carry
	adcq	$0,%rdi			// %rdi = new top carry
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		// stop when the row pointer reaches the saved end value
	jb	.Louter4x

	// Set up registers for .Lsqr4x_sub_entry (outside this view):
	//   %rax  = 0 - (top carry | borrow of last-modulus-word - top
	//           accumulator word); %r15 is 0 here after the inner loop,
	//           so adcq turns CF into 0/1
	//   %rbx  = accumulator base, %rbp = modulus base
	//   %r12  = modulus[0] - 1, %r13-%r15 = modulus[1..3]
	//   %rcx  = -bytes >> 5 (arithmetic), %rdi = destination, %r10 = 0
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax
	leaq	(%r14,%r9,1),%rbx
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx
	movq	56+8(%rsp),%rdi
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry
.size	mul4x_internal,.-mul4x_internal
1073.globl bn_power5
1074.hidden bn_power5
1075.type bn_power5,@function
/*
 * bn_power5
 *
 * Runs the pair __bn_sqr8x_internal + __bn_post4x_internal five times and
 * then mul4x_internal (defined outside this excerpt) once, all on a scratch
 * frame carved out below the caller's stack, and returns 1 in %rax.
 * NOTE(review): presumably this raises the input to the 32nd power in
 * Montgomery form and multiplies by one gathered table entry -- confirm
 * against the C prototype, which is not visible here.
 *
 * Arguments, as consumed by the code below:
 *   %rdi  parked in %xmm1; __bn_post4x_internal reloads it as its output pointer
 *   %rsi  input vector read by __bn_sqr8x_internal
 *   %rdx  parked in %xmm4 and handed back in %rdx to mul4x_internal
 *   %rcx  parked in %xmm2; reloaded into %rbp for the reduction and the
 *         final subtraction
 *   %r8   pointer to one qword, which is copied to 32(%rsp)
 *   %r9d  length in 8-byte words (scaled by 8 below)
 *
 * Frame after allocation:
 *   32(%rsp)  qword loaded through %r8
 *   40(%rsp)  stack pointer at entry (used for unwinding and the epilogue)
 *   48(%rsp)  start of the scratch area used by the callees
 *
 * Callee-saved %rbx, %rbp, %r12-%r15 are pushed on entry and reloaded from
 * the entry stack pointer in the epilogue.
 */
.align	32
bn_power5:
.cfi_startproc
	movq	%rsp,%rax			/* keep the entry stack pointer; CFA is tracked through %rax */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpower5_prologue:

	shll	$3,%r9d				/* len = words * 8 (bytes) */
	leal	(%r9,%r9,2),%r10d		/* %r10 = 3*len */
	negq	%r9				/* %r9 = -len */
	movq	(%r8),%r8			/* fetch the qword behind the pointer argument */

	/*
	 * Pick the frame bottom (%rbp). %r11 = (rsp - 320 - 2*len - rdi) mod 4096
	 * is the page offset of a candidate frame relative to the output buffer.
	 * If it does not exceed 3*len the frame is simply moved down by that
	 * amount; otherwise .Lpwr_sp_alt applies a different, clamped adjustment.
	 * NOTE(review): presumably this keeps the frame from aliasing the
	 * caller's buffers modulo the page size -- the generator's own comment
	 * was stripped, so confirm against the perlasm source.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10				/* mov, not xor: the borrow from subq must reach cmovc */
	cmovcq	%r10,%r11			/* clamp the adjustment to 0 on borrow */
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp			/* 64-byte align the frame bottom */
	/*
	 * Move %rsp down to %rbp one 4096-byte page at a time, reading a qword
	 * at every step so that each page of the new frame is touched in order.
	 */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10			/* %r10 = -len */
	negq	%r9				/* %r9  = +len */

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)			/* entry %rsp; the CFI expression below reads it back */
	/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
.byte	102,72,15,110,207			/* movq %rdi,%xmm1 */
.byte	102,72,15,110,209			/* movq %rcx,%xmm2 */
.byte	102,73,15,110,218			/* movq %r10,%xmm3 */
.byte	102,72,15,110,226			/* movq %rdx,%xmm4 */

	/* five rounds of square + reduce (sqr8x) followed by the final subtraction (post4x) */
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209			/* movq %xmm2,%rcx */
.byte	102,72,15,126,226			/* movq %xmm4,%rdx */
	movq	%rsi,%rdi			/* %rsi was left at the start of the output by post4x */
	movq	40(%rsp),%rax			/* entry %rsp -- presumably lets mul4x_internal reach a stack argument; confirm */
	leaq	32(%rsp),%r8			/* address of the saved qword */

	call	mul4x_internal

	movq	40(%rsp),%rsi			/* entry stack pointer */
.cfi_def_cfa	%rsi,8
	movq	$1,%rax				/* return value */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	bn_power5,.-bn_power5
1204
/*
 * bn_sqr8x_internal / __bn_sqr8x_internal  (squaring phase)
 *
 * Squares the vector at %rsi into a double-length scratch vector and then
 * falls straight through into __bn_sqr8x_reduction.
 *
 * Expected on entry, as consumed below:
 *   %rsi    start of the input vector a[]
 *   %r9     vector length in bytes
 *   %r10    -%r9
 *   %xmm2   pointer that is reloaded into %rbp on the last line of this phase
 * Scratch vector: 48+8(%rsp) up to 48+8(%rsp,%r9,2). The extra 8 in every
 * %rsp-relative operand skips the return address pushed by the call, so the
 * offsets match the caller's 32/40/48(%rsp) frame slots.
 *
 * Phases:
 *   1. Straight-line code + .Lsqr4x_1st: products of a[0] and a[1] with the
 *      words that follow them are stored (not accumulated) into the scratch.
 *   2. .Lsqr4x_outer / .Lsqr4x_inner: the same for each following pair of
 *      words, now added to what is already in the scratch; %rbp advances by
 *      16 bytes (two words) per outer pass until it reaches zero.
 *   3. Straight-line tail for the last words.
 *   4. .Lsqr4x_shift_n_add: every scratch word is doubled (shifted left by
 *      one bit across word boundaries) and a[i]*a[i] is added on top.
 *
 * Clobbers %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %rbp, %r8, %r10-%r15, flags.
 * The lone 0x67 bytes precede register-only instructions, where an
 * address-size prefix has no effect -- presumably padding for alignment.
 */
.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,@function
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:
	leaq	32(%r10),%rbp			/* %rbp = 32 - len: negative running byte index */
	leaq	(%rsi,%r9,1),%rsi		/* %rsi = end of a[] */

	movq	%r9,%rcx

	/* ---- phase 1: first two rows -------------------------------------- */
	movq	-32(%rsi,%rbp,1),%r14		/* a[0] */
	leaq	48+8(%rsp,%r9,2),%rdi		/* end of scratch */
	movq	-24(%rsi,%rbp,1),%rax		/* a[1] */
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx		/* a[2] */
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10


	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx			/* inner index starts at the outer one */
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx			/* four words per pass */

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp			/* next pair of rows */
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	.Lsqr4x_outer

	/* ---- phase 2: remaining row pairs, accumulated into the scratch ---- */
.align	32
.Lsqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx			/* two words per pass; lea keeps CF for the adc below */
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer

	/* ---- phase 3: tail, %rbp == 0 and %rcx == 0 here ------------------- */
	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14			/* %r14 = bit carried in from the previous word's shift */
	subq	%r9,%rbp			/* %rbp = 16 - len for the next phase */
	xorq	%r15,%r15			/* %r15 = saved addition carry (0 or -1) */

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)			/* top scratch word starts at zero */

	/*
	 * ---- phase 4: scratch = 2*scratch + squares --------------------------
	 * lea (carry,word,2) doubles a word and inserts the bit shifted out of
	 * the word below; shr $63 extracts the bit to pass upwards. mul clobbers
	 * the flags, so the running addition carry is parked in %r15 with sbb
	 * and turned back into CF with neg before each adc pair.
	 */
	movq	-16(%rsi,%rbp,1),%rax		/* a[0] */
	leaq	48+8(%rsp),%rdi			/* start of scratch */
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp			/* four input words per pass */
	jnz	.Lsqr4x_shift_n_add

	/* last two input words */
	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213			/* movq %xmm2,%rbp -- pointer used by the reduction that follows */
/*
 * __bn_sqr8x_reduction
 *
 * Word-by-word Montgomery-style reduction of the double-length scratch
 * vector, eight words per window. Entered by fall-through from the squaring
 * code above it and by a direct call from bn_from_mont8x.
 *
 * Expected on entry, as consumed below:
 *   %rbp        start of the vector n[] the scratch is reduced by
 *   %r9         length in bytes
 *   %xmm2       copy of %rbp, reloaded at the end of every window
 *   %xmm3       -length, reloaded into %r9 at the end of every window
 *   32+8(%rsp)  multiplier applied to the lowest live word to form each
 *               per-word factor m (NOTE(review): presumably n0 = -1/n mod
 *               2^64 -- confirm against the caller)
 *   48+8(%rsp)  scratch vector, 2*length bytes
 * Frame slots written here:
 *   0+8(%rsp)   end of n[]
 *   8+8(%rsp)   end of the scratch vector, which also holds the carry word
 *
 * On return %rax holds the carry out of the top window and %rdi points at
 * the end of the scratch; __bn_post4x_internal consumes both.
 * The lone 0x66/0x67 bytes are prefixes with no effect on the instruction
 * that follows (REX.W wins over 0x66; 0x67 precedes a register-only move)
 * -- presumably alignment padding.
 */
__bn_sqr8x_reduction:
	xorq	%rax,%rax			/* top carry starts at 0 */
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)			/* end of n[] */
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)			/* end of scratch */
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi		/* step back by length: start of this window */
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)			/* park the carry from the previous window */
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx			/* m = lowest word * saved multiplier */
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

	/*
	 * Eight passes; each multiplies n[0..7] by the current m, shifts the
	 * eight-word window %r8..%r15 down by one word and computes the next m.
	 */
.align	32
.L8x_reduce:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8				/* low product word is dropped; CF = (%r8 != 0) is used as its carry */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	/* remember m for the tail passes */
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi			/* next m, from the new lowest word */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp			/* next eight words of n[] */
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_no_tail			/* n[] exhausted; CF is clear on this path */

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi			/* park the carry (0 or -1) */

	movq	48+56+8(%rsp),%rbx		/* first saved m */
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

	/* multiply the remaining words of n[] by the eight saved m values */
.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	/* next saved m */
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi				/* restore the parked carry into CF */
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8			/* fold in the carry word parked at the end of scratch */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi				/* restore the parked carry into CF */
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax				/* carry out of this window */
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213			/* movq %xmm2,%rbp -- rewind to the start of n[] */

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217			/* movq %xmm3,%r9 -- restore -length */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop		/* until the end of the scratch is reached */
	.byte	0xf3,0xc3			/* rep ret */
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal
 *
 * Final masked subtraction after __bn_sqr8x_reduction. The output is
 * written four words per pass as
 *     out[i] = t[i] + ((~n[i]) & mask) + carry
 * where mask = -%rax (all ones if %rax was 1, zero if it was 0) and the
 * first word of n[] is decremented before being complemented. With the mask
 * set this adds the two's complement of n[], i.e. subtracts it; with the
 * mask clear it copies t[] unchanged. The same loads, stores and arithmetic
 * run in both cases. The decrement trick relies on n[0] being non-zero.
 *
 * Expected on entry, as consumed below:
 *   %rax    0 or 1 (carry left by the reduction)
 *   %rbp    start of n[]
 *   %rdi    end of the scratch vector; %rdi + %r9 is where t[] starts
 *   %r9     -length in bytes
 *   %xmm1   output pointer
 * On return %rsi = start of the output, %rdi = its end, %r10 = -length,
 * %r9 = +length (the register state __bn_sqr8x_internal expects).
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx		/* %rbx = t[]: upper half of the scratch */
	movq	%r9,%rcx
.byte	102,72,15,126,207			/* movq %xmm1,%rdi */
	negq	%rax				/* 1 -> all-ones mask, 0 -> zero mask */
.byte	102,72,15,126,206			/* movq %xmm1,%rsi */
	sarq	$3+2,%rcx			/* negative count of four-word groups */
	decq	%r12				/* ~(n[0]-1) == -n[0], so no initial carry-in is needed */
	xorq	%r10,%r10			/* parked carry = 0 */
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10				/* parked carry back into CF */
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10			/* park the carry across the loop counter update */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10			/* %r10 = -length */
	negq	%r9				/* %r9  = +length */
	.byte	0xf3,0xc3			/* rep ret */
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery
 *
 * Dispatcher: if the word count in %r9d is a multiple of 8 it tail-jumps to
 * bn_from_mont8x with every argument register untouched; otherwise it
 * returns 0 in %eax without doing any work.
 */
.globl	bn_from_montgomery
.hidden	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d				/* low three bits set => not a multiple of 8 */
	jz	bn_from_mont8x
	xorl	%eax,%eax			/* unsupported length: return 0 */
	.byte	0xf3,0xc3			/* rep ret */
.size	bn_from_montgomery,.-bn_from_montgomery
2051
/*
 * bn_from_mont8x
 *
 * Copies the input vector into the low half of a double-length scratch
 * frame, zeroes the high half, runs __bn_sqr8x_reduction and
 * __bn_post4x_internal on it, wipes the scratch and returns 1 in %rax.
 * Reached only through bn_from_montgomery, i.e. with a word count that is a
 * multiple of 8.
 *
 * Arguments, as consumed by the code below:
 *   %rdi  parked in %xmm1; __bn_post4x_internal reloads it as its output pointer
 *   %rsi  input vector, read 64 bytes per pass with unaligned loads
 *   %rcx  vector the scratch is reduced by (copied to %rbp and %xmm2)
 *   %r8   pointer to one qword, which is copied to 32(%rsp)
 *   %r9d  length in 8-byte words (scaled by 8 below)
 *   %rdx  not read
 *
 * Frame after allocation:
 *   32(%rsp)  qword loaded through %r8
 *   40(%rsp)  stack pointer at entry
 *   48(%rsp)  scratch vector, 2*length bytes (16-byte aligned, because %rsp
 *             is 64-byte aligned)
 *
 * Callee-saved %rbx, %rbp, %r12-%r15 are pushed on entry and reloaded from
 * the entry stack pointer in the epilogue. The lone 0x67 bytes precede
 * register-only moves, where the prefix has no effect -- presumably padding.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax			/* keep the entry stack pointer; CFA is tracked through %rax */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d				/* len = words * 8 (bytes) */
	leaq	(%r9,%r9,2),%r10		/* %r10 = 3*len */
	negq	%r9				/* %r9 = -len */
	movq	(%r8),%r8			/* fetch the qword behind the pointer argument */

	/*
	 * Pick the frame bottom (%rbp). %r11 = (rsp - 320 - 2*len - rdi) mod 4096
	 * is the page offset of a candidate frame relative to the output buffer.
	 * If it does not exceed 3*len the frame is moved down by that amount;
	 * otherwise .Lfrom_sp_alt applies a different, clamped adjustment.
	 * NOTE(review): presumably this avoids aliasing the caller's buffers
	 * modulo the page size -- confirm against the perlasm source.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10				/* mov, not xor: the borrow from subq must reach cmovc */
	cmovcq	%r10,%r11			/* clamp the adjustment to 0 on borrow */
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp			/* 64-byte align the frame bottom */
	/*
	 * Move %rsp down to %rbp one 4096-byte page at a time, reading a qword
	 * at every step so that each page of the new frame is touched in order.
	 */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10			/* %r10 = -len */
	negq	%r9				/* %r9  = +len */

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)			/* entry %rsp; the CFI expression below reads it back */
	/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11			/* bytes left to copy */
	leaq	48(%rsp),%rax			/* scratch cursor */
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

	/* copy 64 input bytes into the low half and zero 64 bytes of the high half per pass */
.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi with a 32-bit displacement */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207			/* movq %rdi,%xmm1 */
.byte	102,72,15,110,209			/* movq %rcx,%xmm2 */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218			/* movq %r10,%xmm3 */
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

	/* wipe the scratch: 64 bytes per pass, len/32 passes = 2*len bytes */
.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi			/* entry stack pointer */
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax				/* return value */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_scatter5
 *
 * Spreads a vector of qwords into one column of a table whose rows are
 * 256 bytes (32 qwords) apart: word i of the input is stored at
 * table + 8*index + 256*i.
 *
 *   %rdi  source vector
 *   %esi  number of qwords to store; 0 returns immediately
 *   %rdx  table base
 *   %rcx  column index, used as a full 64-bit value and scaled by 8
 * Clobbers %rax, %rdx, %rdi, %esi and the flags; uses no stack.
 */
.globl	bn_scatter5
.hidden	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx		/* first slot of the selected column */
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx			/* same column, next row */
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.size	bn_scatter5,.-bn_scatter5
2222
/*
 * bn_gather5
 *
 * Inverse of bn_scatter5: rebuilds a vector from one column of a table whose
 * rows are 256 bytes (32 qwords) long. For every output word the whole row
 * is read and combined with 32 precomputed qword masks, of which only the
 * one matching the requested index is all ones, so the addresses touched do
 * not depend on the index.
 *
 *   %rdi  destination vector
 *   %esi  number of qwords to produce (tested only after the first pass, so
 *         it must be non-zero)
 *   %rdx  table base; rows are read with movdqa, so it must be 16-byte aligned
 *   %ecx  column index, broadcast to all four dword lanes and compared
 *         against 0..31
 *
 * Stack: the entry %rsp is kept in %r10, 264 bytes are reserved and %rsp is
 * aligned down to 16 bytes; the 16 mask vectors live at 0..255(%rsp) and are
 * addressed through %rax = %rsp + 128. %rsp is restored from %r10 on exit.
 * Clobbers %rax, %r10, %r11, %rdi, %esi, %xmm0-%xmm5 and the flags.
 */
.globl	bn_gather5
.hidden	bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24			/* leaq (%rsp),%r10 */
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* subq $264,%rsp */
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0			/* lane indices 0,0,1,1 */
	movdqa	16(%rax),%xmm1			/* step 2,2,2,2 */
	leaq	128(%rdx),%r11			/* row cursor, biased by 128 */
	leaq	128(%rsp),%rax			/* mask area, biased by 128 */

	/*
	 * Build the masks: each vector holds two consecutive indices (one per
	 * qword); paddd advances them by the step kept in %xmm4 and pcmpeqd
	 * against the broadcast index turns the matching qword into all ones.
	 */
	pshufd	$0,%xmm5,%xmm5			/* broadcast the index */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

	/* one output qword per pass: AND the whole row with the masks, OR everything together */
.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11			/* next row */
	pshufd	$0x4e,%xmm4,%xmm0		/* swap the two qwords ... */
	por	%xmm4,%xmm0			/* ... and fold them into the low one */
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp			/* back to the entry stack pointer */
	.byte	0xf3,0xc3			/* rep ret */
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
/*
 * .Linc: seed constants for the mask generation in bn_gather5. The first
 * vector holds the starting lane indices (0,0,1,1) and the second the step
 * (2,2,2,2) that is added for each subsequent mask vector.
 * The byte string after it is the NUL-terminated banner
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>".
 */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2392#endif