blob: 208b1dca3ef664307fc47326a6424278b0cc2d8d [file] [log] [blame]
Robert Sloan6f79a502017-04-03 09:16:40 -07001#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002.text
3
Adam Langleya4fb56a2015-03-06 11:08:23 -08004.extern OPENSSL_ia32cap_P
5.hidden OPENSSL_ia32cap_P
Adam Langleyd9e397b2015-01-22 14:27:53 -08006
# bn_mul_mont_gather5 -- exported entry point (hidden visibility).
#
# Inputs, as used by the code below.  Argument names are not visible in this
# file; NOTE(review): confirm the roles against the C prototype.
#   %rdi      destination word array; %r9 64-bit words are written
#   %rsi      source word array; %r9 words, re-read on every pass
#   %rdx      base of a table read in 256-byte strides; one 64-bit word is
#             gathered from each stride and used as the multiplier in %rbx
#   %rcx      word array multiplied by the per-pass factor in %rbp and
#             subtracted at .Lsub (presumably the modulus)
#   %r8       pointer to one 64-bit constant (presumably n0)
#   %r9d      word count
#   8(%rsp)   32-bit gather selector, read at entry
# Returns 1 in %rax.  rbx, rbp and r12-r15 are pushed on entry and reloaded
# before returning.
#
# A count that is a multiple of 8 is handed over by jumping to .Lmul4x_enter
# with the entry %rsp in %rax; any other count takes the word-at-a-time path
# at .Lmul_enter.
7.globl bn_mul_mont_gather5
8.hidden bn_mul_mont_gather5
9.type bn_mul_mont_gather5,@function
10.align 64
11bn_mul_mont_gather5:
Robert Sloana94fe052017-02-21 08:49:28 -080012.cfi_startproc
# The 32-bit self-move clears bits 63:32 of %r9.
13 movl %r9d,%r9d
# %rax keeps the entry %rsp for the rest of the prologue (CFA follows %rax).
14 movq %rsp,%rax
15.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -080016 testl $7,%r9d
17 jnz .Lmul_enter
18 jmp .Lmul4x_enter
19
20.align 16
21.Lmul_enter:
# Fetch the selector before the pushes move %rsp.
David Benjamin4969cc92016-04-22 15:02:23 -040022 movd 8(%rsp),%xmm5
Adam Langleyd9e397b2015-01-22 14:27:53 -080023 pushq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -080024.cfi_offset %rbx,-16
Adam Langleyd9e397b2015-01-22 14:27:53 -080025 pushq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -080026.cfi_offset %rbp,-24
Adam Langleyd9e397b2015-01-22 14:27:53 -080027 pushq %r12
Robert Sloana94fe052017-02-21 08:49:28 -080028.cfi_offset %r12,-32
Adam Langleyd9e397b2015-01-22 14:27:53 -080029 pushq %r13
Robert Sloana94fe052017-02-21 08:49:28 -080030.cfi_offset %r13,-40
Adam Langleyd9e397b2015-01-22 14:27:53 -080031 pushq %r14
Robert Sloana94fe052017-02-21 08:49:28 -080032.cfi_offset %r14,-48
Adam Langleyd9e397b2015-01-22 14:27:53 -080033 pushq %r15
Robert Sloana94fe052017-02-21 08:49:28 -080034.cfi_offset %r15,-56
David Benjamin4969cc92016-04-22 15:02:23 -040035
# Frame bottom: %r10 = (%rsp - 280 - 8*%r9) rounded down to a multiple of
# 1024.  %r9 is negated only for the address computation and restored.
Robert Sloana94fe052017-02-21 08:49:28 -080036 negq %r9
37 movq %rsp,%r11
38 leaq -280(%rsp,%r9,8),%r10
39 negq %r9
40 andq $-1024,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -080041
Robert Sloana94fe052017-02-21 08:49:28 -080042
43
44
45
46
47
48
49
# Lower %rsp to %r10 in steps of at most 4096 bytes, loading one word from
# each step so every page of the new frame is touched in descending order.
# The loaded value in %r11 is discarded.
50 subq %r10,%r11
51 andq $-4096,%r11
52 leaq (%r10,%r11,1),%rsp
53 movq (%rsp),%r11
54 cmpq %r10,%rsp
55 ja .Lmul_page_walk
56 jmp .Lmul_page_walk_done
57
58.Lmul_page_walk:
59 leaq -4096(%rsp),%rsp
60 movq (%rsp),%r11
61 cmpq %r10,%rsp
62 ja .Lmul_page_walk
63.Lmul_page_walk_done:
64
65 leaq .Linc(%rip),%r10
# Entry %rsp is parked in the slot after the %r9+1 temporary words at (%rsp);
# the epilogue reloads it from here.  The .cfi_escape below encodes
# CFA = *(%rsp + 8 + %r9*8) + 8.
Adam Langleyd9e397b2015-01-22 14:27:53 -080066 movq %rax,8(%rsp,%r9,8)
Robert Sloana94fe052017-02-21 08:49:28 -080067.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
Adam Langleyd9e397b2015-01-22 14:27:53 -080068.Lmul_body:
Robert Sloana94fe052017-02-21 08:49:28 -080069
# %r12 = table + 128, so the sixteen 16-byte entries of a stride are
# addressed as -128..112(%r12).
# Sixteen 16-byte masks are built at 112(%r10)..352(%r10), where
# %r10 = (%rsp + 8*%r9 + 24 - 112) & -16.  Each mask is pcmpeqd of the
# broadcast selector in %xmm5 against a counter vector that starts at the
# first constant of .Linc and steps by the second one (kept in %xmm4).
# The contents of .Linc are defined outside this view.
David Benjamin4969cc92016-04-22 15:02:23 -040070 leaq 128(%rdx),%r12
71 movdqa 0(%r10),%xmm0
72 movdqa 16(%r10),%xmm1
73 leaq 24-112(%rsp,%r9,8),%r10
74 andq $-16,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -080075
David Benjamin4969cc92016-04-22 15:02:23 -040076 pshufd $0,%xmm5,%xmm5
77 movdqa %xmm1,%xmm4
78 movdqa %xmm1,%xmm2
79 paddd %xmm0,%xmm1
80 pcmpeqd %xmm5,%xmm0
# Raw 0x67 (address-size prefix) ahead of the next instruction; presumably
# padding for instruction alignment -- confirm against the generator script.
81.byte 0x67
82 movdqa %xmm4,%xmm3
83 paddd %xmm1,%xmm2
84 pcmpeqd %xmm5,%xmm1
85 movdqa %xmm0,112(%r10)
86 movdqa %xmm4,%xmm0
87
88 paddd %xmm2,%xmm3
89 pcmpeqd %xmm5,%xmm2
90 movdqa %xmm1,128(%r10)
91 movdqa %xmm4,%xmm1
92
93 paddd %xmm3,%xmm0
94 pcmpeqd %xmm5,%xmm3
95 movdqa %xmm2,144(%r10)
96 movdqa %xmm4,%xmm2
97
98 paddd %xmm0,%xmm1
99 pcmpeqd %xmm5,%xmm0
100 movdqa %xmm3,160(%r10)
101 movdqa %xmm4,%xmm3
102 paddd %xmm1,%xmm2
103 pcmpeqd %xmm5,%xmm1
104 movdqa %xmm0,176(%r10)
105 movdqa %xmm4,%xmm0
106
107 paddd %xmm2,%xmm3
108 pcmpeqd %xmm5,%xmm2
109 movdqa %xmm1,192(%r10)
110 movdqa %xmm4,%xmm1
111
112 paddd %xmm3,%xmm0
113 pcmpeqd %xmm5,%xmm3
114 movdqa %xmm2,208(%r10)
115 movdqa %xmm4,%xmm2
116
117 paddd %xmm0,%xmm1
118 pcmpeqd %xmm5,%xmm0
119 movdqa %xmm3,224(%r10)
120 movdqa %xmm4,%xmm3
121 paddd %xmm1,%xmm2
122 pcmpeqd %xmm5,%xmm1
123 movdqa %xmm0,240(%r10)
124 movdqa %xmm4,%xmm0
125
126 paddd %xmm2,%xmm3
127 pcmpeqd %xmm5,%xmm2
128 movdqa %xmm1,256(%r10)
129 movdqa %xmm4,%xmm1
130
131 paddd %xmm3,%xmm0
132 pcmpeqd %xmm5,%xmm3
133 movdqa %xmm2,272(%r10)
134 movdqa %xmm4,%xmm2
135
136 paddd %xmm0,%xmm1
137 pcmpeqd %xmm5,%xmm0
138 movdqa %xmm3,288(%r10)
139 movdqa %xmm4,%xmm3
140 paddd %xmm1,%xmm2
141 pcmpeqd %xmm5,%xmm1
142 movdqa %xmm0,304(%r10)
143
144 paddd %xmm2,%xmm3
145.byte 0x67
146 pcmpeqd %xmm5,%xmm2
147 movdqa %xmm1,320(%r10)
148
# First gather.  Masks 12..15 are still live in %xmm0-%xmm3 and are applied
# to entries 64..112(%r12) directly; masks 0..11 are re-read from the frame.
# All sixteen entries are loaded whatever the selector is, and the masked
# values are ORed together.
149 pcmpeqd %xmm5,%xmm3
150 movdqa %xmm2,336(%r10)
151 pand 64(%r12),%xmm0
152
153 pand 80(%r12),%xmm1
154 pand 96(%r12),%xmm2
155 movdqa %xmm3,352(%r10)
156 pand 112(%r12),%xmm3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800157 por %xmm2,%xmm0
David Benjamin4969cc92016-04-22 15:02:23 -0400158 por %xmm3,%xmm1
159 movdqa -128(%r12),%xmm4
160 movdqa -112(%r12),%xmm5
161 movdqa -96(%r12),%xmm2
162 pand 112(%r10),%xmm4
163 movdqa -80(%r12),%xmm3
164 pand 128(%r10),%xmm5
165 por %xmm4,%xmm0
166 pand 144(%r10),%xmm2
167 por %xmm5,%xmm1
168 pand 160(%r10),%xmm3
169 por %xmm2,%xmm0
170 por %xmm3,%xmm1
171 movdqa -64(%r12),%xmm4
172 movdqa -48(%r12),%xmm5
173 movdqa -32(%r12),%xmm2
174 pand 176(%r10),%xmm4
175 movdqa -16(%r12),%xmm3
176 pand 192(%r10),%xmm5
177 por %xmm4,%xmm0
178 pand 208(%r10),%xmm2
179 por %xmm5,%xmm1
180 pand 224(%r10),%xmm3
181 por %xmm2,%xmm0
182 por %xmm3,%xmm1
183 movdqa 0(%r12),%xmm4
184 movdqa 16(%r12),%xmm5
185 movdqa 32(%r12),%xmm2
186 pand 240(%r10),%xmm4
187 movdqa 48(%r12),%xmm3
188 pand 256(%r10),%xmm5
189 por %xmm4,%xmm0
190 pand 272(%r10),%xmm2
191 por %xmm5,%xmm1
192 pand 288(%r10),%xmm3
193 por %xmm2,%xmm0
194 por %xmm3,%xmm1
195 por %xmm1,%xmm0
# Fold the upper 64 bits onto the lower 64 bits, then step %r12 to the next
# 256-byte stride.
196 pshufd $0x4e,%xmm0,%xmm1
197 por %xmm1,%xmm0
Adam Langleyd9e397b2015-01-22 14:27:53 -0800198 leaq 256(%r12),%r12
# Hand-encoded movq %xmm0,%rbx: the gathered word is the multiplier.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800199.byte 102,72,15,126,195
200
# %r8 now holds the constant itself.  %r14 counts passes, %r15 indexes words.
201 movq (%r8),%r8
202 movq (%rsi),%rax
203
204 xorq %r14,%r14
205 xorq %r15,%r15
206
# Word 0 of the first pass: %rdx:%rax = (%rsi)[0] * %rbx, then
# %rbp = %r8 * low product word (mod 2^64).  %rbp multiplies the (%rcx)
# array for the rest of this pass.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800207 movq %r8,%rbp
208 mulq %rbx
209 movq %rax,%r10
210 movq (%rcx),%rax
211
Adam Langleyd9e397b2015-01-22 14:27:53 -0800212 imulq %r10,%rbp
213 movq %rdx,%r11
214
# Only the carry out of the lowest word is kept (%r10 is overwritten later),
# so each following word lands one slot lower in the temporary vector.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800215 mulq %rbp
216 addq %rax,%r10
217 movq 8(%rsi),%rax
218 adcq $0,%rdx
219 movq %rdx,%r13
220
221 leaq 1(%r15),%r15
222 jmp .L1st_enter
223
# First-pass loop over the remaining words; results go to the temporary
# vector at -16(%rsp,%r15,8).
224.align 16
225.L1st:
226 addq %rax,%r13
227 movq (%rsi,%r15,8),%rax
228 adcq $0,%rdx
229 addq %r11,%r13
230 movq %r10,%r11
231 adcq $0,%rdx
232 movq %r13,-16(%rsp,%r15,8)
233 movq %rdx,%r13
234
235.L1st_enter:
236 mulq %rbx
237 addq %rax,%r11
238 movq (%rcx,%r15,8),%rax
239 adcq $0,%rdx
240 leaq 1(%r15),%r15
241 movq %rdx,%r10
242
243 mulq %rbp
244 cmpq %r9,%r15
245 jne .L1st
246
Adam Langleyd9e397b2015-01-22 14:27:53 -0800247
# Close the first pass: the last two result words and the carry-out word are
# stored at -16, -8 and 0(%rsp,%r9,8).
248 addq %rax,%r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800249 adcq $0,%rdx
250 addq %r11,%r13
251 adcq $0,%rdx
David Benjamin4969cc92016-04-22 15:02:23 -0400252 movq %r13,-16(%rsp,%r9,8)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800253 movq %rdx,%r13
254 movq %r10,%r11
255
256 xorq %rdx,%rdx
257 addq %r11,%r13
258 adcq $0,%rdx
259 movq %r13,-8(%rsp,%r9,8)
260 movq %rdx,(%rsp,%r9,8)
261
262 leaq 1(%r14),%r14
263 jmp .Louter
# Passes 1 .. %r9-1.  Each pass gathers the next multiplier word from the
# next 256-byte stride, using the sixteen masks saved in the frame (read at
# -128..112(%rdx)), then adds this pass into the temporary vector.
264.align 16
265.Louter:
David Benjamin4969cc92016-04-22 15:02:23 -0400266 leaq 24+128(%rsp,%r9,8),%rdx
267 andq $-16,%rdx
268 pxor %xmm4,%xmm4
269 pxor %xmm5,%xmm5
270 movdqa -128(%r12),%xmm0
271 movdqa -112(%r12),%xmm1
272 movdqa -96(%r12),%xmm2
273 movdqa -80(%r12),%xmm3
274 pand -128(%rdx),%xmm0
275 pand -112(%rdx),%xmm1
276 por %xmm0,%xmm4
277 pand -96(%rdx),%xmm2
278 por %xmm1,%xmm5
279 pand -80(%rdx),%xmm3
280 por %xmm2,%xmm4
281 por %xmm3,%xmm5
282 movdqa -64(%r12),%xmm0
283 movdqa -48(%r12),%xmm1
284 movdqa -32(%r12),%xmm2
285 movdqa -16(%r12),%xmm3
286 pand -64(%rdx),%xmm0
287 pand -48(%rdx),%xmm1
288 por %xmm0,%xmm4
289 pand -32(%rdx),%xmm2
290 por %xmm1,%xmm5
291 pand -16(%rdx),%xmm3
292 por %xmm2,%xmm4
293 por %xmm3,%xmm5
294 movdqa 0(%r12),%xmm0
295 movdqa 16(%r12),%xmm1
296 movdqa 32(%r12),%xmm2
297 movdqa 48(%r12),%xmm3
298 pand 0(%rdx),%xmm0
299 pand 16(%rdx),%xmm1
300 por %xmm0,%xmm4
301 pand 32(%rdx),%xmm2
302 por %xmm1,%xmm5
303 pand 48(%rdx),%xmm3
304 por %xmm2,%xmm4
305 por %xmm3,%xmm5
306 movdqa 64(%r12),%xmm0
307 movdqa 80(%r12),%xmm1
308 movdqa 96(%r12),%xmm2
309 movdqa 112(%r12),%xmm3
310 pand 64(%rdx),%xmm0
311 pand 80(%rdx),%xmm1
312 por %xmm0,%xmm4
313 pand 96(%rdx),%xmm2
314 por %xmm1,%xmm5
315 pand 112(%rdx),%xmm3
316 por %xmm2,%xmm4
317 por %xmm3,%xmm5
318 por %xmm5,%xmm4
319 pshufd $0x4e,%xmm4,%xmm0
320 por %xmm4,%xmm0
321 leaq 256(%r12),%r12
322
323 movq (%rsi),%rax
# Hand-encoded movq %xmm0,%rbx, as in the first gather.
324.byte 102,72,15,126,195
325
# Word 0 of this pass also adds the previous temporary word at (%rsp) before
# the factor %rbp is derived from the low word.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800326 xorq %r15,%r15
327 movq %r8,%rbp
328 movq (%rsp),%r10
329
Adam Langleyd9e397b2015-01-22 14:27:53 -0800330 mulq %rbx
331 addq %rax,%r10
332 movq (%rcx),%rax
333 adcq $0,%rdx
334
Adam Langleyd9e397b2015-01-22 14:27:53 -0800335 imulq %r10,%rbp
336 movq %rdx,%r11
337
Adam Langleyd9e397b2015-01-22 14:27:53 -0800338 mulq %rbp
339 addq %rax,%r10
340 movq 8(%rsi),%rax
341 adcq $0,%rdx
342 movq 8(%rsp),%r10
343 movq %rdx,%r13
344
345 leaq 1(%r15),%r15
346 jmp .Linner_enter
347
# Same shape as .L1st, plus the running temporary word loaded into %r10.
348.align 16
349.Linner:
350 addq %rax,%r13
351 movq (%rsi,%r15,8),%rax
352 adcq $0,%rdx
353 addq %r10,%r13
354 movq (%rsp,%r15,8),%r10
355 adcq $0,%rdx
356 movq %r13,-16(%rsp,%r15,8)
357 movq %rdx,%r13
358
359.Linner_enter:
360 mulq %rbx
361 addq %rax,%r11
362 movq (%rcx,%r15,8),%rax
363 adcq $0,%rdx
364 addq %r11,%r10
365 movq %rdx,%r11
366 adcq $0,%r11
367 leaq 1(%r15),%r15
368
369 mulq %rbp
370 cmpq %r9,%r15
371 jne .Linner
372
Adam Langleyd9e397b2015-01-22 14:27:53 -0800373 addq %rax,%r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800374 adcq $0,%rdx
375 addq %r10,%r13
David Benjamin4969cc92016-04-22 15:02:23 -0400376 movq (%rsp,%r9,8),%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -0800377 adcq $0,%rdx
David Benjamin4969cc92016-04-22 15:02:23 -0400378 movq %r13,-16(%rsp,%r9,8)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800379 movq %rdx,%r13
380
381 xorq %rdx,%rdx
382 addq %r11,%r13
383 adcq $0,%rdx
384 addq %r10,%r13
385 adcq $0,%rdx
386 movq %r13,-8(%rsp,%r9,8)
387 movq %rdx,(%rsp,%r9,8)
388
389 leaq 1(%r14),%r14
390 cmpq %r9,%r14
391 jb .Louter
392
# Subtract the (%rcx) array from the temporary vector, storing the
# difference at (%rdi).  The xorq clears CF; movq, leaq and decq leave CF
# alone, so sbbq carries the borrow from one word to the next.
393 xorq %r14,%r14
394 movq (%rsp),%rax
395 leaq (%rsp),%rsi
396 movq %r9,%r15
397 jmp .Lsub
398.align 16
Robert Sloan8ff03552017-06-14 12:40:58 -0700399.Lsub:
400 sbbq (%rcx,%r14,8),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800401 movq %rax,(%rdi,%r14,8)
402 movq 8(%rsi,%r14,8),%rax
403 leaq 1(%r14),%r14
404 decq %r15
405 jnz .Lsub
406
# %rax = carry-out word of the temporary vector minus the final borrow, used
# below as an AND mask (presumably always 0 or all-ones -- TODO confirm).
# %rsi = (%rsi & mask) | (%rdi & ~mask): the copy source is either the
# temporary vector or the difference already stored at (%rdi).
# NOTE(review): the address read by .Lcopy therefore depends on the final
# borrow; confirm this is acceptable for the side-channel requirements of
# the callers.
407 sbbq $0,%rax
408 xorq %r14,%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800409 andq %rax,%rsi
410 notq %rax
411 movq %rdi,%rcx
412 andq %rax,%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800413 movq %r9,%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800414 orq %rcx,%rsi
# Copy the chosen words to (%rdi); each temporary word on the stack is
# overwritten with the loop index as it is passed.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800415.align 16
416.Lcopy:
Robert Sloana94fe052017-02-21 08:49:28 -0800417 movq (%rsi,%r14,8),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800418 movq %r14,(%rsp,%r14,8)
Robert Sloana94fe052017-02-21 08:49:28 -0800419 movq %rax,(%rdi,%r14,8)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800420 leaq 1(%r14),%r14
421 subq $1,%r15
422 jnz .Lcopy
423
# Epilogue: reload the entry %rsp saved in the prologue, set the return
# value to 1, reload the six pushed registers relative to it, restore %rsp.
424 movq 8(%rsp,%r9,8),%rsi
Robert Sloana94fe052017-02-21 08:49:28 -0800425.cfi_def_cfa %rsi,8
Adam Langleyd9e397b2015-01-22 14:27:53 -0800426 movq $1,%rax
David Benjamin4969cc92016-04-22 15:02:23 -0400427
Adam Langleyd9e397b2015-01-22 14:27:53 -0800428 movq -48(%rsi),%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800429.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800430 movq -40(%rsi),%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800431.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800432 movq -32(%rsi),%r13
Robert Sloana94fe052017-02-21 08:49:28 -0800433.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800434 movq -24(%rsi),%r12
Robert Sloana94fe052017-02-21 08:49:28 -0800435.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800436 movq -16(%rsi),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800437.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800438 movq -8(%rsi),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800439.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800440 leaq (%rsi),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800441.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800442.Lmul_epilogue:
# Bytes 0xf3,0xc3 are the two-byte "rep ret" encoding of the return.
443 .byte 0xf3,0xc3
Robert Sloana94fe052017-02-21 08:49:28 -0800444.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800445.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
# bn_mul4x_mont_gather5 -- no .globl directive for this symbol is visible in
# this part of the file.  Reached through its own label, or by a jump to
# .Lmul4x_enter from bn_mul_mont_gather5 with the entry %rsp already in %rax.
#
# Builds an aligned, page-probed stack frame, calls mul4x_internal, then
# returns 1 in %rax after reloading rbx, rbp and r12-r15.
# %rdi, %rsi, %rdx, %rcx and %r8 reach mul4x_internal unchanged; %r9 is
# converted from a word count to a byte count; %r10, %r11 and %rbp are
# scratch during frame setup.
446.type bn_mul4x_mont_gather5,@function
447.align 32
448bn_mul4x_mont_gather5:
Robert Sloana94fe052017-02-21 08:49:28 -0800449.cfi_startproc
# Raw 0x67 (address-size prefix) ahead of the next instruction; presumably
# padding for instruction alignment -- confirm against the generator script.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800450.byte 0x67
451 movq %rsp,%rax
Robert Sloana94fe052017-02-21 08:49:28 -0800452.cfi_def_cfa_register %rax
453.Lmul4x_enter:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800454 pushq %rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800455.cfi_offset %rbx,-16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800456 pushq %rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800457.cfi_offset %rbp,-24
Adam Langleyd9e397b2015-01-22 14:27:53 -0800458 pushq %r12
Robert Sloana94fe052017-02-21 08:49:28 -0800459.cfi_offset %r12,-32
Adam Langleyd9e397b2015-01-22 14:27:53 -0800460 pushq %r13
Robert Sloana94fe052017-02-21 08:49:28 -0800461.cfi_offset %r13,-40
Adam Langleyd9e397b2015-01-22 14:27:53 -0800462 pushq %r14
Robert Sloana94fe052017-02-21 08:49:28 -0800463.cfi_offset %r14,-48
Adam Langleyd9e397b2015-01-22 14:27:53 -0800464 pushq %r15
Robert Sloana94fe052017-02-21 08:49:28 -0800465.cfi_offset %r15,-56
466.Lmul4x_prologue:
David Benjamin4969cc92016-04-22 15:02:23 -0400467
# %r9d = count * 8 (a byte length; the 32-bit shift clears the upper half of
# %r9), %r10 = 3 * bytes, then %r9 = -bytes.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800468.byte 0x67
Adam Langleyd9e397b2015-01-22 14:27:53 -0800469 shll $3,%r9d
David Benjamin4969cc92016-04-22 15:02:23 -0400470 leaq (%r9,%r9,2),%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -0800471 negq %r9
472
473
474
475
476
477
478
479
David Benjamin4969cc92016-04-22 15:02:23 -0400480
481
# Choose the frame bottom in %rbp.
# %r11 = (%rsp - 320 - 2*bytes - %rdi) mod 4096.
#   3*bytes >= %r11:  %rbp = %rsp - %r11 - 320 - 2*bytes
#   otherwise:        %rbp = %rsp - 320 - 2*bytes - max(0, %r11 - (4096 - 320 - 2*bytes))
# NOTE(review): presumably this keeps the frame from sharing page offsets
# with the buffers at %rdi -- confirm against the generator script.
482 leaq -320(%rsp,%r9,2),%r11
Robert Sloana94fe052017-02-21 08:49:28 -0800483 movq %rsp,%rbp
David Benjamin4969cc92016-04-22 15:02:23 -0400484 subq %rdi,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -0800485 andq $4095,%r11
486 cmpq %r11,%r10
487 jb .Lmul4xsp_alt
Robert Sloana94fe052017-02-21 08:49:28 -0800488 subq %r11,%rbp
489 leaq -320(%rbp,%r9,2),%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800490 jmp .Lmul4xsp_done
491
492.align 32
493.Lmul4xsp_alt:
David Benjamin4969cc92016-04-22 15:02:23 -0400494 leaq 4096-320(,%r9,2),%r10
Robert Sloana94fe052017-02-21 08:49:28 -0800495 leaq -320(%rbp,%r9,2),%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800496 subq %r10,%r11
497 movq $0,%r10
# cmovc zeroes %r11 when the subtraction above borrowed.
498 cmovcq %r10,%r11
Robert Sloana94fe052017-02-21 08:49:28 -0800499 subq %r11,%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800500.Lmul4xsp_done:
# Round the frame bottom down to 64 bytes, then lower %rsp to it in steps of
# at most 4096 bytes, loading one word from each step so every page is
# touched in descending order.  The loaded value in %r10 is discarded.
Robert Sloana94fe052017-02-21 08:49:28 -0800501 andq $-64,%rbp
502 movq %rsp,%r11
503 subq %rbp,%r11
504 andq $-4096,%r11
505 leaq (%r11,%rbp,1),%rsp
506 movq (%rsp),%r10
507 cmpq %rbp,%rsp
508 ja .Lmul4x_page_walk
509 jmp .Lmul4x_page_walk_done
510
511.Lmul4x_page_walk:
512 leaq -4096(%rsp),%rsp
513 movq (%rsp),%r10
514 cmpq %rbp,%rsp
515 ja .Lmul4x_page_walk
516.Lmul4x_page_walk_done:
517
# %r9 is the positive byte length again.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800518 negq %r9
519
# Entry %rsp is parked at 40(%rsp) for the epilogue; the .cfi_escape below
# encodes CFA = *(%rsp + 40) + 8.
520 movq %rax,40(%rsp)
Robert Sloana94fe052017-02-21 08:49:28 -0800521.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
Adam Langleyd9e397b2015-01-22 14:27:53 -0800522.Lmul4x_body:
523
524 call mul4x_internal
525
# Epilogue: reload the entry %rsp, set the return value to 1, reload the six
# pushed registers relative to it, restore %rsp.
526 movq 40(%rsp),%rsi
Robert Sloana94fe052017-02-21 08:49:28 -0800527.cfi_def_cfa %rsi,8
Adam Langleyd9e397b2015-01-22 14:27:53 -0800528 movq $1,%rax
David Benjamin4969cc92016-04-22 15:02:23 -0400529
Adam Langleyd9e397b2015-01-22 14:27:53 -0800530 movq -48(%rsi),%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800531.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800532 movq -40(%rsi),%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800533.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800534 movq -32(%rsi),%r13
Robert Sloana94fe052017-02-21 08:49:28 -0800535.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800536 movq -24(%rsi),%r12
Robert Sloana94fe052017-02-21 08:49:28 -0800537.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800538 movq -16(%rsi),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800539.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800540 movq -8(%rsi),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800541.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800542 leaq (%rsi),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800543.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800544.Lmul4x_epilogue:
# Bytes 0xf3,0xc3 are the two-byte "rep ret" encoding of the return.
545 .byte 0xf3,0xc3
Robert Sloana94fe052017-02-21 08:49:28 -0800546.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800547.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
548
# mul4x_internal -- file-local helper (no .globl), entered with call.
#
# On entry, as read by the code below:
#   %rax  address whose +8 holds the 32-bit gather selector (the callers in
#         this view pass their saved entry %rsp)
#   %rdx  table base, read in 256-byte strides
#   %r9   length in bytes
#   %rdi  saved at 56+8(%rsp) and reloaded just before the final jump
#   %rsi, %rcx  word arrays;  %r8  pointer to one 64-bit constant
#   NOTE(review): roles presumably match bn_mul_mont_gather5 -- confirm.
# Stack offsets are written N+8; the extra 8 presumably skips the return
# address pushed by call.
# There is no ret here: the routine ends with a jump to .Lsqr4x_sub_entry,
# which is defined outside this view.
549.type mul4x_internal,@function
550.align 32
551mul4x_internal:
# %r13 = %rdx + 128 + 32*bytes is the end marker for %r12; it is stored at
# 16+8(%rsp) and compared at the bottom of .Louter4x.  %r9 is restored by
# the shrq.
552 shlq $5,%r9
David Benjamin4969cc92016-04-22 15:02:23 -0400553 movd 8(%rax),%xmm5
554 leaq .Linc(%rip),%rax
555 leaq 128(%rdx,%r9,1),%r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800556 shrq $5,%r9
# Sixteen 16-byte masks are built at 112(%r10)..352(%r10), where
# %r10 = %rsp + 88 - 112 + bytes.  Each mask is pcmpeqd of the broadcast
# selector in %xmm5 against a counter vector that starts at the first
# constant of .Linc and steps by the second one (kept in %xmm4).
# The contents of .Linc are defined outside this view.
# %r12 = table + 128, so a stride is addressed as -128..112(%r12).
David Benjamin4969cc92016-04-22 15:02:23 -0400557 movdqa 0(%rax),%xmm0
558 movdqa 16(%rax),%xmm1
559 leaq 88-112(%rsp,%r9,1),%r10
560 leaq 128(%rdx),%r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800561
David Benjamin4969cc92016-04-22 15:02:23 -0400562 pshufd $0,%xmm5,%xmm5
563 movdqa %xmm1,%xmm4
# Raw 0x67 (address-size prefix) bytes ahead of the next instruction;
# presumably padding for instruction alignment -- confirm against the
# generator script.
564.byte 0x67,0x67
565 movdqa %xmm1,%xmm2
566 paddd %xmm0,%xmm1
567 pcmpeqd %xmm5,%xmm0
Adam Langleyd9e397b2015-01-22 14:27:53 -0800568.byte 0x67
David Benjamin4969cc92016-04-22 15:02:23 -0400569 movdqa %xmm4,%xmm3
570 paddd %xmm1,%xmm2
571 pcmpeqd %xmm5,%xmm1
572 movdqa %xmm0,112(%r10)
573 movdqa %xmm4,%xmm0
574
575 paddd %xmm2,%xmm3
576 pcmpeqd %xmm5,%xmm2
577 movdqa %xmm1,128(%r10)
578 movdqa %xmm4,%xmm1
579
580 paddd %xmm3,%xmm0
581 pcmpeqd %xmm5,%xmm3
582 movdqa %xmm2,144(%r10)
583 movdqa %xmm4,%xmm2
584
585 paddd %xmm0,%xmm1
586 pcmpeqd %xmm5,%xmm0
587 movdqa %xmm3,160(%r10)
588 movdqa %xmm4,%xmm3
589 paddd %xmm1,%xmm2
590 pcmpeqd %xmm5,%xmm1
591 movdqa %xmm0,176(%r10)
592 movdqa %xmm4,%xmm0
593
594 paddd %xmm2,%xmm3
595 pcmpeqd %xmm5,%xmm2
596 movdqa %xmm1,192(%r10)
597 movdqa %xmm4,%xmm1
598
599 paddd %xmm3,%xmm0
600 pcmpeqd %xmm5,%xmm3
601 movdqa %xmm2,208(%r10)
602 movdqa %xmm4,%xmm2
603
604 paddd %xmm0,%xmm1
605 pcmpeqd %xmm5,%xmm0
606 movdqa %xmm3,224(%r10)
607 movdqa %xmm4,%xmm3
608 paddd %xmm1,%xmm2
609 pcmpeqd %xmm5,%xmm1
610 movdqa %xmm0,240(%r10)
611 movdqa %xmm4,%xmm0
612
613 paddd %xmm2,%xmm3
614 pcmpeqd %xmm5,%xmm2
615 movdqa %xmm1,256(%r10)
616 movdqa %xmm4,%xmm1
617
618 paddd %xmm3,%xmm0
619 pcmpeqd %xmm5,%xmm3
620 movdqa %xmm2,272(%r10)
621 movdqa %xmm4,%xmm2
622
623 paddd %xmm0,%xmm1
624 pcmpeqd %xmm5,%xmm0
625 movdqa %xmm3,288(%r10)
626 movdqa %xmm4,%xmm3
627 paddd %xmm1,%xmm2
628 pcmpeqd %xmm5,%xmm1
629 movdqa %xmm0,304(%r10)
630
631 paddd %xmm2,%xmm3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800632.byte 0x67
David Benjamin4969cc92016-04-22 15:02:23 -0400633 pcmpeqd %xmm5,%xmm2
634 movdqa %xmm1,320(%r10)
635
# First gather.  Masks 12..15 are still live in %xmm0-%xmm3 and are applied
# to entries 64..112(%r12) directly; masks 0..11 are re-read from the frame.
# All sixteen entries are loaded whatever the selector is, and the masked
# values are ORed together.
636 pcmpeqd %xmm5,%xmm3
637 movdqa %xmm2,336(%r10)
638 pand 64(%r12),%xmm0
639
640 pand 80(%r12),%xmm1
641 pand 96(%r12),%xmm2
642 movdqa %xmm3,352(%r10)
643 pand 112(%r12),%xmm3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800644 por %xmm2,%xmm0
David Benjamin4969cc92016-04-22 15:02:23 -0400645 por %xmm3,%xmm1
646 movdqa -128(%r12),%xmm4
647 movdqa -112(%r12),%xmm5
648 movdqa -96(%r12),%xmm2
649 pand 112(%r10),%xmm4
650 movdqa -80(%r12),%xmm3
651 pand 128(%r10),%xmm5
652 por %xmm4,%xmm0
653 pand 144(%r10),%xmm2
654 por %xmm5,%xmm1
655 pand 160(%r10),%xmm3
656 por %xmm2,%xmm0
657 por %xmm3,%xmm1
658 movdqa -64(%r12),%xmm4
659 movdqa -48(%r12),%xmm5
660 movdqa -32(%r12),%xmm2
661 pand 176(%r10),%xmm4
662 movdqa -16(%r12),%xmm3
663 pand 192(%r10),%xmm5
664 por %xmm4,%xmm0
665 pand 208(%r10),%xmm2
666 por %xmm5,%xmm1
667 pand 224(%r10),%xmm3
668 por %xmm2,%xmm0
669 por %xmm3,%xmm1
670 movdqa 0(%r12),%xmm4
671 movdqa 16(%r12),%xmm5
672 movdqa 32(%r12),%xmm2
673 pand 240(%r10),%xmm4
674 movdqa 48(%r12),%xmm3
675 pand 256(%r10),%xmm5
676 por %xmm4,%xmm0
677 pand 272(%r10),%xmm2
678 por %xmm5,%xmm1
679 pand 288(%r10),%xmm3
680 por %xmm2,%xmm0
681 por %xmm3,%xmm1
682 por %xmm1,%xmm0
# Fold the upper 64 bits onto the lower 64 bits, then step %r12 to the next
# 256-byte stride.
683 pshufd $0x4e,%xmm0,%xmm1
684 por %xmm1,%xmm0
685 leaq 256(%r12),%r12
# Hand-encoded movq %xmm0,%rbx: the gathered word is the multiplier.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800686.byte 102,72,15,126,195
David Benjamin4969cc92016-04-22 15:02:23 -0400687
# Save the end marker and the incoming %rdi in the frame.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800688 movq %r13,16+8(%rsp)
689 movq %rdi,56+8(%rsp)
690
# %r8 now holds the constant itself.  %rsi is moved to the end of its array
# and %r9 negated, so (%rsi,%r9,1) is word 0 and a negative index in %r15
# can count up to zero.
691 movq (%r8),%r8
692 movq (%rsi),%rax
693 leaq (%rsi,%r9,1),%rsi
694 negq %r9
695
# Word 0: %rdx:%rax = first source word * %rbx, then
# %rbp = %r8 * low product word (mod 2^64).  %rbp multiplies the (%rcx)
# array for the rest of this pass.  %r14 points at the temporary vector,
# which starts at 64+8(%rsp).
696 movq %r8,%rbp
697 mulq %rbx
698 movq %rax,%r10
699 movq (%rcx),%rax
700
Adam Langleyd9e397b2015-01-22 14:27:53 -0800701 imulq %r10,%rbp
David Benjamin4969cc92016-04-22 15:02:23 -0400702 leaq 64+8(%rsp),%r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800703 movq %rdx,%r11
704
Adam Langleyd9e397b2015-01-22 14:27:53 -0800705 mulq %rbp
706 addq %rax,%r10
707 movq 8(%rsi,%r9,1),%rax
708 adcq $0,%rdx
709 movq %rdx,%rdi
710
711 mulq %rbx
712 addq %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -0400713 movq 8(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800714 adcq $0,%rdx
715 movq %rdx,%r10
716
717 mulq %rbp
718 addq %rax,%rdi
719 movq 16(%rsi,%r9,1),%rax
720 adcq $0,%rdx
721 addq %r11,%rdi
722 leaq 32(%r9),%r15
David Benjamin4969cc92016-04-22 15:02:23 -0400723 leaq 32(%rcx),%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800724 adcq $0,%rdx
725 movq %rdi,(%r14)
726 movq %rdx,%r13
727 jmp .L1st4x
728
# First pass, four words per iteration.  %r15 steps by 32 bytes from
# -bytes+32 up to zero; %rcx and %r14 advance by 32 bytes per iteration.
729.align 32
730.L1st4x:
731 mulq %rbx
732 addq %rax,%r10
David Benjamin4969cc92016-04-22 15:02:23 -0400733 movq -16(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800734 leaq 32(%r14),%r14
735 adcq $0,%rdx
736 movq %rdx,%r11
737
738 mulq %rbp
739 addq %rax,%r13
740 movq -8(%rsi,%r15,1),%rax
741 adcq $0,%rdx
742 addq %r10,%r13
743 adcq $0,%rdx
744 movq %r13,-24(%r14)
745 movq %rdx,%rdi
746
747 mulq %rbx
748 addq %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -0400749 movq -8(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800750 adcq $0,%rdx
751 movq %rdx,%r10
752
753 mulq %rbp
754 addq %rax,%rdi
755 movq (%rsi,%r15,1),%rax
756 adcq $0,%rdx
757 addq %r11,%rdi
758 adcq $0,%rdx
759 movq %rdi,-16(%r14)
760 movq %rdx,%r13
761
762 mulq %rbx
763 addq %rax,%r10
764 movq 0(%rcx),%rax
765 adcq $0,%rdx
766 movq %rdx,%r11
767
768 mulq %rbp
769 addq %rax,%r13
770 movq 8(%rsi,%r15,1),%rax
771 adcq $0,%rdx
772 addq %r10,%r13
773 adcq $0,%rdx
774 movq %r13,-8(%r14)
775 movq %rdx,%rdi
776
777 mulq %rbx
778 addq %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -0400779 movq 8(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800780 adcq $0,%rdx
781 movq %rdx,%r10
782
783 mulq %rbp
784 addq %rax,%rdi
785 movq 16(%rsi,%r15,1),%rax
786 adcq $0,%rdx
787 addq %r11,%rdi
David Benjamin4969cc92016-04-22 15:02:23 -0400788 leaq 32(%rcx),%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800789 adcq $0,%rdx
790 movq %rdi,(%r14)
791 movq %rdx,%r13
792
793 addq $32,%r15
794 jnz .L1st4x
795
# Tail of the first pass: the last words are finished outside the loop.  The
# load at (%rsi,%r9,1) re-fetches source word 0 for the next pass, and %rcx
# is rewound by adding the negative %r9.  %rdi ends up holding the carry out
# of the top word.
796 mulq %rbx
797 addq %rax,%r10
David Benjamin4969cc92016-04-22 15:02:23 -0400798 movq -16(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800799 leaq 32(%r14),%r14
800 adcq $0,%rdx
801 movq %rdx,%r11
802
803 mulq %rbp
804 addq %rax,%r13
805 movq -8(%rsi),%rax
806 adcq $0,%rdx
807 addq %r10,%r13
808 adcq $0,%rdx
809 movq %r13,-24(%r14)
810 movq %rdx,%rdi
811
812 mulq %rbx
813 addq %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -0400814 movq -8(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800815 adcq $0,%rdx
816 movq %rdx,%r10
817
818 mulq %rbp
819 addq %rax,%rdi
820 movq (%rsi,%r9,1),%rax
821 adcq $0,%rdx
822 addq %r11,%rdi
823 adcq $0,%rdx
824 movq %rdi,-16(%r14)
825 movq %rdx,%r13
826
David Benjamin4969cc92016-04-22 15:02:23 -0400827 leaq (%rcx,%r9,1),%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800828
829 xorq %rdi,%rdi
830 addq %r10,%r13
831 adcq $0,%rdi
832 movq %r13,-8(%r14)
833
834 jmp .Louter4x
835
# Remaining passes.  Each one gathers the next multiplier word from the next
# 256-byte stride, reading the sixteen masks at -128..112(%rdx) with
# %rdx = %r14 + 16 + 128, then adds this pass into the temporary vector.
836.align 32
837.Louter4x:
David Benjamin4969cc92016-04-22 15:02:23 -0400838 leaq 16+128(%r14),%rdx
839 pxor %xmm4,%xmm4
840 pxor %xmm5,%xmm5
841 movdqa -128(%r12),%xmm0
842 movdqa -112(%r12),%xmm1
843 movdqa -96(%r12),%xmm2
844 movdqa -80(%r12),%xmm3
845 pand -128(%rdx),%xmm0
846 pand -112(%rdx),%xmm1
847 por %xmm0,%xmm4
848 pand -96(%rdx),%xmm2
849 por %xmm1,%xmm5
850 pand -80(%rdx),%xmm3
851 por %xmm2,%xmm4
852 por %xmm3,%xmm5
853 movdqa -64(%r12),%xmm0
854 movdqa -48(%r12),%xmm1
855 movdqa -32(%r12),%xmm2
856 movdqa -16(%r12),%xmm3
857 pand -64(%rdx),%xmm0
858 pand -48(%rdx),%xmm1
859 por %xmm0,%xmm4
860 pand -32(%rdx),%xmm2
861 por %xmm1,%xmm5
862 pand -16(%rdx),%xmm3
863 por %xmm2,%xmm4
864 por %xmm3,%xmm5
865 movdqa 0(%r12),%xmm0
866 movdqa 16(%r12),%xmm1
867 movdqa 32(%r12),%xmm2
868 movdqa 48(%r12),%xmm3
869 pand 0(%rdx),%xmm0
870 pand 16(%rdx),%xmm1
871 por %xmm0,%xmm4
872 pand 32(%rdx),%xmm2
873 por %xmm1,%xmm5
874 pand 48(%rdx),%xmm3
875 por %xmm2,%xmm4
876 por %xmm3,%xmm5
877 movdqa 64(%r12),%xmm0
878 movdqa 80(%r12),%xmm1
879 movdqa 96(%r12),%xmm2
880 movdqa 112(%r12),%xmm3
881 pand 64(%rdx),%xmm0
882 pand 80(%rdx),%xmm1
883 por %xmm0,%xmm4
884 pand 96(%rdx),%xmm2
885 por %xmm1,%xmm5
886 pand 112(%rdx),%xmm3
887 por %xmm2,%xmm4
888 por %xmm3,%xmm5
889 por %xmm5,%xmm4
890 pshufd $0x4e,%xmm4,%xmm0
891 por %xmm4,%xmm0
892 leaq 256(%r12),%r12
# Hand-encoded movq %xmm0,%rbx, as in the first gather.
893.byte 102,72,15,126,195
894
# Word 0 of this pass: (%r14,%r9,1) is temporary word 0 (%r9 is negative
# here).  The carry left in %rdi by the previous pass is stored at (%r14)
# before %r14 is rewound to the start of the temporary vector.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800895 movq (%r14,%r9,1),%r10
896 movq %r8,%rbp
897 mulq %rbx
898 addq %rax,%r10
899 movq (%rcx),%rax
900 adcq $0,%rdx
901
Adam Langleyd9e397b2015-01-22 14:27:53 -0800902 imulq %r10,%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800903 movq %rdx,%r11
904 movq %rdi,(%r14)
905
Adam Langleyd9e397b2015-01-22 14:27:53 -0800906 leaq (%r14,%r9,1),%r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800907
908 mulq %rbp
909 addq %rax,%r10
910 movq 8(%rsi,%r9,1),%rax
911 adcq $0,%rdx
912 movq %rdx,%rdi
913
914 mulq %rbx
915 addq %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -0400916 movq 8(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800917 adcq $0,%rdx
918 addq 8(%r14),%r11
919 adcq $0,%rdx
920 movq %rdx,%r10
921
922 mulq %rbp
923 addq %rax,%rdi
924 movq 16(%rsi,%r9,1),%rax
925 adcq $0,%rdx
926 addq %r11,%rdi
927 leaq 32(%r9),%r15
David Benjamin4969cc92016-04-22 15:02:23 -0400928 leaq 32(%rcx),%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800929 adcq $0,%rdx
930 movq %rdx,%r13
931 jmp .Linner4x
932
# Same shape as .L1st4x, plus the running temporary words added in from
# (%r14).  Each result word is stored one slot lower than the temporary word
# that was read for it.
933.align 32
934.Linner4x:
935 mulq %rbx
936 addq %rax,%r10
David Benjamin4969cc92016-04-22 15:02:23 -0400937 movq -16(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800938 adcq $0,%rdx
939 addq 16(%r14),%r10
940 leaq 32(%r14),%r14
941 adcq $0,%rdx
942 movq %rdx,%r11
943
944 mulq %rbp
945 addq %rax,%r13
946 movq -8(%rsi,%r15,1),%rax
947 adcq $0,%rdx
948 addq %r10,%r13
949 adcq $0,%rdx
950 movq %rdi,-32(%r14)
951 movq %rdx,%rdi
952
953 mulq %rbx
954 addq %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -0400955 movq -8(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800956 adcq $0,%rdx
957 addq -8(%r14),%r11
958 adcq $0,%rdx
959 movq %rdx,%r10
960
961 mulq %rbp
962 addq %rax,%rdi
963 movq (%rsi,%r15,1),%rax
964 adcq $0,%rdx
965 addq %r11,%rdi
966 adcq $0,%rdx
967 movq %r13,-24(%r14)
968 movq %rdx,%r13
969
970 mulq %rbx
971 addq %rax,%r10
972 movq 0(%rcx),%rax
973 adcq $0,%rdx
974 addq (%r14),%r10
975 adcq $0,%rdx
976 movq %rdx,%r11
977
978 mulq %rbp
979 addq %rax,%r13
980 movq 8(%rsi,%r15,1),%rax
981 adcq $0,%rdx
982 addq %r10,%r13
983 adcq $0,%rdx
984 movq %rdi,-16(%r14)
985 movq %rdx,%rdi
986
987 mulq %rbx
988 addq %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -0400989 movq 8(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800990 adcq $0,%rdx
991 addq 8(%r14),%r11
992 adcq $0,%rdx
993 movq %rdx,%r10
994
995 mulq %rbp
996 addq %rax,%rdi
997 movq 16(%rsi,%r15,1),%rax
998 adcq $0,%rdx
999 addq %r11,%rdi
David Benjamin4969cc92016-04-22 15:02:23 -04001000 leaq 32(%rcx),%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001001 adcq $0,%rdx
1002 movq %r13,-8(%r14)
1003 movq %rdx,%r13
1004
1005 addq $32,%r15
1006 jnz .Linner4x
1007
# Tail of the pass.  In the last product the operands are swapped: the
# per-pass factor moves to %rax and %rbp is loaded with the word at
# -8(%rcx), so on exit %rbp holds that word of the (%rcx) array.
1008 mulq %rbx
1009 addq %rax,%r10
David Benjamin4969cc92016-04-22 15:02:23 -04001010 movq -16(%rcx),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001011 adcq $0,%rdx
1012 addq 16(%r14),%r10
1013 leaq 32(%r14),%r14
1014 adcq $0,%rdx
1015 movq %rdx,%r11
1016
1017 mulq %rbp
1018 addq %rax,%r13
1019 movq -8(%rsi),%rax
1020 adcq $0,%rdx
1021 addq %r10,%r13
1022 adcq $0,%rdx
1023 movq %rdi,-32(%r14)
1024 movq %rdx,%rdi
1025
1026 mulq %rbx
1027 addq %rax,%r11
1028 movq %rbp,%rax
David Benjamin4969cc92016-04-22 15:02:23 -04001029 movq -8(%rcx),%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001030 adcq $0,%rdx
1031 addq -8(%r14),%r11
1032 adcq $0,%rdx
1033 movq %rdx,%r10
1034
1035 mulq %rbp
1036 addq %rax,%rdi
1037 movq (%rsi,%r9,1),%rax
1038 adcq $0,%rdx
1039 addq %r11,%rdi
1040 adcq $0,%rdx
1041 movq %r13,-24(%r14)
1042 movq %rdx,%r13
1043
Adam Langleyd9e397b2015-01-22 14:27:53 -08001044 movq %rdi,-16(%r14)
David Benjamin4969cc92016-04-22 15:02:23 -04001045 leaq (%rcx,%r9,1),%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001046
1047 xorq %rdi,%rdi
1048 addq %r10,%r13
1049 adcq $0,%rdi
1050 addq (%r14),%r13
1051 adcq $0,%rdi
1052 movq %r13,-8(%r14)
1053
# Loop until %r12 reaches the end marker saved at 16+8(%rsp).
1054 cmpq 16+8(%rsp),%r12
1055 jb .Louter4x
# Register hand-off to .Lsqr4x_sub_entry (defined outside this view):
#   %rax = 0 - (%rdi | CF), where CF is the borrow of %rbp - %r13 and %r15
#          is zero on loop exit, so the adcq leaves just that borrow in it
#   %rbx = %r14 + %r9 (start of the temporary vector, since %r9 is negative)
#   %rbp = %rcx, %r12 = first word at %rcx minus 1, %r13/%r14/%r15 = the
#          words at 8, 16 and 24(%rbp)
#   %rcx = %r9 arithmetically shifted right by 5
#   %rdi = value saved at 56+8(%rsp), %r10 = 0
Adam Langleyd9e397b2015-01-22 14:27:53 -08001056 xorq %rax,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001057 subq %r13,%rbp
1058 adcq %r15,%r15
1059 orq %r15,%rdi
David Benjamin4969cc92016-04-22 15:02:23 -04001060 subq %rdi,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001061 leaq (%r14,%r9,1),%rbx
David Benjamin4969cc92016-04-22 15:02:23 -04001062 movq (%rcx),%r12
1063 leaq (%rcx),%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001064 movq %r9,%rcx
1065 sarq $3+2,%rcx
1066 movq 56+8(%rsp),%rdi
David Benjamin4969cc92016-04-22 15:02:23 -04001067 decq %r12
1068 xorq %r10,%r10
1069 movq 8(%rbp),%r13
1070 movq 16(%rbp),%r14
1071 movq 24(%rbp),%r15
1072 jmp .Lsqr4x_sub_entry
Adam Langleyd9e397b2015-01-22 14:27:53 -08001073.size mul4x_internal,.-mul4x_internal
1074.globl bn_power5
1075.hidden bn_power5
1076.type bn_power5,@function
/*
 * bn_power5
 *
 * Runs five rounds of (__bn_sqr8x_internal, __bn_post4x_internal) and then
 * one mul4x_internal, returning 1 in %rax.
 *
 * Register use visible in this routine:
 *   %rdi, %rcx, %rdx  parked in %xmm1, %xmm2, %xmm4 for the helpers
 *   %rsi              input pointer handed to the first squaring
 *   %r8               pointer; the 64-bit value it points at is stored at
 *                     32(%rsp) of the working frame
 *   %r9d              element count; scaled by 8 below
 * NOTE(review): presumably this is OpenSSL's
 *   bn_power5(rp, ap, table, np, n0, num, power) under the System V AMD64
 *   ABI; confirm against the C declaration.
 *
 * Callee-saved registers are pushed on the original stack and restored from
 * the original stack pointer, which is kept at 40(%rsp) of the working frame.
 */
.align	32
bn_power5:
.cfi_startproc
	movq	%rsp,%rax		/* %rax = original stack pointer */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpower5_prologue:

	shll	$3,%r9d			/* count -> bytes (8 per element) */
	leal	(%r9,%r9,2),%r10d	/* %r10d = 3 * bytes */
	negq	%r9			/* %r9 = -bytes */
	movq	(%r8),%r8		/* load the value %r8 points at */

	/*
	 * Choose the working frame address in %rbp.  The prospective frame is
	 * 2*bytes + 320 below %rsp; its distance from %rdi modulo 4096 decides
	 * which of the two placements is used.
	 * NOTE(review): presumably this avoids the frame and the output buffer
	 * aliasing modulo the page size; confirm against the perlasm source.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10			/* mov keeps the flags that cmovc reads */
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp		/* frame is 64-byte aligned */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* touch the stack at the new %rsp */
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

	/* Step %rsp down 4096 bytes at a time, loading from each step. */
.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10		/* %r10 = -bytes */
	negq	%r9			/* %r9  = +bytes */

	movq	%r8,32(%rsp)		/* value loaded from (%r8) above */
	movq	%rax,40(%rsp)		/* original stack pointer */
	/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
.byte	102,72,15,110,207		/* movq %rdi,%xmm1 */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 */
.byte	102,73,15,110,218		/* movq %r10,%xmm3 */
.byte	102,72,15,110,226		/* movq %rdx,%xmm4 */

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209		/* movq %xmm2,%rcx */
.byte	102,72,15,126,226		/* movq %xmm4,%rdx */
	movq	%rsi,%rdi
	movq	40(%rsp),%rax
	leaq	32(%rsp),%r8

	call	mul4x_internal

	movq	40(%rsp),%rsi		/* %rsi = original stack pointer */
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return value */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_power5,.-bn_power5
1205
/*
 * bn_sqr8x_internal / __bn_sqr8x_internal
 *
 * Squaring phase.  Execution falls through from the last instruction of this
 * part into __bn_sqr8x_reduction, which follows immediately in the file.
 *
 * This is an internal routine with a register-based contract, not a C-ABI
 * function.  As set up by bn_power5 in this file:
 *   %rsi          input pointer
 *   %r9           +bytes (8 * element count)
 *   %r10          -bytes
 *   48+8(%rsp)    start of the temporary area (the +8 skips the return
 *                 address pushed by the call)
 *   %xmm2         restored into %rbp at the end of this part
 *
 * Work visible below:
 *   1. .Lsqr4x_1st / .Lsqr4x_outer / .Lsqr4x_inner accumulate products of
 *      pairs of input words into the temporary area, indexing with negative
 *      offsets that count up to zero.
 *   2. .Lsqr4x_shift_n_add doubles the accumulated words (lea ...,2 plus the
 *      bit shifted out by shrq $63) and adds the square of each input word
 *      (mulq %rax with %rax as the multiplicand).
 *
 * The .byte 0x67 lines emit an address-size prefix ahead of an instruction
 * with no memory operand.
 * NOTE(review): presumably these are padding for code alignment; confirm.
 */
.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,@function
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:

	leaq	32(%r10),%rbp		/* %rbp = 32 - bytes (negative index) */
	leaq	(%rsi,%r9,1),%rsi	/* %rsi = end of input */

	movq	%r9,%rcx

	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

	/* First pass: results are stored, not added to earlier contents. */
.align	32
.Lsqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	.Lsqr4x_outer

	/* Later passes: products are added to what is already stored. */
.align	32
.Lsqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer

	/* Final group: %rbp is zero here, so fixed offsets from %rsi are used. */
	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp		/* %rbp = 16 - bytes */
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	/*
	 * Shift-and-add.  %r14 carries the bit shifted out of the previous
	 * word; %r15 holds the carry flag as 0 / -1 between groups
	 * (sbbq %r15,%r15 saves it, negq %r15 puts it back in CF).
	 * %rcx is zero here: the loops above exit only once it reaches zero.
	 */
	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213		/* movq %xmm2,%rbp */
/*
 * __bn_sqr8x_reduction
 *
 * Reduction phase, reached either by fall-through from the squaring code
 * directly above or by a direct call (bn_from_mont8x in this file calls it).
 * The file's ident string names the algorithm as Montgomery multiplication.
 *
 * Internal register contract, as used below:
 *   %rbp          pointer to the modulus words, read 64 bytes per pass
 *   %r9           +bytes on entry; negated here and reloaded from %xmm3
 *                 at the end of every outer iteration
 *   %xmm2         reloaded into %rbp at the end of every outer iteration
 *   32+8(%rsp)    per-word multiplier factor stored by the caller at
 *                 32(%rsp) of its frame (+8 skips the return address)
 *   48+8(%rsp)    start of the temporary area being reduced
 *   0+8(%rsp)     scratch: end-of-modulus pointer (%rbp + bytes)
 *   8+8(%rsp)     scratch: end-of-temporary-area pointer
 * On return %rax holds the accumulated top carry and %rdi points at the end
 * of the temporary area.
 *
 * The .byte 0x66 / 0x67 lines emit a lone prefix byte ahead of the next
 * instruction.
 * NOTE(review): presumably padding for code alignment; confirm.
 */
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)		/* park the running top carry */
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx		/* multiplier for this word */
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

	/*
	 * Eight iterations; each folds one multiplier into %r8..%r15 and saves
	 * that multiplier in the frame for the tail passes.
	 */
.align	32
.L8x_reduce:
	mulq	%rbx
	movq	8(%rbp),%rax
	/*
	 * CF = (%r8 != 0); the low product word is dropped, not added.
	 * NOTE(review): presumably the low word of the sum is zero by
	 * construction of the multiplier; confirm.
	 */
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	/* save multiplier for the tail */
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* next multiplier */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp		/* past the end of the modulus? */
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		/* %rsi = carry as 0 / -1 */

	movq	48+56+8(%rsp),%rbx	/* first saved multiplier */
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

	/*
	 * Tail: apply the eight saved multipliers to the next 64 bytes of the
	 * modulus, storing one finished word per iteration.
	 */
.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	/* next saved multiplier */
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi			/* restore the saved carry into CF */
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8		/* add the top carry parked at (%rdx) */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		/* movq %xmm2,%rbp */

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		/* movq %xmm3,%r9 */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3		/* rep ret */
/* This size covers the squaring code above as well as the reduction. */
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal
 *
 * Masked final subtraction, four words per iteration.  For every word it
 * computes  out = t + ((~n) & mask)  with carry propagation, where the first
 * word uses ~(n[0] - 1), i.e. the twos-complement negation of n[0].  With an
 * all-ones mask this adds -n to t; with a zero mask t is copied unchanged.
 * Both cases execute the same instructions.
 *
 * Internal register contract, as used below:
 *   %rax   negated on entry to form the mask
 *          NOTE(review): presumably the 0/1 top carry left in %rax by
 *          __bn_sqr8x_reduction; confirm.
 *   %rbp   pointer to the n words
 *   %rdi   on entry %rdi + %r9 gives the source pointer t (kept in %rbx)
 *   %r9    negative byte count on entry; 32 bytes are consumed per iteration
 *   %xmm1  output pointer, loaded into both %rdi and %rsi
 * On exit %r10 = the entry value of %r9 and %r9 is negated.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx	/* %rbx = source words */
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi */
	negq	%rax			/* %rax = mask */
.byte	102,72,15,126,206		/* movq %xmm1,%rsi */
	sarq	$3+2,%rcx		/* negative count of 4-word groups */
	decq	%r12			/* with notq below: -n[0] */
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* restore carry from the previous group */
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* save carry as 0 / -1 */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3		/* rep ret */
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery
 *
 * Dispatcher.  If the low three bits of %r9d are all zero, control transfers
 * to bn_from_mont8x with every register untouched (a tail jump, so that
 * routine returns straight to the caller).  Otherwise returns 0 in %eax
 * without doing any work.
 * NOTE(review): presumably %r9d is the word count and a zero return tells
 * the C caller to use another implementation; confirm.
 */
.globl	bn_from_montgomery
.hidden	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_montgomery,.-bn_from_montgomery
2052
/*
 * bn_from_mont8x
 *
 * Reached from bn_from_montgomery when the low three bits of %r9d are zero.
 * Copies the input into a stack frame with the upper half zeroed, runs
 * __bn_sqr8x_reduction and __bn_post4x_internal on it, overwrites the
 * temporary area with zeros, and returns 1 in %rax.
 *
 * Register use visible in this routine:
 *   %rdi   parked in %xmm1 (the helpers load their output pointer from it)
 *   %rsi   input pointer, read 64 bytes per iteration
 *   %rcx   parked in %xmm2 and copied to %rbp for the reduction
 *   %r8    pointer; the 64-bit value it points at is stored at 32(%rsp)
 *   %r9d   element count; scaled by 8 below
 *
 * Callee-saved registers are pushed on the original stack and restored from
 * the original stack pointer, which is kept at 40(%rsp) of the working frame.
 * The .byte 0x67 lines emit a lone address-size prefix.
 * NOTE(review): presumably padding for code alignment; confirm.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax		/* %rax = original stack pointer */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d			/* count -> bytes (8 per element) */
	leaq	(%r9,%r9,2),%r10	/* %r10 = 3 * bytes */
	negq	%r9			/* %r9 = -bytes */
	movq	(%r8),%r8		/* load the value %r8 points at */

	/*
	 * Choose the working frame address in %rbp; same scheme as the
	 * bn_power5 prologue in this file (distance from %rdi modulo 4096
	 * selects one of two placements).
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10			/* mov keeps the flags that cmovc reads */
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp		/* frame is 64-byte aligned */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* touch the stack at the new %rsp */
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

	/* Step %rsp down 4096 bytes at a time, loading from each step. */
.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10		/* %r10 = -bytes */
	negq	%r9			/* %r9  = +bytes */

	movq	%r8,32(%rsp)		/* value loaded from (%r8) above */
	movq	%rax,40(%rsp)		/* original stack pointer */
	/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11		/* bytes left to copy */
	leaq	48(%rsp),%rax		/* start of the temporary area */
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

	/*
	 * Copy 64 input bytes per iteration into the lower half of the
	 * temporary area and zero the matching 64 bytes %r9 above them.
	 */
.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi (disp32 form) */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		/* movq %rdi,%xmm1 */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		/* movq %r10,%xmm3 */
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

	/*
	 * Overwrite the temporary area with zeros: 64 bytes are stored while
	 * %r9 drops by 32 per iteration, so 2 * %r9 bytes are cleared in all.
	 */
.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi		/* %rsi = original stack pointer */
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* return value */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_scatter5
 *
 * Copies %esi 64-bit words from (%rdi) into a table at %rdx, placing word i
 * at byte offset  %rcx * 8 + i * 256.  Returns immediately when %esi is zero.
 *
 * In:    %rdi = source words
 *        %esi = number of words
 *        %rdx = table base
 *        %rcx = slot index (scaled by 8)
 * Clobb: %rax, %rdx, %rdi, %esi, flags
 * The 256-byte row stride matches what bn_gather5 in this file reads back.
 */
.globl	bn_scatter5
.hidden	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	/* first destination slot */
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		/* next row */
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_scatter5,.-bn_scatter5
2223
/*
 * bn_gather5
 *
 * Reads back %esi 64-bit words for slot %ecx from a table laid out in
 * 256-byte rows (one row per output word, 32 eight-byte slots per row).
 *
 * Every iteration loads all 256 bytes of the row, ANDs each 16-byte chunk
 * with a precomputed mask and ORs the results together, so the addresses
 * touched do not depend on the slot index.
 *
 * In:    %rdi = destination words
 *        %esi = number of words (the loop body runs before the count is
 *               tested)
 *        %rdx = table base; rows are read with movdqa, which requires
 *               16-byte alignment
 *        %ecx = slot index
 * Stack: 0x108 bytes are reserved and %rsp is rounded down to 16 bytes; the
 *        original %rsp is kept in %r10 and restored before returning.
 * Clobb: %rax, %r10, %r11, %rdi, %esi, %xmm0-%xmm5, flags
 */
.globl	bn_gather5
.hidden	bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24		/* leaq (%rsp),%r10 */
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* subq $0x108,%rsp */
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0		/* counters {0,0,1,1} */
	movdqa	16(%rax),%xmm1		/* step     {2,2,2,2} */
	leaq	128(%rdx),%r11		/* biased table pointer */
	leaq	128(%rsp),%rax		/* biased mask pointer */

	/*
	 * Build sixteen 16-byte masks at -128(%rax) .. 112(%rax).  Mask k is
	 * pcmpeqd of the broadcast index against counters {2k,2k,2k+1,2k+1},
	 * so exactly one 64-bit half of one mask is all ones for an index in
	 * 0..31.  %xmm4 keeps the step; %xmm0-%xmm3 rotate as counters.
	 */
	pshufd	$0,%xmm5,%xmm5		/* broadcast index to all lanes */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

	/* One output word per iteration; %r11 advances one 256-byte row. */
.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11
	pshufd	$0x4e,%xmm4,%xmm0	/* swap the two 64-bit halves */
	por	%xmm4,%xmm0		/* fold the selected half into the low qword */
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		/* restore the original stack pointer */
	.byte	0xf3,0xc3		/* rep ret */
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
/*
 * .Linc: two 16-byte constants loaded by bn_gather5 with movdqa.
 *   first  = {0,0,1,1}  initial lane counters
 *   second = {2,2,2,2}  per-mask counter step
 * The .byte run after them is the NUL-terminated ASCII ident string
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
 * <appro@openssl.org>".
 */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2393#endif