blob: 1bcbc5d097a227a3b2895662e5a0ef24b7a0e834 [file] [log] [blame]
; Make memory operands without an explicit base register RIP-relative by default
; (e.g. the `lea r10,[$L$inc]` used later in this file).
Adam Langleye9ada862015-05-11 17:20:37 -07001default rel
; The size keywords below are defined as empty macros, so operands written as
; XMMWORD[...] / YMMWORD[...] / ZMMWORD[...] assemble as plain [...].
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section .text code align=64
Adam Langleyd9e397b2015-01-22 14:27:53 -08006
Adam Langleyd9e397b2015-01-22 14:27:53 -08007
; External symbol declaration. NOTE(review): no use of it is visible in this
; part of the file -- confirm it is referenced further down.
Adam Langleye9ada862015-05-11 17:20:37 -07008EXTERN OPENSSL_ia32cap_P
9
; Export the entry point defined immediately below.
10global bn_mul_mont_gather5
Adam Langleyd9e397b2015-01-22 14:27:53 -080011
12ALIGN 64
; bn_mul_mont_gather5 -- exported entry point, Microsoft x64 calling convention.
;
; Arguments, described by how the code uses them (the name suggests Montgomery
; multiplication with a table gather; conventional parameter names are an
; assumption -- TODO confirm against the C prototype):
;   arg1 rcx       -> rdi  : destination qword array (written in $L$sub/$L$copy)
;   arg2 rdx       -> rsi  : qword array, multiplied word by word
;   arg3 r8        -> rdx  : base of a table that is read in 16-byte chunks
;   arg4 r9        -> rcx  : qword array (subtracted from the result in $L$sub)
;   arg5 [40+rsp]  -> r8   : pointer, dereferenced once to a single qword
;   arg6 [48+rsp]  -> r9   : 32-bit word count
;   arg7 [56+rsp]  -> xmm5 : 32-bit index used to select a table entry
; Returns rax = 1.
;
; This section remaps the arguments, dispatches on the word count, saves the
; callee-saved registers and carves a temporary frame out of the stack.
Adam Langleye9ada862015-05-11 17:20:37 -070013bn_mul_mont_gather5:
; rdi/rsi are callee-saved on Win64: park them in the caller-provided home slots.
14 mov QWORD[8+rsp],rdi ;WIN64 prologue
15 mov QWORD[16+rsp],rsi
Adam Langleyd9e397b2015-01-22 14:27:53 -080016 mov rax,rsp
Adam Langleye9ada862015-05-11 17:20:37 -070017$L$SEH_begin_bn_mul_mont_gather5:
; Move the four register arguments into rdi, rsi, rdx, rcx and fetch the fifth
; and sixth arguments from the stack into r8 and r9.
Adam Langleyd9e397b2015-01-22 14:27:53 -080018 mov rdi,rcx
19 mov rsi,rdx
20 mov rdx,r8
21 mov rcx,r9
Adam Langleye9ada862015-05-11 17:20:37 -070022 mov r8,QWORD[40+rsp]
23 mov r9,QWORD[48+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -080024
25
Robert Sloana94fe052017-02-21 08:49:28 -080026
; Writing r9d zero-extends into r9: only the low 32 bits of the count are used.
27 mov r9d,r9d
; rax = stack pointer at entry; kept so the frame can be unwound at the end.
28 mov rax,rsp
29
; Word counts that are a multiple of 8 take the 4x path; all others fall
; into the generic path at $L$mul_enter.
Adam Langleyd9e397b2015-01-22 14:27:53 -080030 test r9d,7
Adam Langleye9ada862015-05-11 17:20:37 -070031 jnz NEAR $L$mul_enter
32 jmp NEAR $L$mul4x_enter
Adam Langleyd9e397b2015-01-22 14:27:53 -080033
34ALIGN 16
Adam Langleye9ada862015-05-11 17:20:37 -070035$L$mul_enter:
; Load the 32-bit seventh argument while rsp still has its entry value.
David Benjamin4969cc92016-04-22 15:02:23 -040036 movd xmm5,DWORD[56+rsp]
; Save the six callee-saved GPRs; they end up at [entry_rsp-8 .. entry_rsp-48].
Adam Langleyd9e397b2015-01-22 14:27:53 -080037 push rbx
Robert Sloana94fe052017-02-21 08:49:28 -080038
Adam Langleyd9e397b2015-01-22 14:27:53 -080039 push rbp
Robert Sloana94fe052017-02-21 08:49:28 -080040
Adam Langleyd9e397b2015-01-22 14:27:53 -080041 push r12
Robert Sloana94fe052017-02-21 08:49:28 -080042
Adam Langleyd9e397b2015-01-22 14:27:53 -080043 push r13
Robert Sloana94fe052017-02-21 08:49:28 -080044
Adam Langleyd9e397b2015-01-22 14:27:53 -080045 push r14
Robert Sloana94fe052017-02-21 08:49:28 -080046
Adam Langleyd9e397b2015-01-22 14:27:53 -080047 push r15
David Benjamin4969cc92016-04-22 15:02:23 -040048
Adam Langleyd9e397b2015-01-22 14:27:53 -080049
; r10 = (rsp - 280 - 8*count) rounded down to a 1024-byte boundary: the target
; frame address. r9 is negated only for the lea and restored right after.
Robert Sloana94fe052017-02-21 08:49:28 -080050 neg r9
51 mov r11,rsp
52 lea r10,[((-280))+r9*8+rsp]
53 neg r9
54 and r10,-1024
55
56
57
58
59
60
61
62
63
; Move rsp down to r10 in steps of at most 4096 bytes, reading one qword at
; each step. First land on r10 plus a whole number of 4096-byte steps ...
64 sub r11,r10
65 and r11,-4096
66 lea rsp,[r11*1+r10]
67 mov r11,QWORD[rsp]
68 cmp rsp,r10
69 ja NEAR $L$mul_page_walk
70 jmp NEAR $L$mul_page_walk_done
71
; ... then keep stepping down 4096 bytes at a time until rsp is no longer above r10.
72$L$mul_page_walk:
73 lea rsp,[((-4096))+rsp]
74 mov r11,QWORD[rsp]
75 cmp rsp,r10
76 ja NEAR $L$mul_page_walk
77$L$mul_page_walk_done:
78
; r10 = address of the $L$inc constants (defined outside the visible part of the file).
79 lea r10,[$L$inc]
; Stash the entry stack pointer just above the count+1 qword temporaries at [rsp].
Adam Langleye9ada862015-05-11 17:20:37 -070080 mov QWORD[8+r9*8+rsp],rax
Robert Sloana94fe052017-02-21 08:49:28 -080081
; $L$mul_body -- build 16 select masks from the index in xmm5, gather the first
; multiplier word from the table with them, and start the first pass.
; On entry: rdx = table base, r10 = address of $L$inc, r9 = word count,
; rsi / rcx = input qword arrays, r8 = pointer to a single qword,
; xmm5 = 32-bit index in its low dword.
Adam Langleye9ada862015-05-11 17:20:37 -070082$L$mul_body:
Robert Sloana94fe052017-02-21 08:49:28 -080083
; r12 = table + 128, so that 256 bytes are reachable with displacements -128..+112.
David Benjamin4969cc92016-04-22 15:02:23 -040084 lea r12,[128+rdx]
; xmm0 / xmm1 = the two 16-byte constants at $L$inc. Their values are not
; visible here; presumably a starting counter vector and a per-step increment
; -- TODO confirm against the $L$inc definition.
85 movdqa xmm0,XMMWORD[r10]
86 movdqa xmm1,XMMWORD[16+r10]
; r10 = 16-byte aligned address in the frame such that the mask area starts at
; [112+r10], above the count+1 qword temporaries and the saved stack pointer.
87 lea r10,[((24-112))+r9*8+rsp]
88 and r10,-16
Adam Langleyd9e397b2015-01-22 14:27:53 -080089
; Broadcast the index to all four dword lanes of xmm5.
David Benjamin4969cc92016-04-22 15:02:23 -040090 pshufd xmm5,xmm5,0
; xmm4 keeps the increment. The sequence below rotates through xmm0..xmm3:
; each step adds the increment to get the next counter vector (paddd), compares
; the previous counter with the index (pcmpeqd -> all-ones lanes where equal,
; zero elsewhere) and stores that mask. 16 masks land at [112+r10]..[352+r10].
91 movdqa xmm4,xmm1
92 movdqa xmm2,xmm1
93 paddd xmm1,xmm0
94 pcmpeqd xmm0,xmm5
; 0x67 is an address-size prefix byte emitted in front of the next instruction;
; presumably used only as padding -- TODO confirm.
95DB 0x67
96 movdqa xmm3,xmm4
97 paddd xmm2,xmm1
98 pcmpeqd xmm1,xmm5
99 movdqa XMMWORD[112+r10],xmm0
100 movdqa xmm0,xmm4
101
102 paddd xmm3,xmm2
103 pcmpeqd xmm2,xmm5
104 movdqa XMMWORD[128+r10],xmm1
105 movdqa xmm1,xmm4
106
107 paddd xmm0,xmm3
108 pcmpeqd xmm3,xmm5
109 movdqa XMMWORD[144+r10],xmm2
110 movdqa xmm2,xmm4
111
112 paddd xmm1,xmm0
113 pcmpeqd xmm0,xmm5
114 movdqa XMMWORD[160+r10],xmm3
115 movdqa xmm3,xmm4
116 paddd xmm2,xmm1
117 pcmpeqd xmm1,xmm5
118 movdqa XMMWORD[176+r10],xmm0
119 movdqa xmm0,xmm4
120
121 paddd xmm3,xmm2
122 pcmpeqd xmm2,xmm5
123 movdqa XMMWORD[192+r10],xmm1
124 movdqa xmm1,xmm4
125
126 paddd xmm0,xmm3
127 pcmpeqd xmm3,xmm5
128 movdqa XMMWORD[208+r10],xmm2
129 movdqa xmm2,xmm4
130
131 paddd xmm1,xmm0
132 pcmpeqd xmm0,xmm5
133 movdqa XMMWORD[224+r10],xmm3
134 movdqa xmm3,xmm4
135 paddd xmm2,xmm1
136 pcmpeqd xmm1,xmm5
137 movdqa XMMWORD[240+r10],xmm0
138 movdqa xmm0,xmm4
139
140 paddd xmm3,xmm2
141 pcmpeqd xmm2,xmm5
142 movdqa XMMWORD[256+r10],xmm1
143 movdqa xmm1,xmm4
144
145 paddd xmm0,xmm3
146 pcmpeqd xmm3,xmm5
147 movdqa XMMWORD[272+r10],xmm2
148 movdqa xmm2,xmm4
149
150 paddd xmm1,xmm0
151 pcmpeqd xmm0,xmm5
152 movdqa XMMWORD[288+r10],xmm3
153 movdqa xmm3,xmm4
154 paddd xmm2,xmm1
155 pcmpeqd xmm1,xmm5
156 movdqa XMMWORD[304+r10],xmm0
157
158 paddd xmm3,xmm2
159DB 0x67
160 pcmpeqd xmm2,xmm5
161 movdqa XMMWORD[320+r10],xmm1
162
163 pcmpeqd xmm3,xmm5
164 movdqa XMMWORD[336+r10],xmm2
; The last four masks are still live in xmm0..xmm3: apply them directly to the
; last four 16-byte chunks of the 256-byte table row.
165 pand xmm0,XMMWORD[64+r12]
166
167 pand xmm1,XMMWORD[80+r12]
168 pand xmm2,XMMWORD[96+r12]
169 movdqa XMMWORD[352+r10],xmm3
170 pand xmm3,XMMWORD[112+r12]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800171 por xmm0,xmm2
David Benjamin4969cc92016-04-22 15:02:23 -0400172 por xmm1,xmm3
; Remaining twelve chunks: load each one, AND it with its stored mask and OR
; it into the accumulators xmm0/xmm1. Every chunk is read whatever the index is.
173 movdqa xmm4,XMMWORD[((-128))+r12]
174 movdqa xmm5,XMMWORD[((-112))+r12]
175 movdqa xmm2,XMMWORD[((-96))+r12]
176 pand xmm4,XMMWORD[112+r10]
177 movdqa xmm3,XMMWORD[((-80))+r12]
178 pand xmm5,XMMWORD[128+r10]
179 por xmm0,xmm4
180 pand xmm2,XMMWORD[144+r10]
181 por xmm1,xmm5
182 pand xmm3,XMMWORD[160+r10]
183 por xmm0,xmm2
184 por xmm1,xmm3
185 movdqa xmm4,XMMWORD[((-64))+r12]
186 movdqa xmm5,XMMWORD[((-48))+r12]
187 movdqa xmm2,XMMWORD[((-32))+r12]
188 pand xmm4,XMMWORD[176+r10]
189 movdqa xmm3,XMMWORD[((-16))+r12]
190 pand xmm5,XMMWORD[192+r10]
191 por xmm0,xmm4
192 pand xmm2,XMMWORD[208+r10]
193 por xmm1,xmm5
194 pand xmm3,XMMWORD[224+r10]
195 por xmm0,xmm2
196 por xmm1,xmm3
197 movdqa xmm4,XMMWORD[r12]
198 movdqa xmm5,XMMWORD[16+r12]
199 movdqa xmm2,XMMWORD[32+r12]
200 pand xmm4,XMMWORD[240+r10]
201 movdqa xmm3,XMMWORD[48+r12]
202 pand xmm5,XMMWORD[256+r10]
203 por xmm0,xmm4
204 pand xmm2,XMMWORD[272+r10]
205 por xmm1,xmm5
206 pand xmm3,XMMWORD[288+r10]
207 por xmm0,xmm2
208 por xmm1,xmm3
; Fold the two accumulators, then OR the upper 64 bits onto the lower 64 bits
; (pshufd 0x4e swaps the two qword halves).
209 por xmm0,xmm1
210 pshufd xmm1,xmm0,0x4e
211 por xmm0,xmm1
; Advance r12 by one 256-byte table row for the next gather.
Adam Langleye9ada862015-05-11 17:20:37 -0700212 lea r12,[256+r12]
; Hand-encoded `movq rbx,xmm0` (66 48 0F 7E C3): rbx = gathered multiplier word.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800213DB 102,72,15,126,195
214
; r8 = the qword that the fifth argument points to; rax = rsi[0].
Adam Langleye9ada862015-05-11 17:20:37 -0700215 mov r8,QWORD[r8]
216 mov rax,QWORD[rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800217
; r14 = outer-loop counter, r15 = inner-loop word index; both start at zero.
218 xor r14,r14
219 xor r15,r15
220
; rdx:rax = rsi[0] * rbx; r10 keeps the low half, r11 the high half.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800221 mov rbp,r8
222 mul rbx
223 mov r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700224 mov rax,QWORD[rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800225
; rbp = r8 * r10 (low 64 bits): the per-iteration multiplier for the rcx array.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800226 imul rbp,r10
227 mov r11,rdx
228
; rdx:rax = rcx[0] * rbp. The low-word sum into r10 is only needed for its
; carry; the carry-adjusted high half becomes the running carry r13.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800229 mul rbp
230 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700231 mov rax,QWORD[8+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800232 adc rdx,0
233 mov r13,rdx
234
; Start the first pass at word index 1, entering the loop in the middle.
Adam Langleye9ada862015-05-11 17:20:37 -0700235 lea r15,[1+r15]
236 jmp NEAR $L$1st_enter
Adam Langleyd9e397b2015-01-22 14:27:53 -0800237
; $L$1st / $L$1st_enter -- first pass over the words; fills the qword
; temporaries at [rsp] (nothing is read back from them in this pass).
; Live registers: rbx = multiplier word for the rsi array, rbp = multiplier for
; the rcx array, r9 = word count, r15 = word index, rax = next rsi word,
; r11 / r10 = carry words of the rsi*rbx chain, r13 = carry of the rcx*rbp chain.
238ALIGN 16
Adam Langleye9ada862015-05-11 17:20:37 -0700239$L$1st:
; Finish the previous word: r13 = low(rcx*rbp) + carry + r11, with the carries
; collected in rdx; store it two slots below the (already incremented) index.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800240 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700241 mov rax,QWORD[r15*8+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800242 adc rdx,0
243 add r13,r11
244 mov r11,r10
245 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700246 mov QWORD[((-16))+r15*8+rsp],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800247 mov r13,rdx
248
Adam Langleye9ada862015-05-11 17:20:37 -0700249$L$1st_enter:
; rdx:rax = rsi[r15] * rbx; add the low half into r11, keep the high half in r10.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800250 mul rbx
251 add r11,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700252 mov rax,QWORD[r15*8+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800253 adc rdx,0
; lea is used for the increment so the flags are left untouched.
Adam Langleye9ada862015-05-11 17:20:37 -0700254 lea r15,[1+r15]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800255 mov r10,rdx
256
; rdx:rax = rcx[old r15] * rbp; loop until the index reaches the word count.
257 mul rbp
258 cmp r15,r9
Adam Langleye9ada862015-05-11 17:20:37 -0700259 jne NEAR $L$1st
Adam Langleyd9e397b2015-01-22 14:27:53 -0800260
Adam Langleyd9e397b2015-01-22 14:27:53 -0800261
; Tail: complete the last word and store it at slot count-2.
262 add r13,rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800263 adc rdx,0
264 add r13,r11
265 adc rdx,0
David Benjamin4969cc92016-04-22 15:02:23 -0400266 mov QWORD[((-16))+r9*8+rsp],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800267 mov r13,rdx
268 mov r11,r10
269
; Combine the two remaining carry words: the sum goes to slot count-1 and its
; carry-out (0 or 1) to slot count.
270 xor rdx,rdx
271 add r13,r11
272 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700273 mov QWORD[((-8))+r9*8+rsp],r13
274 mov QWORD[r9*8+rsp],rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800275
; One outer iteration done; continue with the outer loop.
Adam Langleye9ada862015-05-11 17:20:37 -0700276 lea r14,[1+r14]
277 jmp NEAR $L$outer
; $L$outer -- head of each outer iteration after the first: gather the next
; multiplier word from the table using the masks stored in the frame, then
; process word 0 and enter the inner loop.
; Live registers: r12 = current table row + 128, r9 = word count, r8 = the
; constant loaded from the fifth argument, rsi / rcx = input qword arrays,
; r14 = outer counter, [rsp] = count+1 qword temporaries.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800278ALIGN 16
Adam Langleye9ada862015-05-11 17:20:37 -0700279$L$outer:
; rdx = 16-byte aligned mask area + 128, so the 16 masks sit at [rdx-128]..[rdx+112].
David Benjamin4969cc92016-04-22 15:02:23 -0400280 lea rdx,[((24+128))+r9*8+rsp]
281 and rdx,-16
; xmm4 / xmm5 accumulate the selection: each of the 16 chunks of the 256-byte
; row is loaded, ANDed with its mask and ORed in. All chunks are always read.
282 pxor xmm4,xmm4
283 pxor xmm5,xmm5
284 movdqa xmm0,XMMWORD[((-128))+r12]
285 movdqa xmm1,XMMWORD[((-112))+r12]
286 movdqa xmm2,XMMWORD[((-96))+r12]
287 movdqa xmm3,XMMWORD[((-80))+r12]
288 pand xmm0,XMMWORD[((-128))+rdx]
289 pand xmm1,XMMWORD[((-112))+rdx]
290 por xmm4,xmm0
291 pand xmm2,XMMWORD[((-96))+rdx]
292 por xmm5,xmm1
293 pand xmm3,XMMWORD[((-80))+rdx]
294 por xmm4,xmm2
295 por xmm5,xmm3
296 movdqa xmm0,XMMWORD[((-64))+r12]
297 movdqa xmm1,XMMWORD[((-48))+r12]
298 movdqa xmm2,XMMWORD[((-32))+r12]
299 movdqa xmm3,XMMWORD[((-16))+r12]
300 pand xmm0,XMMWORD[((-64))+rdx]
301 pand xmm1,XMMWORD[((-48))+rdx]
302 por xmm4,xmm0
303 pand xmm2,XMMWORD[((-32))+rdx]
304 por xmm5,xmm1
305 pand xmm3,XMMWORD[((-16))+rdx]
306 por xmm4,xmm2
307 por xmm5,xmm3
308 movdqa xmm0,XMMWORD[r12]
309 movdqa xmm1,XMMWORD[16+r12]
310 movdqa xmm2,XMMWORD[32+r12]
311 movdqa xmm3,XMMWORD[48+r12]
312 pand xmm0,XMMWORD[rdx]
313 pand xmm1,XMMWORD[16+rdx]
314 por xmm4,xmm0
315 pand xmm2,XMMWORD[32+rdx]
316 por xmm5,xmm1
317 pand xmm3,XMMWORD[48+rdx]
318 por xmm4,xmm2
319 por xmm5,xmm3
320 movdqa xmm0,XMMWORD[64+r12]
321 movdqa xmm1,XMMWORD[80+r12]
322 movdqa xmm2,XMMWORD[96+r12]
323 movdqa xmm3,XMMWORD[112+r12]
324 pand xmm0,XMMWORD[64+rdx]
325 pand xmm1,XMMWORD[80+rdx]
326 por xmm4,xmm0
327 pand xmm2,XMMWORD[96+rdx]
328 por xmm5,xmm1
329 pand xmm3,XMMWORD[112+rdx]
330 por xmm4,xmm2
331 por xmm5,xmm3
; Fold the accumulators and OR the upper qword onto the lower one
; (pshufd 0x4e swaps the two qword halves).
332 por xmm4,xmm5
333 pshufd xmm0,xmm4,0x4e
334 por xmm0,xmm4
; Advance to the next 256-byte table row.
335 lea r12,[256+r12]
336
337 mov rax,QWORD[rsi]
; Hand-encoded `movq rbx,xmm0` (66 48 0F 7E C3): rbx = gathered multiplier word.
338DB 102,72,15,126,195
339
; Restart the word index; r10 = temporary word 0 from the previous iteration.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800340 xor r15,r15
341 mov rbp,r8
Adam Langleye9ada862015-05-11 17:20:37 -0700342 mov r10,QWORD[rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800343
; r10 += low(rsi[0] * rbx); the high half plus carry goes to r11.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800344 mul rbx
345 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700346 mov rax,QWORD[rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800347 adc rdx,0
348
; rbp = r8 * r10 (low 64 bits): multiplier for the rcx array in this iteration.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800349 imul rbp,r10
350 mov r11,rdx
351
; rdx:rax = rcx[0] * rbp. The low-word sum is only needed for its carry; r10 is
; then reloaded with temporary word 1 and r13 takes the running carry.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800352 mul rbp
353 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700354 mov rax,QWORD[8+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800355 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700356 mov r10,QWORD[8+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800357 mov r13,rdx
358
; Enter the inner loop in the middle, at word index 1.
Adam Langleye9ada862015-05-11 17:20:37 -0700359 lea r15,[1+r15]
360 jmp NEAR $L$inner_enter
Adam Langleyd9e397b2015-01-22 14:27:53 -0800361
; $L$inner / $L$inner_enter -- inner loop of every outer iteration after the
; first. Same multiply-accumulate shape as the first pass, but each word also
; adds the temporary left at [rsp] by the previous outer iteration.
; Live registers: rbx / rbp = multipliers for the rsi / rcx arrays, r9 = word
; count, r15 = word index, r14 = outer counter, r10 = temporary word being
; accumulated, r11 = carry of the rsi*rbx chain, r13 = carry of the rcx*rbp chain.
362ALIGN 16
Adam Langleye9ada862015-05-11 17:20:37 -0700363$L$inner:
; Finish the previous word: r13 = low(rcx*rbp) + carry + r10, store it two
; slots below the (already incremented) index, and fetch the next temporary.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800364 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700365 mov rax,QWORD[r15*8+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800366 adc rdx,0
367 add r13,r10
Adam Langleye9ada862015-05-11 17:20:37 -0700368 mov r10,QWORD[r15*8+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800369 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700370 mov QWORD[((-16))+r15*8+rsp],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800371 mov r13,rdx
372
Adam Langleye9ada862015-05-11 17:20:37 -0700373$L$inner_enter:
; rdx:rax = rsi[r15] * rbx; r10 = temporary + carry-in + low half, and r11
; becomes the new carry (high half plus both carry-outs).
Adam Langleyd9e397b2015-01-22 14:27:53 -0800374 mul rbx
375 add r11,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700376 mov rax,QWORD[r15*8+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800377 adc rdx,0
378 add r10,r11
; mov does not touch the flags, so the adc below still sees the carry of the add above.
379 mov r11,rdx
380 adc r11,0
Adam Langleye9ada862015-05-11 17:20:37 -0700381 lea r15,[1+r15]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800382
; rdx:rax = rcx[old r15] * rbp; loop until the index reaches the word count.
383 mul rbp
384 cmp r15,r9
Adam Langleye9ada862015-05-11 17:20:37 -0700385 jne NEAR $L$inner
Adam Langleyd9e397b2015-01-22 14:27:53 -0800386
; Tail: complete the last word (slot count-2) and pick up the top temporary word.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800387 add r13,rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800388 adc rdx,0
389 add r13,r10
David Benjamin4969cc92016-04-22 15:02:23 -0400390 mov r10,QWORD[r9*8+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800391 adc rdx,0
David Benjamin4969cc92016-04-22 15:02:23 -0400392 mov QWORD[((-16))+r9*8+rsp],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800393 mov r13,rdx
394
; Sum both carry words and the previous top word: the result goes to slot
; count-1 and the accumulated carry-out to slot count.
395 xor rdx,rdx
396 add r13,r11
397 adc rdx,0
398 add r13,r10
399 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700400 mov QWORD[((-8))+r9*8+rsp],r13
401 mov QWORD[r9*8+rsp],rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800402
; Next outer iteration while r14 < word count (unsigned compare).
Adam Langleye9ada862015-05-11 17:20:37 -0700403 lea r14,[1+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800404 cmp r14,r9
Adam Langleye9ada862015-05-11 17:20:37 -0700405 jb NEAR $L$outer
Adam Langleyd9e397b2015-01-22 14:27:53 -0800406
; Set up the final subtraction: r14 = 0 is the word index, and the xor also
; clears CF so the first sbb in $L$sub starts without a borrow (the mov/lea/jmp
; that follow leave the flags alone). rsi = temporaries, r15 = words remaining.
407 xor r14,r14
Adam Langleye9ada862015-05-11 17:20:37 -0700408 mov rax,QWORD[rsp]
409 lea rsi,[rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800410 mov r15,r9
Adam Langleye9ada862015-05-11 17:20:37 -0700411 jmp NEAR $L$sub
; $L$sub / $L$copy / epilogue -- subtract the rcx array from the temporaries,
; pick either the difference or the original temporaries as the result, copy
; it to rdi, overwrite the temporaries, then unwind the frame and return 1.
; On entry: rsi = rsp = temporaries, rax = temporary word 0, r14 = 0,
; r15 = r9 = word count, CF clear (set up by the code that jumps here).
Adam Langleyd9e397b2015-01-22 14:27:53 -0800412ALIGN 16
Robert Sloan8ff03552017-06-14 12:40:58 -0700413$L$sub:
; rdi[r14] = temporary[r14] - rcx[r14] - borrow. lea and dec leave CF intact,
; so the borrow is carried from one iteration into the next.
414 sbb rax,QWORD[r14*8+rcx]
Adam Langleye9ada862015-05-11 17:20:37 -0700415 mov QWORD[r14*8+rdi],rax
416 mov rax,QWORD[8+r14*8+rsi]
417 lea r14,[1+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800418 dec r15
Adam Langleye9ada862015-05-11 17:20:37 -0700419 jnz NEAR $L$sub
Adam Langleyd9e397b2015-01-22 14:27:53 -0800420
; rax = top temporary word minus the final borrow; the selection below treats
; it as a mask that is either 0 or all-ones.
421 sbb rax,0
422 xor r14,r14
; Branch-free pointer select: rsi = (temporaries & mask) | (rdi & ~mask), i.e.
; the source is the temporaries when the mask is all-ones, else rdi itself.
Robert Sloana94fe052017-02-21 08:49:28 -0800423 and rsi,rax
424 not rax
425 mov rcx,rdi
426 and rcx,rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800427 mov r15,r9
Robert Sloana94fe052017-02-21 08:49:28 -0800428 or rsi,rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800429ALIGN 16
Adam Langleye9ada862015-05-11 17:20:37 -0700430$L$copy:
; Copy the selected source word to rdi and overwrite the stack temporary with
; the loop index, so the intermediate value does not stay on the stack.
Robert Sloana94fe052017-02-21 08:49:28 -0800431 mov rax,QWORD[r14*8+rsi]
Adam Langleye9ada862015-05-11 17:20:37 -0700432 mov QWORD[r14*8+rsp],r14
Robert Sloana94fe052017-02-21 08:49:28 -0800433 mov QWORD[r14*8+rdi],rax
Adam Langleye9ada862015-05-11 17:20:37 -0700434 lea r14,[1+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800435 sub r15,1
Adam Langleye9ada862015-05-11 17:20:37 -0700436 jnz NEAR $L$copy
Adam Langleyd9e397b2015-01-22 14:27:53 -0800437
; rsi = stack pointer value saved at function entry (stored above the temporaries).
Adam Langleye9ada862015-05-11 17:20:37 -0700438 mov rsi,QWORD[8+r9*8+rsp]
Robert Sloana94fe052017-02-21 08:49:28 -0800439
; Return value.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800440 mov rax,1
David Benjamin4969cc92016-04-22 15:02:23 -0400441
; Reload the six callee-saved registers from where the pushes left them,
; addressed relative to the entry stack pointer.
Adam Langleye9ada862015-05-11 17:20:37 -0700442 mov r15,QWORD[((-48))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800443
Adam Langleye9ada862015-05-11 17:20:37 -0700444 mov r14,QWORD[((-40))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800445
Adam Langleye9ada862015-05-11 17:20:37 -0700446 mov r13,QWORD[((-32))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800447
Adam Langleye9ada862015-05-11 17:20:37 -0700448 mov r12,QWORD[((-24))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800449
Adam Langleye9ada862015-05-11 17:20:37 -0700450 mov rbp,QWORD[((-16))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800451
Adam Langleye9ada862015-05-11 17:20:37 -0700452 mov rbx,QWORD[((-8))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800453
; Drop the whole frame in one step.
Adam Langleye9ada862015-05-11 17:20:37 -0700454 lea rsp,[rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800455
Adam Langleye9ada862015-05-11 17:20:37 -0700456$L$mul_epilogue:
; Restore the caller's rdi/rsi from the home slots written at entry.
457 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
458 mov rsi,QWORD[16+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800459 DB 0F3h,0C3h ;repret
Robert Sloana94fe052017-02-21 08:49:28 -0800460
Adam Langleye9ada862015-05-11 17:20:37 -0700461$L$SEH_end_bn_mul_mont_gather5:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800462
463ALIGN 32
; bn_mul4x_mont_gather5 -- wrapper around mul4x_internal (not exported with
; `global` in the visible part of the file). It takes the same seven arguments
; in the Microsoft x64 convention as bn_mul_mont_gather5; the latter also jumps
; straight to $L$mul4x_enter, with rax = entry rsp, when the word count is a
; multiple of 8. This routine saves registers, places and allocates the
; temporary frame, calls mul4x_internal and returns rax = 1.
Adam Langleye9ada862015-05-11 17:20:37 -0700464bn_mul4x_mont_gather5:
; rdi/rsi are callee-saved on Win64: park them in the caller-provided home slots.
465 mov QWORD[8+rsp],rdi ;WIN64 prologue
466 mov QWORD[16+rsp],rsi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800467 mov rax,rsp
Adam Langleye9ada862015-05-11 17:20:37 -0700468$L$SEH_begin_bn_mul4x_mont_gather5:
; Move the four register arguments into rdi, rsi, rdx, rcx and fetch the fifth
; and sixth arguments from the stack into r8 and r9.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800469 mov rdi,rcx
470 mov rsi,rdx
471 mov rdx,r8
472 mov rcx,r9
Adam Langleye9ada862015-05-11 17:20:37 -0700473 mov r8,QWORD[40+rsp]
474 mov r9,QWORD[48+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800475
476
Robert Sloana94fe052017-02-21 08:49:28 -0800477
; 0x67 is an address-size prefix byte emitted in front of the next instruction;
; presumably used only as padding -- TODO confirm.
Adam Langleye9ada862015-05-11 17:20:37 -0700478DB 0x67
; rax = stack pointer at entry; used later to find the saved registers and,
; inside mul4x_internal, the seventh argument at [56+rax].
Adam Langleyd9e397b2015-01-22 14:27:53 -0800479 mov rax,rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800480
481$L$mul4x_enter:
; Save the six callee-saved GPRs; they end up at [entry_rsp-8 .. entry_rsp-48].
Adam Langleyd9e397b2015-01-22 14:27:53 -0800482 push rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800483
Adam Langleyd9e397b2015-01-22 14:27:53 -0800484 push rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800485
Adam Langleyd9e397b2015-01-22 14:27:53 -0800486 push r12
Robert Sloana94fe052017-02-21 08:49:28 -0800487
Adam Langleyd9e397b2015-01-22 14:27:53 -0800488 push r13
Robert Sloana94fe052017-02-21 08:49:28 -0800489
Adam Langleyd9e397b2015-01-22 14:27:53 -0800490 push r14
Robert Sloana94fe052017-02-21 08:49:28 -0800491
Adam Langleyd9e397b2015-01-22 14:27:53 -0800492 push r15
David Benjamin4969cc92016-04-22 15:02:23 -0400493
Robert Sloana94fe052017-02-21 08:49:28 -0800494$L$mul4x_prologue:
495
Adam Langleye9ada862015-05-11 17:20:37 -0700496DB 0x67
; r9 = word count * 8 = size in bytes (the 32-bit shift zero-extends into r9);
; r10 = 3 * that size; r9 is then negated for the frame arithmetic below.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800497 shl r9d,3
David Benjamin4969cc92016-04-22 15:02:23 -0400498 lea r10,[r9*2+r9]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800499 neg r9
500
501
502
503
504
505
506
507
David Benjamin4969cc92016-04-22 15:02:23 -0400508
509
; Candidate frame = rsp - 320 - 2*size. r11 = (candidate - rdi) mod 4096, the
; frame's offset from the destination buffer within a 4096-byte page.
; NOTE(review): this looks intended to keep the frame from aliasing the rdi
; buffer modulo 4096 -- confirm against the generator's comments.
510 lea r11,[((-320))+r9*2+rsp]
Robert Sloana94fe052017-02-21 08:49:28 -0800511 mov rbp,rsp
David Benjamin4969cc92016-04-22 15:02:23 -0400512 sub r11,rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800513 and r11,4095
514 cmp r10,r11
Adam Langleye9ada862015-05-11 17:20:37 -0700515 jb NEAR $L$mul4xsp_alt
; 3*size >= offset: lower the frame by the offset as well.
Robert Sloana94fe052017-02-21 08:49:28 -0800516 sub rbp,r11
517 lea rbp,[((-320))+r9*2+rbp]
Adam Langleye9ada862015-05-11 17:20:37 -0700518 jmp NEAR $L$mul4xsp_done
Adam Langleyd9e397b2015-01-22 14:27:53 -0800519
520ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -0700521$L$mul4xsp_alt:
; 3*size < offset: lower the frame by max(0, offset - (4096 - 320 - 2*size)).
; `mov r10,0` is used rather than xor because it leaves the flags of the
; preceding sub intact for cmovc.
David Benjamin4969cc92016-04-22 15:02:23 -0400522 lea r10,[((4096-320))+r9*2]
Robert Sloana94fe052017-02-21 08:49:28 -0800523 lea rbp,[((-320))+r9*2+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800524 sub r11,r10
525 mov r10,0
526 cmovc r11,r10
Robert Sloana94fe052017-02-21 08:49:28 -0800527 sub rbp,r11
Adam Langleye9ada862015-05-11 17:20:37 -0700528$L$mul4xsp_done:
; Align the frame to 64 bytes, then move rsp down to it in steps of at most
; 4096 bytes, reading one qword at each step.
Robert Sloana94fe052017-02-21 08:49:28 -0800529 and rbp,-64
530 mov r11,rsp
531 sub r11,rbp
532 and r11,-4096
533 lea rsp,[rbp*1+r11]
534 mov r10,QWORD[rsp]
535 cmp rsp,rbp
536 ja NEAR $L$mul4x_page_walk
537 jmp NEAR $L$mul4x_page_walk_done
538
539$L$mul4x_page_walk:
540 lea rsp,[((-4096))+rsp]
541 mov r10,QWORD[rsp]
542 cmp rsp,rbp
543 ja NEAR $L$mul4x_page_walk
544$L$mul4x_page_walk_done:
545
; r9 is the positive byte size again.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800546 neg r9
547
; Save the entry stack pointer in the frame so it survives the call.
Adam Langleye9ada862015-05-11 17:20:37 -0700548 mov QWORD[40+rsp],rax
Robert Sloana94fe052017-02-21 08:49:28 -0800549
Adam Langleye9ada862015-05-11 17:20:37 -0700550$L$mul4x_body:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800551
; mul4x_internal takes its inputs in registers: rdi, rsi, rdx, rcx, r8,
; r9 (size in bytes) and rax (entry stack pointer).
552 call mul4x_internal
553
; rsi = entry stack pointer saved above.
Adam Langleye9ada862015-05-11 17:20:37 -0700554 mov rsi,QWORD[40+rsp]
Robert Sloana94fe052017-02-21 08:49:28 -0800555
; Return value.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800556 mov rax,1
David Benjamin4969cc92016-04-22 15:02:23 -0400557
; Reload the six callee-saved registers relative to the entry stack pointer.
Adam Langleye9ada862015-05-11 17:20:37 -0700558 mov r15,QWORD[((-48))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800559
Adam Langleye9ada862015-05-11 17:20:37 -0700560 mov r14,QWORD[((-40))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800561
Adam Langleye9ada862015-05-11 17:20:37 -0700562 mov r13,QWORD[((-32))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800563
Adam Langleye9ada862015-05-11 17:20:37 -0700564 mov r12,QWORD[((-24))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800565
Adam Langleye9ada862015-05-11 17:20:37 -0700566 mov rbp,QWORD[((-16))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800567
Adam Langleye9ada862015-05-11 17:20:37 -0700568 mov rbx,QWORD[((-8))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800569
; Drop the whole frame in one step.
Adam Langleye9ada862015-05-11 17:20:37 -0700570 lea rsp,[rsi]
Robert Sloana94fe052017-02-21 08:49:28 -0800571
Adam Langleye9ada862015-05-11 17:20:37 -0700572$L$mul4x_epilogue:
; Restore the caller's rdi/rsi from the home slots written at entry.
573 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
574 mov rsi,QWORD[16+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800575 DB 0F3h,0C3h ;repret
Robert Sloana94fe052017-02-21 08:49:28 -0800576
Adam Langleye9ada862015-05-11 17:20:37 -0700577$L$SEH_end_bn_mul4x_mont_gather5:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800578
579
580ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -0700581mul4x_internal:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800582 shl r9,5
David Benjamin4969cc92016-04-22 15:02:23 -0400583 movd xmm5,DWORD[56+rax]
584 lea rax,[$L$inc]
585 lea r13,[128+r9*1+rdx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800586 shr r9,5
David Benjamin4969cc92016-04-22 15:02:23 -0400587 movdqa xmm0,XMMWORD[rax]
588 movdqa xmm1,XMMWORD[16+rax]
589 lea r10,[((88-112))+r9*1+rsp]
590 lea r12,[128+rdx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800591
David Benjamin4969cc92016-04-22 15:02:23 -0400592 pshufd xmm5,xmm5,0
593 movdqa xmm4,xmm1
594DB 0x67,0x67
595 movdqa xmm2,xmm1
596 paddd xmm1,xmm0
597 pcmpeqd xmm0,xmm5
Adam Langleye9ada862015-05-11 17:20:37 -0700598DB 0x67
David Benjamin4969cc92016-04-22 15:02:23 -0400599 movdqa xmm3,xmm4
600 paddd xmm2,xmm1
601 pcmpeqd xmm1,xmm5
602 movdqa XMMWORD[112+r10],xmm0
603 movdqa xmm0,xmm4
604
605 paddd xmm3,xmm2
606 pcmpeqd xmm2,xmm5
607 movdqa XMMWORD[128+r10],xmm1
608 movdqa xmm1,xmm4
609
610 paddd xmm0,xmm3
611 pcmpeqd xmm3,xmm5
612 movdqa XMMWORD[144+r10],xmm2
613 movdqa xmm2,xmm4
614
615 paddd xmm1,xmm0
616 pcmpeqd xmm0,xmm5
617 movdqa XMMWORD[160+r10],xmm3
618 movdqa xmm3,xmm4
619 paddd xmm2,xmm1
620 pcmpeqd xmm1,xmm5
621 movdqa XMMWORD[176+r10],xmm0
622 movdqa xmm0,xmm4
623
624 paddd xmm3,xmm2
625 pcmpeqd xmm2,xmm5
626 movdqa XMMWORD[192+r10],xmm1
627 movdqa xmm1,xmm4
628
629 paddd xmm0,xmm3
630 pcmpeqd xmm3,xmm5
631 movdqa XMMWORD[208+r10],xmm2
632 movdqa xmm2,xmm4
633
634 paddd xmm1,xmm0
635 pcmpeqd xmm0,xmm5
636 movdqa XMMWORD[224+r10],xmm3
637 movdqa xmm3,xmm4
638 paddd xmm2,xmm1
639 pcmpeqd xmm1,xmm5
640 movdqa XMMWORD[240+r10],xmm0
641 movdqa xmm0,xmm4
642
643 paddd xmm3,xmm2
644 pcmpeqd xmm2,xmm5
645 movdqa XMMWORD[256+r10],xmm1
646 movdqa xmm1,xmm4
647
648 paddd xmm0,xmm3
649 pcmpeqd xmm3,xmm5
650 movdqa XMMWORD[272+r10],xmm2
651 movdqa xmm2,xmm4
652
653 paddd xmm1,xmm0
654 pcmpeqd xmm0,xmm5
655 movdqa XMMWORD[288+r10],xmm3
656 movdqa xmm3,xmm4
657 paddd xmm2,xmm1
658 pcmpeqd xmm1,xmm5
659 movdqa XMMWORD[304+r10],xmm0
660
661 paddd xmm3,xmm2
Adam Langleye9ada862015-05-11 17:20:37 -0700662DB 0x67
David Benjamin4969cc92016-04-22 15:02:23 -0400663 pcmpeqd xmm2,xmm5
664 movdqa XMMWORD[320+r10],xmm1
665
666 pcmpeqd xmm3,xmm5
667 movdqa XMMWORD[336+r10],xmm2
668 pand xmm0,XMMWORD[64+r12]
669
670 pand xmm1,XMMWORD[80+r12]
671 pand xmm2,XMMWORD[96+r12]
672 movdqa XMMWORD[352+r10],xmm3
673 pand xmm3,XMMWORD[112+r12]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800674 por xmm0,xmm2
David Benjamin4969cc92016-04-22 15:02:23 -0400675 por xmm1,xmm3
676 movdqa xmm4,XMMWORD[((-128))+r12]
677 movdqa xmm5,XMMWORD[((-112))+r12]
678 movdqa xmm2,XMMWORD[((-96))+r12]
679 pand xmm4,XMMWORD[112+r10]
680 movdqa xmm3,XMMWORD[((-80))+r12]
681 pand xmm5,XMMWORD[128+r10]
682 por xmm0,xmm4
683 pand xmm2,XMMWORD[144+r10]
684 por xmm1,xmm5
685 pand xmm3,XMMWORD[160+r10]
686 por xmm0,xmm2
687 por xmm1,xmm3
688 movdqa xmm4,XMMWORD[((-64))+r12]
689 movdqa xmm5,XMMWORD[((-48))+r12]
690 movdqa xmm2,XMMWORD[((-32))+r12]
691 pand xmm4,XMMWORD[176+r10]
692 movdqa xmm3,XMMWORD[((-16))+r12]
693 pand xmm5,XMMWORD[192+r10]
694 por xmm0,xmm4
695 pand xmm2,XMMWORD[208+r10]
696 por xmm1,xmm5
697 pand xmm3,XMMWORD[224+r10]
698 por xmm0,xmm2
699 por xmm1,xmm3
700 movdqa xmm4,XMMWORD[r12]
701 movdqa xmm5,XMMWORD[16+r12]
702 movdqa xmm2,XMMWORD[32+r12]
703 pand xmm4,XMMWORD[240+r10]
704 movdqa xmm3,XMMWORD[48+r12]
705 pand xmm5,XMMWORD[256+r10]
706 por xmm0,xmm4
707 pand xmm2,XMMWORD[272+r10]
708 por xmm1,xmm5
709 pand xmm3,XMMWORD[288+r10]
710 por xmm0,xmm2
711 por xmm1,xmm3
712 por xmm0,xmm1
713 pshufd xmm1,xmm0,0x4e
714 por xmm0,xmm1
715 lea r12,[256+r12]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800716DB 102,72,15,126,195
David Benjamin4969cc92016-04-22 15:02:23 -0400717
Adam Langleye9ada862015-05-11 17:20:37 -0700718 mov QWORD[((16+8))+rsp],r13
719 mov QWORD[((56+8))+rsp],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800720
Adam Langleye9ada862015-05-11 17:20:37 -0700721 mov r8,QWORD[r8]
722 mov rax,QWORD[rsi]
723 lea rsi,[r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800724 neg r9
725
726 mov rbp,r8
727 mul rbx
728 mov r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700729 mov rax,QWORD[rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800730
Adam Langleyd9e397b2015-01-22 14:27:53 -0800731 imul rbp,r10
David Benjamin4969cc92016-04-22 15:02:23 -0400732 lea r14,[((64+8))+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800733 mov r11,rdx
734
Adam Langleyd9e397b2015-01-22 14:27:53 -0800735 mul rbp
736 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700737 mov rax,QWORD[8+r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800738 adc rdx,0
739 mov rdi,rdx
740
741 mul rbx
742 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400743 mov rax,QWORD[8+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800744 adc rdx,0
745 mov r10,rdx
746
747 mul rbp
748 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700749 mov rax,QWORD[16+r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800750 adc rdx,0
751 add rdi,r11
Adam Langleye9ada862015-05-11 17:20:37 -0700752 lea r15,[32+r9]
David Benjamin4969cc92016-04-22 15:02:23 -0400753 lea rcx,[32+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800754 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700755 mov QWORD[r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800756 mov r13,rdx
Adam Langleye9ada862015-05-11 17:20:37 -0700757 jmp NEAR $L$1st4x
Adam Langleyd9e397b2015-01-22 14:27:53 -0800758
759ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -0700760$L$1st4x:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800761 mul rbx
762 add r10,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400763 mov rax,QWORD[((-16))+rcx]
Adam Langleye9ada862015-05-11 17:20:37 -0700764 lea r14,[32+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800765 adc rdx,0
766 mov r11,rdx
767
768 mul rbp
769 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700770 mov rax,QWORD[((-8))+r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800771 adc rdx,0
772 add r13,r10
773 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700774 mov QWORD[((-24))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800775 mov rdi,rdx
776
777 mul rbx
778 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400779 mov rax,QWORD[((-8))+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800780 adc rdx,0
781 mov r10,rdx
782
783 mul rbp
784 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700785 mov rax,QWORD[r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800786 adc rdx,0
787 add rdi,r11
788 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700789 mov QWORD[((-16))+r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800790 mov r13,rdx
791
792 mul rbx
793 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700794 mov rax,QWORD[rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800795 adc rdx,0
796 mov r11,rdx
797
798 mul rbp
799 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700800 mov rax,QWORD[8+r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800801 adc rdx,0
802 add r13,r10
803 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700804 mov QWORD[((-8))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800805 mov rdi,rdx
806
807 mul rbx
808 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400809 mov rax,QWORD[8+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800810 adc rdx,0
811 mov r10,rdx
812
813 mul rbp
814 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700815 mov rax,QWORD[16+r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800816 adc rdx,0
817 add rdi,r11
David Benjamin4969cc92016-04-22 15:02:23 -0400818 lea rcx,[32+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800819 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700820 mov QWORD[r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800821 mov r13,rdx
822
823 add r15,32
Adam Langleye9ada862015-05-11 17:20:37 -0700824 jnz NEAR $L$1st4x
Adam Langleyd9e397b2015-01-22 14:27:53 -0800825
826 mul rbx
827 add r10,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400828 mov rax,QWORD[((-16))+rcx]
Adam Langleye9ada862015-05-11 17:20:37 -0700829 lea r14,[32+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800830 adc rdx,0
831 mov r11,rdx
832
833 mul rbp
834 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700835 mov rax,QWORD[((-8))+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800836 adc rdx,0
837 add r13,r10
838 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700839 mov QWORD[((-24))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800840 mov rdi,rdx
841
842 mul rbx
843 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400844 mov rax,QWORD[((-8))+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800845 adc rdx,0
846 mov r10,rdx
847
848 mul rbp
849 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700850 mov rax,QWORD[r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800851 adc rdx,0
852 add rdi,r11
853 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700854 mov QWORD[((-16))+r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800855 mov r13,rdx
856
David Benjamin4969cc92016-04-22 15:02:23 -0400857 lea rcx,[r9*1+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800858
859 xor rdi,rdi
860 add r13,r10
861 adc rdi,0
Adam Langleye9ada862015-05-11 17:20:37 -0700862 mov QWORD[((-8))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800863
Adam Langleye9ada862015-05-11 17:20:37 -0700864 jmp NEAR $L$outer4x
Adam Langleyd9e397b2015-01-22 14:27:53 -0800865
866ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -0700867$L$outer4x:
David Benjamin4969cc92016-04-22 15:02:23 -0400868 lea rdx,[((16+128))+r14]
869 pxor xmm4,xmm4
870 pxor xmm5,xmm5
871 movdqa xmm0,XMMWORD[((-128))+r12]
872 movdqa xmm1,XMMWORD[((-112))+r12]
873 movdqa xmm2,XMMWORD[((-96))+r12]
874 movdqa xmm3,XMMWORD[((-80))+r12]
875 pand xmm0,XMMWORD[((-128))+rdx]
876 pand xmm1,XMMWORD[((-112))+rdx]
877 por xmm4,xmm0
878 pand xmm2,XMMWORD[((-96))+rdx]
879 por xmm5,xmm1
880 pand xmm3,XMMWORD[((-80))+rdx]
881 por xmm4,xmm2
882 por xmm5,xmm3
883 movdqa xmm0,XMMWORD[((-64))+r12]
884 movdqa xmm1,XMMWORD[((-48))+r12]
885 movdqa xmm2,XMMWORD[((-32))+r12]
886 movdqa xmm3,XMMWORD[((-16))+r12]
887 pand xmm0,XMMWORD[((-64))+rdx]
888 pand xmm1,XMMWORD[((-48))+rdx]
889 por xmm4,xmm0
890 pand xmm2,XMMWORD[((-32))+rdx]
891 por xmm5,xmm1
892 pand xmm3,XMMWORD[((-16))+rdx]
893 por xmm4,xmm2
894 por xmm5,xmm3
895 movdqa xmm0,XMMWORD[r12]
896 movdqa xmm1,XMMWORD[16+r12]
897 movdqa xmm2,XMMWORD[32+r12]
898 movdqa xmm3,XMMWORD[48+r12]
899 pand xmm0,XMMWORD[rdx]
900 pand xmm1,XMMWORD[16+rdx]
901 por xmm4,xmm0
902 pand xmm2,XMMWORD[32+rdx]
903 por xmm5,xmm1
904 pand xmm3,XMMWORD[48+rdx]
905 por xmm4,xmm2
906 por xmm5,xmm3
907 movdqa xmm0,XMMWORD[64+r12]
908 movdqa xmm1,XMMWORD[80+r12]
909 movdqa xmm2,XMMWORD[96+r12]
910 movdqa xmm3,XMMWORD[112+r12]
911 pand xmm0,XMMWORD[64+rdx]
912 pand xmm1,XMMWORD[80+rdx]
913 por xmm4,xmm0
914 pand xmm2,XMMWORD[96+rdx]
915 por xmm5,xmm1
916 pand xmm3,XMMWORD[112+rdx]
917 por xmm4,xmm2
918 por xmm5,xmm3
919 por xmm4,xmm5
920 pshufd xmm0,xmm4,0x4e
921 por xmm0,xmm4
922 lea r12,[256+r12]
923DB 102,72,15,126,195
924
Adam Langleye9ada862015-05-11 17:20:37 -0700925 mov r10,QWORD[r9*1+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800926 mov rbp,r8
927 mul rbx
928 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700929 mov rax,QWORD[rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800930 adc rdx,0
931
Adam Langleyd9e397b2015-01-22 14:27:53 -0800932 imul rbp,r10
Adam Langleyd9e397b2015-01-22 14:27:53 -0800933 mov r11,rdx
Adam Langleye9ada862015-05-11 17:20:37 -0700934 mov QWORD[r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800935
Adam Langleye9ada862015-05-11 17:20:37 -0700936 lea r14,[r9*1+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800937
938 mul rbp
939 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700940 mov rax,QWORD[8+r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800941 adc rdx,0
942 mov rdi,rdx
943
944 mul rbx
945 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400946 mov rax,QWORD[8+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800947 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700948 add r11,QWORD[8+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800949 adc rdx,0
950 mov r10,rdx
951
952 mul rbp
953 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700954 mov rax,QWORD[16+r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800955 adc rdx,0
956 add rdi,r11
Adam Langleye9ada862015-05-11 17:20:37 -0700957 lea r15,[32+r9]
David Benjamin4969cc92016-04-22 15:02:23 -0400958 lea rcx,[32+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800959 adc rdx,0
960 mov r13,rdx
Adam Langleye9ada862015-05-11 17:20:37 -0700961 jmp NEAR $L$inner4x
Adam Langleyd9e397b2015-01-22 14:27:53 -0800962
963ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -0700964$L$inner4x:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800965 mul rbx
966 add r10,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400967 mov rax,QWORD[((-16))+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800968 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700969 add r10,QWORD[16+r14]
970 lea r14,[32+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800971 adc rdx,0
972 mov r11,rdx
973
974 mul rbp
975 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700976 mov rax,QWORD[((-8))+r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800977 adc rdx,0
978 add r13,r10
979 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700980 mov QWORD[((-32))+r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -0800981 mov rdi,rdx
982
983 mul rbx
984 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -0400985 mov rax,QWORD[((-8))+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800986 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700987 add r11,QWORD[((-8))+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800988 adc rdx,0
989 mov r10,rdx
990
991 mul rbp
992 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -0700993 mov rax,QWORD[r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800994 adc rdx,0
995 add rdi,r11
996 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -0700997 mov QWORD[((-24))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800998 mov r13,rdx
999
1000 mul rbx
1001 add r10,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001002 mov rax,QWORD[rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001003 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001004 add r10,QWORD[r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001005 adc rdx,0
1006 mov r11,rdx
1007
1008 mul rbp
1009 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001010 mov rax,QWORD[8+r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001011 adc rdx,0
1012 add r13,r10
1013 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001014 mov QWORD[((-16))+r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -08001015 mov rdi,rdx
1016
1017 mul rbx
1018 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001019 mov rax,QWORD[8+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001020 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001021 add r11,QWORD[8+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001022 adc rdx,0
1023 mov r10,rdx
1024
1025 mul rbp
1026 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001027 mov rax,QWORD[16+r15*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001028 adc rdx,0
1029 add rdi,r11
David Benjamin4969cc92016-04-22 15:02:23 -04001030 lea rcx,[32+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001031 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001032 mov QWORD[((-8))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001033 mov r13,rdx
1034
1035 add r15,32
Adam Langleye9ada862015-05-11 17:20:37 -07001036 jnz NEAR $L$inner4x
Adam Langleyd9e397b2015-01-22 14:27:53 -08001037
1038 mul rbx
1039 add r10,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001040 mov rax,QWORD[((-16))+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001041 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001042 add r10,QWORD[16+r14]
1043 lea r14,[32+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001044 adc rdx,0
1045 mov r11,rdx
1046
1047 mul rbp
1048 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001049 mov rax,QWORD[((-8))+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001050 adc rdx,0
1051 add r13,r10
1052 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001053 mov QWORD[((-32))+r14],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -08001054 mov rdi,rdx
1055
1056 mul rbx
1057 add r11,rax
1058 mov rax,rbp
David Benjamin4969cc92016-04-22 15:02:23 -04001059 mov rbp,QWORD[((-8))+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001060 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001061 add r11,QWORD[((-8))+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001062 adc rdx,0
1063 mov r10,rdx
1064
1065 mul rbp
1066 add rdi,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001067 mov rax,QWORD[r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001068 adc rdx,0
1069 add rdi,r11
1070 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001071 mov QWORD[((-24))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001072 mov r13,rdx
1073
Adam Langleye9ada862015-05-11 17:20:37 -07001074 mov QWORD[((-16))+r14],rdi
David Benjamin4969cc92016-04-22 15:02:23 -04001075 lea rcx,[r9*1+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001076
1077 xor rdi,rdi
1078 add r13,r10
1079 adc rdi,0
Adam Langleye9ada862015-05-11 17:20:37 -07001080 add r13,QWORD[r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001081 adc rdi,0
Adam Langleye9ada862015-05-11 17:20:37 -07001082 mov QWORD[((-8))+r14],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001083
Adam Langleye9ada862015-05-11 17:20:37 -07001084 cmp r12,QWORD[((16+8))+rsp]
1085 jb NEAR $L$outer4x
David Benjamin4969cc92016-04-22 15:02:23 -04001086 xor rax,rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001087 sub rbp,r13
1088 adc r15,r15
1089 or rdi,r15
David Benjamin4969cc92016-04-22 15:02:23 -04001090 sub rax,rdi
Adam Langleye9ada862015-05-11 17:20:37 -07001091 lea rbx,[r9*1+r14]
David Benjamin4969cc92016-04-22 15:02:23 -04001092 mov r12,QWORD[rcx]
1093 lea rbp,[rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001094 mov rcx,r9
1095 sar rcx,3+2
Adam Langleye9ada862015-05-11 17:20:37 -07001096 mov rdi,QWORD[((56+8))+rsp]
David Benjamin4969cc92016-04-22 15:02:23 -04001097 dec r12
1098 xor r10,r10
1099 mov r13,QWORD[8+rbp]
1100 mov r14,QWORD[16+rbp]
1101 mov r15,QWORD[24+rbp]
1102 jmp NEAR $L$sqr4x_sub_entry
Adam Langleye9ada862015-05-11 17:20:37 -07001103
;-----------------------------------------------------------------------
; bn_power5 -- Win64 entry point.
;
; Calls the __bn_sqr8x_internal / __bn_post4x_internal pair five times
; and then mul4x_internal once, working in a scratch frame carved out of
; the stack. Always returns 1 in rax.
; NOTE(review): presumably this is five Montgomery squarings followed by
; one multiplication by a gathered table entry -- confirm against the
; callers and the helper routines, which are outside this block.
;
; ABI:   Microsoft x64 on entry. The first four arguments arrive in
;        rcx, rdx, r8, r9 and are moved to rdi, rsi, rdx, rcx; the fifth
;        and sixth are loaded from [40+rsp] and [48+rsp] into r8 and r9.
; In:    rdi = arg1, rsi = arg2, rdx = arg3, rcx = arg4 (after remap)
;        r8  = arg5, a pointer; one qword is loaded through it
;        r9  = arg6; its low 32 bits are multiplied by 8 below
;              (presumably a word count becoming a byte count -- confirm)
; Out:   rax = 1
; Saves: rdi/rsi (in the caller-provided slots at [8+rsp]/[16+rsp]),
;        rbx, rbp, r12-r15 (pushed), all restored before returning.
; Frame: [32+rsp] = qword loaded through arg5
;        [40+rsp] = rsp value at function entry (used by the epilogue)
;-----------------------------------------------------------------------
1104global bn_power5
Adam Langleyd9e397b2015-01-22 14:27:53 -08001105
1106ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07001107bn_power5:
; Save rdi/rsi in the slots above the return address; both are reloaded
; from the same slots at $L$power5_epilogue.
1108 mov QWORD[8+rsp],rdi ;WIN64 prologue
1109 mov QWORD[16+rsp],rsi
Adam Langleyd9e397b2015-01-22 14:27:53 -08001110 mov rax,rsp
Adam Langleye9ada862015-05-11 17:20:37 -07001111$L$SEH_begin_bn_power5:
; Remap the Win64 argument registers to rdi, rsi, rdx, rcx, r8, r9.
Adam Langleyd9e397b2015-01-22 14:27:53 -08001112 mov rdi,rcx
1113 mov rsi,rdx
1114 mov rdx,r8
1115 mov rcx,r9
Adam Langleye9ada862015-05-11 17:20:37 -07001116 mov r8,QWORD[40+rsp]
1117 mov r9,QWORD[48+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001118
1119
Robert Sloana94fe052017-02-21 08:49:28 -08001120
; rax = rsp at entry (before any push). It is stored at [40+rsp] of the
; new frame further down, and the epilogue unwinds from that value.
Adam Langleyd9e397b2015-01-22 14:27:53 -08001121 mov rax,rsp
Robert Sloana94fe052017-02-21 08:49:28 -08001122
; Six pushes: rbx lands at [rax-8], rbp at [rax-16], r12 at [rax-24],
; r13 at [rax-32], r14 at [rax-40], r15 at [rax-48].
Adam Langleyd9e397b2015-01-22 14:27:53 -08001123 push rbx
Robert Sloana94fe052017-02-21 08:49:28 -08001124
Adam Langleyd9e397b2015-01-22 14:27:53 -08001125 push rbp
Robert Sloana94fe052017-02-21 08:49:28 -08001126
Adam Langleyd9e397b2015-01-22 14:27:53 -08001127 push r12
Robert Sloana94fe052017-02-21 08:49:28 -08001128
Adam Langleyd9e397b2015-01-22 14:27:53 -08001129 push r13
Robert Sloana94fe052017-02-21 08:49:28 -08001130
Adam Langleyd9e397b2015-01-22 14:27:53 -08001131 push r14
Robert Sloana94fe052017-02-21 08:49:28 -08001132
Adam Langleyd9e397b2015-01-22 14:27:53 -08001133 push r15
David Benjamin4969cc92016-04-22 15:02:23 -04001134
Robert Sloana94fe052017-02-21 08:49:28 -08001135$L$power5_prologue:
1136
; r9  = 8 * (low 32 bits of arg6); the 32-bit write zero-extends.
; r10 = 3 * r9 (32-bit result, zero-extended).
; r9  = -r9 from here until the second `neg r9` below.
; r8  = qword loaded through the arg5 pointer.
Adam Langleyd9e397b2015-01-22 14:27:53 -08001137 shl r9d,3
David Benjamin4969cc92016-04-22 15:02:23 -04001138 lea r10d,[r9*2+r9]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001139 neg r9
Adam Langleye9ada862015-05-11 17:20:37 -07001140 mov r8,QWORD[r8]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001141
1142
1143
1144
1145
1146
1147
David Benjamin4969cc92016-04-22 15:02:23 -04001148
; Choose the frame base (rbp). With N = the byte count computed above:
;   r11 = (rsp - 320 - 2*N - rdi) mod 4096
; If 3*N (r10) is not below r11, the frame is lowered by a further r11
; bytes; otherwise $L$pwr_sp_alt applies a smaller adjustment.
; NOTE(review): this looks intended to keep the frame from sharing
; page offsets with the buffer at rdi -- confirm.
1149 lea r11,[((-320))+r9*2+rsp]
Robert Sloana94fe052017-02-21 08:49:28 -08001150 mov rbp,rsp
David Benjamin4969cc92016-04-22 15:02:23 -04001151 sub r11,rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -08001152 and r11,4095
1153 cmp r10,r11
Adam Langleye9ada862015-05-11 17:20:37 -07001154 jb NEAR $L$pwr_sp_alt
Robert Sloana94fe052017-02-21 08:49:28 -08001155 sub rbp,r11
1156 lea rbp,[((-320))+r9*2+rbp]
Adam Langleye9ada862015-05-11 17:20:37 -07001157 jmp NEAR $L$pwr_sp_done
Adam Langleyd9e397b2015-01-22 14:27:53 -08001158
1159ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07001160$L$pwr_sp_alt:
; Alternative placement:
;   rbp = rsp - 320 - 2*N - max(0, r11 - (4096 - 320 - 2*N))
; `mov r10,0` is used instead of an xor because mov leaves the carry
; flag from `sub r11,r10` intact for the cmovc that follows.
David Benjamin4969cc92016-04-22 15:02:23 -04001161 lea r10,[((4096-320))+r9*2]
Robert Sloana94fe052017-02-21 08:49:28 -08001162 lea rbp,[((-320))+r9*2+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001163 sub r11,r10
1164 mov r10,0
1165 cmovc r11,r10
Robert Sloana94fe052017-02-21 08:49:28 -08001166 sub rbp,r11
Adam Langleye9ada862015-05-11 17:20:37 -07001167$L$pwr_sp_done:
; Round the frame base down to a 64-byte boundary, then bring rsp down
; to it in 4096-byte steps, reading one qword at every step.
; The first step sets rsp = rbp + ((rsp - rbp) & -4096); the loop then
; subtracts 4096 while rsp is still above rbp (unsigned compare).
; NOTE(review): presumably each read touches a stack page in order so
; the guard page is never skipped -- confirm.
Robert Sloana94fe052017-02-21 08:49:28 -08001168 and rbp,-64
1169 mov r11,rsp
1170 sub r11,rbp
1171 and r11,-4096
1172 lea rsp,[rbp*1+r11]
1173 mov r10,QWORD[rsp]
1174 cmp rsp,rbp
1175 ja NEAR $L$pwr_page_walk
1176 jmp NEAR $L$pwr_page_walk_done
1177
1178$L$pwr_page_walk:
1179 lea rsp,[((-4096))+rsp]
1180 mov r10,QWORD[rsp]
1181 cmp rsp,rbp
1182 ja NEAR $L$pwr_page_walk
1183$L$pwr_page_walk_done:
1184
; r10 = -N (copy of the negated byte count); r9 = +N again.
Adam Langleyd9e397b2015-01-22 14:27:53 -08001185 mov r10,r9
1186 neg r9
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
; Frame slots: [32+rsp] = qword loaded through arg5,
;              [40+rsp] = rsp at function entry.
Adam Langleye9ada862015-05-11 17:20:37 -07001197 mov QWORD[32+rsp],r8
1198 mov QWORD[40+rsp],rax
Robert Sloana94fe052017-02-21 08:49:28 -08001199
Adam Langleye9ada862015-05-11 17:20:37 -07001200$L$power5_body:
; Hand-encoded MOVQ xmm,r64 (66 REX.W 0F 6E /r), parking four values in
; XMM registers across the helper calls:
;   xmm1 = rdi, xmm2 = rcx, xmm3 = r10, xmm4 = rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001201DB 102,72,15,110,207
1202DB 102,72,15,110,209
1203DB 102,73,15,110,218
1204DB 102,72,15,110,226
1205
; Five identical rounds of __bn_sqr8x_internal followed by
; __bn_post4x_internal. Both helpers are defined outside this block, so
; their register contracts cannot be checked from here.
1206 call __bn_sqr8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04001207 call __bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08001208 call __bn_sqr8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04001209 call __bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08001210 call __bn_sqr8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04001211 call __bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08001212 call __bn_sqr8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04001213 call __bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08001214 call __bn_sqr8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04001215 call __bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08001216
; Hand-encoded MOVQ r64,xmm (66 REX.W 0F 7E /r): rcx = xmm2, rdx = xmm4,
; i.e. the arg4 and arg3 values parked at $L$power5_body.
; Set up mul4x_internal: rdi = rsi, rax = saved entry rsp,
; r8 = address of the [32+rsp] slot.
1217DB 102,72,15,126,209
1218DB 102,72,15,126,226
1219 mov rdi,rsi
Adam Langleye9ada862015-05-11 17:20:37 -07001220 mov rax,QWORD[40+rsp]
1221 lea r8,[32+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001222
1223 call mul4x_internal
1224
; Unwind: rsi = rsp at entry, rax = 1 (return value). Reload the six
; pushed registers from just below that address, then restore rsp.
Adam Langleye9ada862015-05-11 17:20:37 -07001225 mov rsi,QWORD[40+rsp]
Robert Sloana94fe052017-02-21 08:49:28 -08001226
Adam Langleyd9e397b2015-01-22 14:27:53 -08001227 mov rax,1
Adam Langleye9ada862015-05-11 17:20:37 -07001228 mov r15,QWORD[((-48))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08001229
Adam Langleye9ada862015-05-11 17:20:37 -07001230 mov r14,QWORD[((-40))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08001231
Adam Langleye9ada862015-05-11 17:20:37 -07001232 mov r13,QWORD[((-32))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08001233
Adam Langleye9ada862015-05-11 17:20:37 -07001234 mov r12,QWORD[((-24))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08001235
Adam Langleye9ada862015-05-11 17:20:37 -07001236 mov rbp,QWORD[((-16))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08001237
Adam Langleye9ada862015-05-11 17:20:37 -07001238 mov rbx,QWORD[((-8))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08001239
Adam Langleye9ada862015-05-11 17:20:37 -07001240 lea rsp,[rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08001241
Adam Langleye9ada862015-05-11 17:20:37 -07001242$L$power5_epilogue:
; Reload rdi/rsi from the slots written at entry. The final DB pair is
; the two bytes F3 C3.
1243 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
1244 mov rsi,QWORD[16+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001245 DB 0F3h,0C3h ;repret
Robert Sloana94fe052017-02-21 08:49:28 -08001246
Adam Langleye9ada862015-05-11 17:20:37 -07001247$L$SEH_end_bn_power5:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001248
Adam Langleye9ada862015-05-11 17:20:37 -07001249global bn_sqr8x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08001250
1251
1252ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07001253bn_sqr8x_internal:
1254__bn_sqr8x_internal:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
Adam Langleye9ada862015-05-11 17:20:37 -07001328 lea rbp,[32+r10]
1329 lea rsi,[r9*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001330
1331 mov rcx,r9
1332
1333
Adam Langleye9ada862015-05-11 17:20:37 -07001334 mov r14,QWORD[((-32))+rbp*1+rsi]
1335 lea rdi,[((48+8))+r9*2+rsp]
1336 mov rax,QWORD[((-24))+rbp*1+rsi]
1337 lea rdi,[((-32))+rbp*1+rdi]
1338 mov rbx,QWORD[((-16))+rbp*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001339 mov r15,rax
1340
1341 mul r14
1342 mov r10,rax
1343 mov rax,rbx
1344 mov r11,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07001345 mov QWORD[((-24))+rbp*1+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001346
1347 mul r14
1348 add r11,rax
1349 mov rax,rbx
1350 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001351 mov QWORD[((-16))+rbp*1+rdi],r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001352 mov r10,rdx
1353
1354
Adam Langleye9ada862015-05-11 17:20:37 -07001355 mov rbx,QWORD[((-8))+rbp*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001356 mul r15
1357 mov r12,rax
1358 mov rax,rbx
1359 mov r13,rdx
1360
Adam Langleye9ada862015-05-11 17:20:37 -07001361 lea rcx,[rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001362 mul r14
1363 add r10,rax
1364 mov rax,rbx
1365 mov r11,rdx
1366 adc r11,0
1367 add r10,r12
1368 adc r11,0
Adam Langleye9ada862015-05-11 17:20:37 -07001369 mov QWORD[((-8))+rcx*1+rdi],r10
1370 jmp NEAR $L$sqr4x_1st
Adam Langleyd9e397b2015-01-22 14:27:53 -08001371
1372ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07001373$L$sqr4x_1st:
1374 mov rbx,QWORD[rcx*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001375 mul r15
1376 add r13,rax
1377 mov rax,rbx
1378 mov r12,rdx
1379 adc r12,0
1380
1381 mul r14
1382 add r11,rax
1383 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001384 mov rbx,QWORD[8+rcx*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001385 mov r10,rdx
1386 adc r10,0
1387 add r11,r13
1388 adc r10,0
1389
1390
1391 mul r15
1392 add r12,rax
1393 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001394 mov QWORD[rcx*1+rdi],r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001395 mov r13,rdx
1396 adc r13,0
1397
1398 mul r14
1399 add r10,rax
1400 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001401 mov rbx,QWORD[16+rcx*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001402 mov r11,rdx
1403 adc r11,0
1404 add r10,r12
1405 adc r11,0
1406
1407 mul r15
1408 add r13,rax
1409 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001410 mov QWORD[8+rcx*1+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001411 mov r12,rdx
1412 adc r12,0
1413
1414 mul r14
1415 add r11,rax
1416 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001417 mov rbx,QWORD[24+rcx*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001418 mov r10,rdx
1419 adc r10,0
1420 add r11,r13
1421 adc r10,0
1422
1423
1424 mul r15
1425 add r12,rax
1426 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001427 mov QWORD[16+rcx*1+rdi],r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001428 mov r13,rdx
1429 adc r13,0
Adam Langleye9ada862015-05-11 17:20:37 -07001430 lea rcx,[32+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001431
1432 mul r14
1433 add r10,rax
1434 mov rax,rbx
1435 mov r11,rdx
1436 adc r11,0
1437 add r10,r12
1438 adc r11,0
Adam Langleye9ada862015-05-11 17:20:37 -07001439 mov QWORD[((-8))+rcx*1+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001440
1441 cmp rcx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001442 jne NEAR $L$sqr4x_1st
Adam Langleyd9e397b2015-01-22 14:27:53 -08001443
1444 mul r15
1445 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001446 lea rbp,[16+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001447 adc rdx,0
1448 add r13,r11
1449 adc rdx,0
1450
Adam Langleye9ada862015-05-11 17:20:37 -07001451 mov QWORD[rdi],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001452 mov r12,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07001453 mov QWORD[8+rdi],rdx
1454 jmp NEAR $L$sqr4x_outer
Adam Langleyd9e397b2015-01-22 14:27:53 -08001455
1456ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07001457$L$sqr4x_outer:
1458 mov r14,QWORD[((-32))+rbp*1+rsi]
1459 lea rdi,[((48+8))+r9*2+rsp]
1460 mov rax,QWORD[((-24))+rbp*1+rsi]
1461 lea rdi,[((-32))+rbp*1+rdi]
1462 mov rbx,QWORD[((-16))+rbp*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001463 mov r15,rax
1464
1465 mul r14
Adam Langleye9ada862015-05-11 17:20:37 -07001466 mov r10,QWORD[((-24))+rbp*1+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001467 add r10,rax
1468 mov rax,rbx
1469 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001470 mov QWORD[((-24))+rbp*1+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001471 mov r11,rdx
1472
1473 mul r14
1474 add r11,rax
1475 mov rax,rbx
1476 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001477 add r11,QWORD[((-16))+rbp*1+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001478 mov r10,rdx
1479 adc r10,0
Adam Langleye9ada862015-05-11 17:20:37 -07001480 mov QWORD[((-16))+rbp*1+rdi],r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001481
1482 xor r12,r12
1483
Adam Langleye9ada862015-05-11 17:20:37 -07001484 mov rbx,QWORD[((-8))+rbp*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001485 mul r15
1486 add r12,rax
1487 mov rax,rbx
1488 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001489 add r12,QWORD[((-8))+rbp*1+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001490 mov r13,rdx
1491 adc r13,0
1492
1493 mul r14
1494 add r10,rax
1495 mov rax,rbx
1496 adc rdx,0
1497 add r10,r12
1498 mov r11,rdx
1499 adc r11,0
Adam Langleye9ada862015-05-11 17:20:37 -07001500 mov QWORD[((-8))+rbp*1+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001501
Adam Langleye9ada862015-05-11 17:20:37 -07001502 lea rcx,[rbp]
1503 jmp NEAR $L$sqr4x_inner
Adam Langleyd9e397b2015-01-22 14:27:53 -08001504
1505ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07001506$L$sqr4x_inner:
1507 mov rbx,QWORD[rcx*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001508 mul r15
1509 add r13,rax
1510 mov rax,rbx
1511 mov r12,rdx
1512 adc r12,0
Adam Langleye9ada862015-05-11 17:20:37 -07001513 add r13,QWORD[rcx*1+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001514 adc r12,0
1515
Adam Langleye9ada862015-05-11 17:20:37 -07001516DB 0x67
Adam Langleyd9e397b2015-01-22 14:27:53 -08001517 mul r14
1518 add r11,rax
1519 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001520 mov rbx,QWORD[8+rcx*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001521 mov r10,rdx
1522 adc r10,0
1523 add r11,r13
1524 adc r10,0
1525
1526 mul r15
1527 add r12,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001528 mov QWORD[rcx*1+rdi],r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001529 mov rax,rbx
1530 mov r13,rdx
1531 adc r13,0
Adam Langleye9ada862015-05-11 17:20:37 -07001532 add r12,QWORD[8+rcx*1+rdi]
1533 lea rcx,[16+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001534 adc r13,0
1535
1536 mul r14
1537 add r10,rax
1538 mov rax,rbx
1539 adc rdx,0
1540 add r10,r12
1541 mov r11,rdx
1542 adc r11,0
Adam Langleye9ada862015-05-11 17:20:37 -07001543 mov QWORD[((-8))+rcx*1+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001544
1545 cmp rcx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001546 jne NEAR $L$sqr4x_inner
Adam Langleyd9e397b2015-01-22 14:27:53 -08001547
Adam Langleye9ada862015-05-11 17:20:37 -07001548DB 0x67
Adam Langleyd9e397b2015-01-22 14:27:53 -08001549 mul r15
1550 add r13,rax
1551 adc rdx,0
1552 add r13,r11
1553 adc rdx,0
1554
Adam Langleye9ada862015-05-11 17:20:37 -07001555 mov QWORD[rdi],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001556 mov r12,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07001557 mov QWORD[8+rdi],rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001558
1559 add rbp,16
Adam Langleye9ada862015-05-11 17:20:37 -07001560 jnz NEAR $L$sqr4x_outer
Adam Langleyd9e397b2015-01-22 14:27:53 -08001561
1562
Adam Langleye9ada862015-05-11 17:20:37 -07001563 mov r14,QWORD[((-32))+rsi]
1564 lea rdi,[((48+8))+r9*2+rsp]
1565 mov rax,QWORD[((-24))+rsi]
1566 lea rdi,[((-32))+rbp*1+rdi]
1567 mov rbx,QWORD[((-16))+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001568 mov r15,rax
1569
1570 mul r14
1571 add r10,rax
1572 mov rax,rbx
1573 mov r11,rdx
1574 adc r11,0
1575
1576 mul r14
1577 add r11,rax
1578 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001579 mov QWORD[((-24))+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001580 mov r10,rdx
1581 adc r10,0
1582 add r11,r13
Adam Langleye9ada862015-05-11 17:20:37 -07001583 mov rbx,QWORD[((-8))+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001584 adc r10,0
1585
1586 mul r15
1587 add r12,rax
1588 mov rax,rbx
Adam Langleye9ada862015-05-11 17:20:37 -07001589 mov QWORD[((-16))+rdi],r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001590 mov r13,rdx
1591 adc r13,0
1592
1593 mul r14
1594 add r10,rax
1595 mov rax,rbx
1596 mov r11,rdx
1597 adc r11,0
1598 add r10,r12
1599 adc r11,0
Adam Langleye9ada862015-05-11 17:20:37 -07001600 mov QWORD[((-8))+rdi],r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08001601
1602 mul r15
1603 add r13,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001604 mov rax,QWORD[((-16))+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001605 adc rdx,0
1606 add r13,r11
1607 adc rdx,0
1608
Adam Langleye9ada862015-05-11 17:20:37 -07001609 mov QWORD[rdi],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001610 mov r12,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07001611 mov QWORD[8+rdi],rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001612
1613 mul rbx
1614 add rbp,16
1615 xor r14,r14
1616 sub rbp,r9
1617 xor r15,r15
1618
1619 add rax,r12
1620 adc rdx,0
Adam Langleye9ada862015-05-11 17:20:37 -07001621 mov QWORD[8+rdi],rax
1622 mov QWORD[16+rdi],rdx
1623 mov QWORD[24+rdi],r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001624
Adam Langleye9ada862015-05-11 17:20:37 -07001625 mov rax,QWORD[((-16))+rbp*1+rsi]
1626 lea rdi,[((48+8))+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001627 xor r10,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001628 mov r11,QWORD[8+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001629
Adam Langleye9ada862015-05-11 17:20:37 -07001630 lea r12,[r10*2+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001631 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001632 lea r13,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001633 shr r11,63
1634 or r13,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001635 mov r10,QWORD[16+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001636 mov r14,r11
1637 mul rax
1638 neg r15
Adam Langleye9ada862015-05-11 17:20:37 -07001639 mov r11,QWORD[24+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001640 adc r12,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001641 mov rax,QWORD[((-8))+rbp*1+rsi]
1642 mov QWORD[rdi],r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08001643 adc r13,rdx
1644
Adam Langleye9ada862015-05-11 17:20:37 -07001645 lea rbx,[r10*2+r14]
1646 mov QWORD[8+rdi],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001647 sbb r15,r15
1648 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001649 lea r8,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001650 shr r11,63
1651 or r8,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001652 mov r10,QWORD[32+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001653 mov r14,r11
1654 mul rax
1655 neg r15
Adam Langleye9ada862015-05-11 17:20:37 -07001656 mov r11,QWORD[40+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001657 adc rbx,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001658 mov rax,QWORD[rbp*1+rsi]
1659 mov QWORD[16+rdi],rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001660 adc r8,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07001661 lea rbp,[16+rbp]
1662 mov QWORD[24+rdi],r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08001663 sbb r15,r15
Adam Langleye9ada862015-05-11 17:20:37 -07001664 lea rdi,[64+rdi]
1665 jmp NEAR $L$sqr4x_shift_n_add
Adam Langleyd9e397b2015-01-22 14:27:53 -08001666
1667ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07001668$L$sqr4x_shift_n_add:
1669 lea r12,[r10*2+r14]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001670 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001671 lea r13,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001672 shr r11,63
1673 or r13,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001674 mov r10,QWORD[((-16))+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001675 mov r14,r11
1676 mul rax
1677 neg r15
Adam Langleye9ada862015-05-11 17:20:37 -07001678 mov r11,QWORD[((-8))+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001679 adc r12,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001680 mov rax,QWORD[((-8))+rbp*1+rsi]
1681 mov QWORD[((-32))+rdi],r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08001682 adc r13,rdx
1683
Adam Langleye9ada862015-05-11 17:20:37 -07001684 lea rbx,[r10*2+r14]
1685 mov QWORD[((-24))+rdi],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001686 sbb r15,r15
1687 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001688 lea r8,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001689 shr r11,63
1690 or r8,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001691 mov r10,QWORD[rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001692 mov r14,r11
1693 mul rax
1694 neg r15
Adam Langleye9ada862015-05-11 17:20:37 -07001695 mov r11,QWORD[8+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001696 adc rbx,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001697 mov rax,QWORD[rbp*1+rsi]
1698 mov QWORD[((-16))+rdi],rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001699 adc r8,rdx
1700
Adam Langleye9ada862015-05-11 17:20:37 -07001701 lea r12,[r10*2+r14]
1702 mov QWORD[((-8))+rdi],r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08001703 sbb r15,r15
1704 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001705 lea r13,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001706 shr r11,63
1707 or r13,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001708 mov r10,QWORD[16+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001709 mov r14,r11
1710 mul rax
1711 neg r15
Adam Langleye9ada862015-05-11 17:20:37 -07001712 mov r11,QWORD[24+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001713 adc r12,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001714 mov rax,QWORD[8+rbp*1+rsi]
1715 mov QWORD[rdi],r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08001716 adc r13,rdx
1717
Adam Langleye9ada862015-05-11 17:20:37 -07001718 lea rbx,[r10*2+r14]
1719 mov QWORD[8+rdi],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001720 sbb r15,r15
1721 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001722 lea r8,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001723 shr r11,63
1724 or r8,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001725 mov r10,QWORD[32+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001726 mov r14,r11
1727 mul rax
1728 neg r15
Adam Langleye9ada862015-05-11 17:20:37 -07001729 mov r11,QWORD[40+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001730 adc rbx,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001731 mov rax,QWORD[16+rbp*1+rsi]
1732 mov QWORD[16+rdi],rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001733 adc r8,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07001734 mov QWORD[24+rdi],r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08001735 sbb r15,r15
Adam Langleye9ada862015-05-11 17:20:37 -07001736 lea rdi,[64+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001737 add rbp,32
Adam Langleye9ada862015-05-11 17:20:37 -07001738 jnz NEAR $L$sqr4x_shift_n_add
Adam Langleyd9e397b2015-01-22 14:27:53 -08001739
Adam Langleye9ada862015-05-11 17:20:37 -07001740 lea r12,[r10*2+r14]
1741DB 0x67
Adam Langleyd9e397b2015-01-22 14:27:53 -08001742 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001743 lea r13,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001744 shr r11,63
1745 or r13,r10
Adam Langleye9ada862015-05-11 17:20:37 -07001746 mov r10,QWORD[((-16))+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001747 mov r14,r11
1748 mul rax
1749 neg r15
Adam Langleye9ada862015-05-11 17:20:37 -07001750 mov r11,QWORD[((-8))+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001751 adc r12,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001752 mov rax,QWORD[((-8))+rsi]
1753 mov QWORD[((-32))+rdi],r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08001754 adc r13,rdx
1755
Adam Langleye9ada862015-05-11 17:20:37 -07001756 lea rbx,[r10*2+r14]
1757 mov QWORD[((-24))+rdi],r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001758 sbb r15,r15
1759 shr r10,63
Adam Langleye9ada862015-05-11 17:20:37 -07001760 lea r8,[r11*2+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001761 shr r11,63
1762 or r8,r10
1763 mul rax
1764 neg r15
1765 adc rbx,rax
1766 adc r8,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07001767 mov QWORD[((-16))+rdi],rbx
1768 mov QWORD[((-8))+rdi],r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08001769DB 102,72,15,126,213
;-----------------------------------------------------------------------
; __bn_sqr8x_reduction
;
; Word-by-word Montgomery reduction of the double-width temporary held in
; the caller's stack frame, handled in windows of eight 64-bit limbs.
; Reached by fall-through from the code directly above and by `call`
; from bn_from_mont8x.
;
; Inputs, as read by the code below:
;   rbp          -> modulus limbs
;   r9           =  operand length in bytes (negated on entry here)
;   xmm2, xmm3   =  values reloaded into rbp / r9 at the end of every
;                   window (bn_from_mont8x puts the modulus pointer and
;                   the negated byte length there)
;   [rsp+32+8]   =  multiplier applied to the low limb of each window
;                   (bn_from_mont8x stores the word loaded from its
;                   fifth argument in that slot)
;   [rsp+48+8].. =  the 2*r9-byte temporary, plus one spare word after it
; All frame offsets carry an explicit +8; presumably that skips the
; return address pushed by the call -- TODO confirm.
;
; Outputs: rax = carry out of the last window (0 or 1), rdi = cursor just
; past the last window written, rcx = limb at [rbp-8] of the modulus
; cursor, rsi = 0, rbp/r9 reloaded from xmm2/xmm3.
; Scratch slots written: [rsp+8], [rsp+16], [rsp+56]..[rsp+112].
;-----------------------------------------------------------------------
David Benjamin4969cc92016-04-22 15:02:23 -04001770__bn_sqr8x_reduction:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001771 xor rax,rax
; rcx = rbp + r9: address just past the modulus; saved at [rsp+8].
David Benjamin4969cc92016-04-22 15:02:23 -04001772 lea rcx,[rbp*1+r9]
; rdx = address just past the 2*r9-byte temporary; saved at [rsp+16].
Adam Langleye9ada862015-05-11 17:20:37 -07001773 lea rdx,[((48+8))+r9*2+rsp]
1774 mov QWORD[((0+8))+rsp],rcx
; rdi = start of temporary + r9; the loop head adds the negated r9 back.
1775 lea rdi,[((48+8))+r9*1+rsp]
1776 mov QWORD[((8+8))+rsp],rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001777 neg r9
Adam Langleye9ada862015-05-11 17:20:37 -07001778 jmp NEAR $L$8x_reduction_loop
Adam Langleyd9e397b2015-01-22 14:27:53 -08001779
1780ALIGN 32
; Outer loop: one pass per 8-limb window of the temporary.
Adam Langleye9ada862015-05-11 17:20:37 -07001781$L$8x_reduction_loop:
; r9 is negative here, so this rewinds rdi to the start of the window.
1782 lea rdi,[r9*1+rdi]
; NOTE(review): lone 0x66 prefix glued onto the next instruction; looks
; like padding for instruction alignment -- confirm.
1783DB 0x66
; Load the window into rbx(copy of limb 0), r9..r15. r9 is clobbered
; here and restored from xmm3 at the bottom of the loop.
1784 mov rbx,QWORD[rdi]
1785 mov r9,QWORD[8+rdi]
1786 mov r10,QWORD[16+rdi]
1787 mov r11,QWORD[24+rdi]
1788 mov r12,QWORD[32+rdi]
1789 mov r13,QWORD[40+rdi]
1790 mov r14,QWORD[48+rdi]
1791 mov r15,QWORD[56+rdi]
; Park the carry of the previous window in the spare word at *rdx; it is
; added back in at $L$8x_tail_done.
1792 mov QWORD[rdx],rax
1793 lea rdi,[64+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001794
; NOTE(review): lone 0x67 prefix, same remark as the 0x66 above.
Adam Langleye9ada862015-05-11 17:20:37 -07001795DB 0x67
Adam Langleyd9e397b2015-01-22 14:27:53 -08001796 mov r8,rbx
; rbx = m = low limb * multiplier stored in the frame.
Adam Langleye9ada862015-05-11 17:20:37 -07001797 imul rbx,QWORD[((32+8))+rsp]
1798 mov rax,QWORD[rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001799 mov ecx,8
Adam Langleye9ada862015-05-11 17:20:37 -07001800 jmp NEAR $L$8x_reduce
Adam Langleyd9e397b2015-01-22 14:27:53 -08001801
1802ALIGN 32
; Inner loop (8 rounds, counted down in ecx): add m * (first eight
; modulus limbs) into r8..r15 and slide the window down by one limb.
; rbx = m for this round, rax = modulus limb 0 on entry to each round.
Adam Langleye9ada862015-05-11 17:20:37 -07001803$L$8x_reduce:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001804 mul rbx
David Benjamin4969cc92016-04-22 15:02:23 -04001805 mov rax,QWORD[8+rbp]
; neg is used only for its carry (CF = 1 iff r8 != 0); the low product
; word is never added explicitly. This relies on m having been chosen so
; that the low limb of the sum cancels.
Adam Langleyd9e397b2015-01-22 14:27:53 -08001806 neg r8
1807 mov r8,rdx
1808 adc r8,0
1809
1810 mul rbx
1811 add r9,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001812 mov rax,QWORD[16+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001813 adc rdx,0
1814 add r8,r9
; Remember this round's m for the tail passes; rcx runs 8..1, so the
; slots are [rsp+112] down to [rsp+56].
Adam Langleye9ada862015-05-11 17:20:37 -07001815 mov QWORD[((48-8+8))+rcx*8+rsp],rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001816 mov r9,rdx
1817 adc r9,0
1818
1819 mul rbx
1820 add r10,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001821 mov rax,QWORD[24+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001822 adc rdx,0
1823 add r9,r10
; Start computing the next round's m: rsi = multiplier, multiplied by the
; new low limb (r8) a few instructions further down.
Adam Langleye9ada862015-05-11 17:20:37 -07001824 mov rsi,QWORD[((32+8))+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001825 mov r10,rdx
1826 adc r10,0
1827
1828 mul rbx
1829 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001830 mov rax,QWORD[32+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001831 adc rdx,0
1832 imul rsi,r8
1833 add r10,r11
1834 mov r11,rdx
1835 adc r11,0
1836
1837 mul rbx
1838 add r12,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001839 mov rax,QWORD[40+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001840 adc rdx,0
1841 add r11,r12
1842 mov r12,rdx
1843 adc r12,0
1844
1845 mul rbx
1846 add r13,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001847 mov rax,QWORD[48+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001848 adc rdx,0
1849 add r12,r13
1850 mov r13,rdx
1851 adc r13,0
1852
1853 mul rbx
1854 add r14,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001855 mov rax,QWORD[56+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001856 adc rdx,0
1857 add r13,r14
1858 mov r14,rdx
1859 adc r14,0
1860
1861 mul rbx
; Switch to the m computed for the next round.
1862 mov rbx,rsi
1863 add r15,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001864 mov rax,QWORD[rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001865 adc rdx,0
1866 add r14,r15
1867 mov r15,rdx
1868 adc r15,0
1869
1870 dec ecx
Adam Langleye9ada862015-05-11 17:20:37 -07001871 jnz NEAR $L$8x_reduce
Adam Langleyd9e397b2015-01-22 14:27:53 -08001872
; Advance to the next eight modulus limbs; if none are left, skip the tail.
; The jae below is taken with CF = 0, which $L$8x_no_tail relies on.
David Benjamin4969cc92016-04-22 15:02:23 -04001873 lea rbp,[64+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001874 xor rax,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001875 mov rdx,QWORD[((8+8))+rsp]
1876 cmp rbp,QWORD[((0+8))+rsp]
1877 jae NEAR $L$8x_no_tail
Adam Langleyd9e397b2015-01-22 14:27:53 -08001878
; NOTE(review): lone 0x66 prefix, presumably alignment padding -- confirm.
Adam Langleye9ada862015-05-11 17:20:37 -07001879DB 0x66
; Fold the next eight limbs of the temporary into the accumulators and
; keep the resulting carry as a 0/-1 mask in rsi.
1880 add r8,QWORD[rdi]
1881 adc r9,QWORD[8+rdi]
1882 adc r10,QWORD[16+rdi]
1883 adc r11,QWORD[24+rdi]
1884 adc r12,QWORD[32+rdi]
1885 adc r13,QWORD[40+rdi]
1886 adc r14,QWORD[48+rdi]
1887 adc r15,QWORD[56+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001888 sbb rsi,rsi
1889
; rbx = the m saved in the first reduce round (slot for rcx = 8).
Adam Langleye9ada862015-05-11 17:20:37 -07001890 mov rbx,QWORD[((48+56+8))+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001891 mov ecx,8
Adam Langleye9ada862015-05-11 17:20:37 -07001892 mov rax,QWORD[rbp]
1893 jmp NEAR $L$8x_tail
Adam Langleyd9e397b2015-01-22 14:27:53 -08001894
1895ALIGN 32
; Tail loop (8 rounds per pass): multiply the current eight modulus limbs
; by each saved m in turn, store the finished low limb to [rdi] and
; advance rdi by one limb per round.
Adam Langleye9ada862015-05-11 17:20:37 -07001896$L$8x_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001897 mul rbx
1898 add r8,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001899 mov rax,QWORD[8+rbp]
Adam Langleye9ada862015-05-11 17:20:37 -07001900 mov QWORD[rdi],r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08001901 mov r8,rdx
1902 adc r8,0
1903
1904 mul rbx
1905 add r9,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001906 mov rax,QWORD[16+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001907 adc rdx,0
1908 add r8,r9
Adam Langleye9ada862015-05-11 17:20:37 -07001909 lea rdi,[8+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001910 mov r9,rdx
1911 adc r9,0
1912
1913 mul rbx
1914 add r10,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001915 mov rax,QWORD[24+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001916 adc rdx,0
1917 add r9,r10
1918 mov r10,rdx
1919 adc r10,0
1920
1921 mul rbx
1922 add r11,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001923 mov rax,QWORD[32+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001924 adc rdx,0
1925 add r10,r11
1926 mov r11,rdx
1927 adc r11,0
1928
1929 mul rbx
1930 add r12,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001931 mov rax,QWORD[40+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001932 adc rdx,0
1933 add r11,r12
1934 mov r12,rdx
1935 adc r12,0
1936
1937 mul rbx
1938 add r13,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001939 mov rax,QWORD[48+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001940 adc rdx,0
1941 add r12,r13
1942 mov r13,rdx
1943 adc r13,0
1944
1945 mul rbx
1946 add r14,rax
David Benjamin4969cc92016-04-22 15:02:23 -04001947 mov rax,QWORD[56+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001948 adc rdx,0
1949 add r13,r14
1950 mov r14,rdx
1951 adc r14,0
1952
1953 mul rbx
; Fetch the m saved for the following round (one slot below the current
; one; rcx has not been decremented yet).
Adam Langleye9ada862015-05-11 17:20:37 -07001954 mov rbx,QWORD[((48-16+8))+rcx*8+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001955 add r15,rax
1956 adc rdx,0
1957 add r14,r15
Adam Langleye9ada862015-05-11 17:20:37 -07001958 mov rax,QWORD[rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001959 mov r15,rdx
1960 adc r15,0
1961
1962 dec ecx
Adam Langleye9ada862015-05-11 17:20:37 -07001963 jnz NEAR $L$8x_tail
Adam Langleyd9e397b2015-01-22 14:27:53 -08001964
; Next group of eight modulus limbs, or finish the window when rbp has
; reached the end address saved at [rsp+8].
David Benjamin4969cc92016-04-22 15:02:23 -04001965 lea rbp,[64+rbp]
Adam Langleye9ada862015-05-11 17:20:37 -07001966 mov rdx,QWORD[((8+8))+rsp]
1967 cmp rbp,QWORD[((0+8))+rsp]
1968 jae NEAR $L$8x_tail_done
Adam Langleyd9e397b2015-01-22 14:27:53 -08001969
Adam Langleye9ada862015-05-11 17:20:37 -07001970 mov rbx,QWORD[((48+56+8))+rsp]
; rsi is 0 or -1, so neg turns the saved mask back into CF for the adc
; chain below; sbb then captures the new carry as a mask again.
Adam Langleyd9e397b2015-01-22 14:27:53 -08001971 neg rsi
Adam Langleye9ada862015-05-11 17:20:37 -07001972 mov rax,QWORD[rbp]
1973 adc r8,QWORD[rdi]
1974 adc r9,QWORD[8+rdi]
1975 adc r10,QWORD[16+rdi]
1976 adc r11,QWORD[24+rdi]
1977 adc r12,QWORD[32+rdi]
1978 adc r13,QWORD[40+rdi]
1979 adc r14,QWORD[48+rdi]
1980 adc r15,QWORD[56+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001981 sbb rsi,rsi
1982
1983 mov ecx,8
Adam Langleye9ada862015-05-11 17:20:37 -07001984 jmp NEAR $L$8x_tail
Adam Langleyd9e397b2015-01-22 14:27:53 -08001985
1986ALIGN 32
; Tail finished: add the carry parked at *rdx by the loop head, collect
; any overflow in rax, then restore the saved tail carry into CF.
Adam Langleye9ada862015-05-11 17:20:37 -07001987$L$8x_tail_done:
Robert Sloan4d1ac502017-02-06 08:36:14 -08001988 xor rax,rax
Adam Langleye9ada862015-05-11 17:20:37 -07001989 add r8,QWORD[rdx]
Adam Langley4139edb2016-01-13 15:00:54 -08001990 adc r9,0
1991 adc r10,0
1992 adc r11,0
1993 adc r12,0
1994 adc r13,0
1995 adc r14,0
1996 adc r15,0
Robert Sloan4d1ac502017-02-06 08:36:14 -08001997 adc rax,0
Adam Langleyd9e397b2015-01-22 14:27:53 -08001998
1999 neg rsi
; Entered with CF = saved carry (fall-through) or CF = 0 (jae above).
; Adds the eight limbs at rdi into the accumulators; rax picks up the
; carry out of this window.
Adam Langleye9ada862015-05-11 17:20:37 -07002000$L$8x_no_tail:
2001 adc r8,QWORD[rdi]
2002 adc r9,QWORD[8+rdi]
2003 adc r10,QWORD[16+rdi]
2004 adc r11,QWORD[24+rdi]
2005 adc r12,QWORD[32+rdi]
2006 adc r13,QWORD[40+rdi]
2007 adc r14,QWORD[48+rdi]
2008 adc r15,QWORD[56+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002009 adc rax,0
; rcx = modulus limb just below the current cursor; not used again in
; this routine.
David Benjamin4969cc92016-04-22 15:02:23 -04002010 mov rcx,QWORD[((-8))+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002011 xor rsi,rsi
2012
; Encoded bytes 66 48 0F 7E D5 = movq rbp,xmm2 (rewind modulus pointer).
2013DB 102,72,15,126,213
2014
Adam Langleye9ada862015-05-11 17:20:37 -07002015 mov QWORD[rdi],r8
2016 mov QWORD[8+rdi],r9
; Encoded bytes 66 49 0F 7E D9 = movq r9,xmm3 (restore r9 once its
; accumulator value has been stored above).
Adam Langleyd9e397b2015-01-22 14:27:53 -08002017DB 102,73,15,126,217
Adam Langleye9ada862015-05-11 17:20:37 -07002018 mov QWORD[16+rdi],r10
2019 mov QWORD[24+rdi],r11
2020 mov QWORD[32+rdi],r12
2021 mov QWORD[40+rdi],r13
2022 mov QWORD[48+rdi],r14
2023 mov QWORD[56+rdi],r15
2024 lea rdi,[64+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002025
; Loop until the write cursor reaches the end of the temporary (rdx).
2026 cmp rdi,rdx
Adam Langleye9ada862015-05-11 17:20:37 -07002027 jb NEAR $L$8x_reduction_loop
David Benjamin4969cc92016-04-22 15:02:23 -04002028 DB 0F3h,0C3h ;repret
Adam Langleyd9e397b2015-01-22 14:27:53 -08002029
Adam Langleyd9e397b2015-01-22 14:27:53 -08002030
2031ALIGN 32
;-----------------------------------------------------------------------
; __bn_post4x_internal
;
; Branch-free conditional subtraction of the modulus, four limbs per
; iteration:
;     out[i] = src[i] + (~n[i] & mask) + carry
; The first modulus limb is decremented before being inverted, so with an
; all-ones mask the addend is the two's complement of the modulus
; (src - n); with a zero mask the source is copied unchanged.
;
; Inputs, as read by the code below:
;   rax   selector, negated into the mask. On the bn_from_mont8x path it
;         is the 0/1 carry left by __bn_sqr8x_reduction.
;   rbp   -> modulus limbs
;   rdi   source end cursor; source start is rdi + r9
;   r9    byte length, expected negative here (the loop counts rcx up to
;         zero) -- it is on the bn_from_mont8x path
;   xmm1  output pointer (copied into both rdi and rsi)
; Outputs: rdi just past the output, rsi = output pointer,
;          r10 = r9 as it was on entry, r9 negated.
; Clobbers: rax, rbx, rcx, rbp, r12-r15, flags.
;-----------------------------------------------------------------------
David Benjamin4969cc92016-04-22 15:02:23 -04002032__bn_post4x_internal:
2033 mov r12,QWORD[rbp]
; rbx = source cursor (rdi + r9).
2034 lea rbx,[r9*1+rdi]
2035 mov rcx,r9
; Encoded bytes 66 48 0F 7E CF = movq rdi,xmm1 (output pointer).
2036DB 102,72,15,126,207
; rax becomes the AND mask.
2037 neg rax
; Encoded bytes 66 48 0F 7E CE = movq rsi,xmm1.
2038DB 102,72,15,126,206
; Byte length / 32 = number of 4-limb groups; the arithmetic shift keeps
; the sign so that inc/jnz below terminates at zero.
2039 sar rcx,3+2
; First limb only: (n[0] - 1) inverted below gives -n[0], which supplies
; the "+1" of the two's complement.
2040 dec r12
; r10 carries CF between iterations as a 0/-1 mask; start with no carry.
2041 xor r10,r10
2042 mov r13,QWORD[8+rbp]
2043 mov r14,QWORD[16+rbp]
2044 mov r15,QWORD[24+rbp]
2045 jmp NEAR $L$sqr4x_sub_entry
2046
2047ALIGN 16
Adam Langleye9ada862015-05-11 17:20:37 -07002048$L$sqr4x_sub:
David Benjamin4969cc92016-04-22 15:02:23 -04002049 mov r12,QWORD[rbp]
2050 mov r13,QWORD[8+rbp]
2051 mov r14,QWORD[16+rbp]
2052 mov r15,QWORD[24+rbp]
2053$L$sqr4x_sub_entry:
2054 lea rbp,[32+rbp]
2055 not r12
2056 not r13
2057 not r14
2058 not r15
2059 and r12,rax
2060 and r13,rax
2061 and r14,rax
2062 and r15,rax
2063
; Reload the carry saved in r10 into CF, add four source limbs, then
; save the new carry with sbb. The lea/mov in between leave flags alone.
2064 neg r10
2065 adc r12,QWORD[rbx]
2066 adc r13,QWORD[8+rbx]
2067 adc r14,QWORD[16+rbx]
2068 adc r15,QWORD[24+rbx]
Adam Langleye9ada862015-05-11 17:20:37 -07002069 mov QWORD[rdi],r12
David Benjamin4969cc92016-04-22 15:02:23 -04002070 lea rbx,[32+rbx]
Adam Langleye9ada862015-05-11 17:20:37 -07002071 mov QWORD[8+rdi],r13
David Benjamin4969cc92016-04-22 15:02:23 -04002072 sbb r10,r10
Adam Langleye9ada862015-05-11 17:20:37 -07002073 mov QWORD[16+rdi],r14
2074 mov QWORD[24+rdi],r15
2075 lea rdi,[32+rdi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002076
2077 inc rcx
Adam Langleye9ada862015-05-11 17:20:37 -07002078 jnz NEAR $L$sqr4x_sub
David Benjamin4969cc92016-04-22 15:02:23 -04002079
; Hand back both signs of the length: r10 = entry value, r9 = its negation.
Adam Langleyd9e397b2015-01-22 14:27:53 -08002080 mov r10,r9
2081 neg r9
2082 DB 0F3h,0C3h ;repret
Adam Langleye9ada862015-05-11 17:20:37 -07002083
2084global bn_from_montgomery
Adam Langleyd9e397b2015-01-22 14:27:53 -08002085
2086ALIGN 32
;-----------------------------------------------------------------------
; bn_from_montgomery  (exported, entered with the Win64 register/stack
; layout untouched)
;
; Dispatcher. Tests the low three bits of the 32-bit value at [rsp+48],
; the same stack slot bn_from_mont8x loads into r9 as its sixth
; argument. If they are all zero it tail-jumps to bn_from_mont8x, which
; returns to this function's caller. Otherwise it returns 0 in eax
; without touching any other state.
;-----------------------------------------------------------------------
Adam Langleye9ada862015-05-11 17:20:37 -07002087bn_from_montgomery:
2088 test DWORD[48+rsp],7
2089 jz NEAR bn_from_mont8x
Adam Langleyd9e397b2015-01-22 14:27:53 -08002090 xor eax,eax
2091 DB 0F3h,0C3h ;repret
Adam Langleye9ada862015-05-11 17:20:37 -07002092
Adam Langleyd9e397b2015-01-22 14:27:53 -08002093
2094
2095ALIGN 32
;-----------------------------------------------------------------------
; bn_from_mont8x
;
; Worker behind bn_from_montgomery. Copies the source operand into the
; low half of a zero-extended double-width stack temporary (a
; multiplication by one), runs __bn_sqr8x_reduction and
; __bn_post4x_internal over it, wipes the temporary and returns 1.
;
; Win64 arguments are first moved into the register assignment the body
; uses:
;   rdi (arg 1) output pointer (passed on in xmm1)
;   rsi (arg 2) source limbs
;   rdx (arg 3) not referenced in this function
;   rcx (arg 4) modulus pointer (passed on in rbp / xmm2)
;   r8  (arg 5) pointer to the per-window multiplier word
;   r9  (arg 6) limb count; only the low 32 bits are used. The copy loop
;               below moves 64 bytes per step, so it must be a non-zero
;               multiple of 8 (bn_from_montgomery checks the multiple).
;
; Frame after allocation: [rsp+32] = multiplier word,
; [rsp+40] = rsp at entry, [rsp+48].. = the temporary. mul_handler reads
; the [rsp+40] slot when unwinding through this function.
;-----------------------------------------------------------------------
Adam Langleye9ada862015-05-11 17:20:37 -07002096bn_from_mont8x:
2097 mov QWORD[8+rsp],rdi ;WIN64 prologue
2098 mov QWORD[16+rsp],rsi
Adam Langleyd9e397b2015-01-22 14:27:53 -08002099 mov rax,rsp
Adam Langleye9ada862015-05-11 17:20:37 -07002100$L$SEH_begin_bn_from_mont8x:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002101 mov rdi,rcx
2102 mov rsi,rdx
2103 mov rdx,r8
2104 mov rcx,r9
Adam Langleye9ada862015-05-11 17:20:37 -07002105 mov r8,QWORD[40+rsp]
2106 mov r9,QWORD[48+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002107
2108
Robert Sloana94fe052017-02-21 08:49:28 -08002109
; NOTE(review): lone 0x67 prefix glued onto the next instruction; looks
; like alignment padding -- confirm.
Adam Langleye9ada862015-05-11 17:20:37 -07002110DB 0x67
; rax = rsp at entry; the saved registers end up at [rax-8]..[rax-48].
Adam Langleyd9e397b2015-01-22 14:27:53 -08002111 mov rax,rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002112
Adam Langleyd9e397b2015-01-22 14:27:53 -08002113 push rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002114
Adam Langleyd9e397b2015-01-22 14:27:53 -08002115 push rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002116
Adam Langleyd9e397b2015-01-22 14:27:53 -08002117 push r12
Robert Sloana94fe052017-02-21 08:49:28 -08002118
Adam Langleyd9e397b2015-01-22 14:27:53 -08002119 push r13
Robert Sloana94fe052017-02-21 08:49:28 -08002120
Adam Langleyd9e397b2015-01-22 14:27:53 -08002121 push r14
Robert Sloana94fe052017-02-21 08:49:28 -08002122
Adam Langleyd9e397b2015-01-22 14:27:53 -08002123 push r15
David Benjamin4969cc92016-04-22 15:02:23 -04002124
Robert Sloana94fe052017-02-21 08:49:28 -08002125$L$from_prologue:
2126
; r9 = byte length (count * 8, zero-extended by the 32-bit shift),
; r10 = 3 * byte length, then r9 is negated. r8 is replaced by the word
; it points to.
Adam Langleyd9e397b2015-01-22 14:27:53 -08002127 shl r9d,3
David Benjamin4969cc92016-04-22 15:02:23 -04002128 lea r10,[r9*2+r9]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002129 neg r9
Adam Langleye9ada862015-05-11 17:20:37 -07002130 mov r8,QWORD[r8]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002131
2132
2133
2134
2135
2136
2137
David Benjamin4969cc92016-04-22 15:02:23 -04002138
; Pick the frame base (rbp). r11 = ((rsp - 320 - 2*len) - rdi) mod 4096,
; i.e. the distance between the candidate frame and the output pointer
; within a 4 KiB page; both branches slide the frame down by an amount
; derived from it.
; NOTE(review): this looks like the usual arrangement to keep the frame
; from aliasing the caller's buffers modulo 4096 -- confirm against the
; perlasm source.
2139 lea r11,[((-320))+r9*2+rsp]
Robert Sloana94fe052017-02-21 08:49:28 -08002140 mov rbp,rsp
David Benjamin4969cc92016-04-22 15:02:23 -04002141 sub r11,rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -08002142 and r11,4095
2143 cmp r10,r11
Adam Langleye9ada862015-05-11 17:20:37 -07002144 jb NEAR $L$from_sp_alt
Robert Sloana94fe052017-02-21 08:49:28 -08002145 sub rbp,r11
2146 lea rbp,[((-320))+r9*2+rbp]
Adam Langleye9ada862015-05-11 17:20:37 -07002147 jmp NEAR $L$from_sp_done
Adam Langleyd9e397b2015-01-22 14:27:53 -08002148
2149ALIGN 32
Adam Langleye9ada862015-05-11 17:20:37 -07002150$L$from_sp_alt:
David Benjamin4969cc92016-04-22 15:02:23 -04002151 lea r10,[((4096-320))+r9*2]
Robert Sloana94fe052017-02-21 08:49:28 -08002152 lea rbp,[((-320))+r9*2+rbp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002153 sub r11,r10
; mov (not xor) so the borrow from the sub above is still in CF for
; cmovc: the extra adjustment is clamped to zero when it went negative.
2154 mov r10,0
2155 cmovc r11,r10
Robert Sloana94fe052017-02-21 08:49:28 -08002156 sub rbp,r11
Adam Langleye9ada862015-05-11 17:20:37 -07002157$L$from_sp_done:
; Align the frame to 64 bytes, then move rsp down to it one 4 KiB page at
; a time, reading a word from each page on the way.
; NOTE(review): presumably stack probing so guard pages are touched in
; order -- confirm.
Robert Sloana94fe052017-02-21 08:49:28 -08002158 and rbp,-64
2159 mov r11,rsp
2160 sub r11,rbp
2161 and r11,-4096
2162 lea rsp,[rbp*1+r11]
2163 mov r10,QWORD[rsp]
2164 cmp rsp,rbp
2165 ja NEAR $L$from_page_walk
2166 jmp NEAR $L$from_page_walk_done
2167
2168$L$from_page_walk:
2169 lea rsp,[((-4096))+rsp]
2170 mov r10,QWORD[rsp]
2171 cmp rsp,rbp
2172 ja NEAR $L$from_page_walk
2173$L$from_page_walk_done:
2174
; r10 = negated byte length, r9 = positive byte length.
Adam Langleyd9e397b2015-01-22 14:27:53 -08002175 mov r10,r9
2176 neg r9
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
; Frame bookkeeping: multiplier word and the entry stack pointer.
Adam Langleye9ada862015-05-11 17:20:37 -07002187 mov QWORD[32+rsp],r8
2188 mov QWORD[40+rsp],rax
Robert Sloana94fe052017-02-21 08:49:28 -08002189
Adam Langleye9ada862015-05-11 17:20:37 -07002190$L$from_body:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002191 mov r11,r9
Adam Langleye9ada862015-05-11 17:20:37 -07002192 lea rax,[48+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002193 pxor xmm0,xmm0
Adam Langleye9ada862015-05-11 17:20:37 -07002194 jmp NEAR $L$mul_by_1
Adam Langleyd9e397b2015-01-22 14:27:53 -08002195
2196ALIGN 32
; Per iteration: copy 64 source bytes to the low half of the temporary
; (at rax) and zero the matching 64 bytes of the high half (at rax + r9).
; r11 counts the remaining bytes down in steps of 64.
Adam Langleye9ada862015-05-11 17:20:37 -07002197$L$mul_by_1:
2198 movdqu xmm1,XMMWORD[rsi]
2199 movdqu xmm2,XMMWORD[16+rsi]
2200 movdqu xmm3,XMMWORD[32+rsi]
2201 movdqa XMMWORD[r9*1+rax],xmm0
2202 movdqu xmm4,XMMWORD[48+rsi]
2203 movdqa XMMWORD[16+r9*1+rax],xmm0
; Encoded bytes 48 8D B6 40 00 00 00 = lea rsi,[rsi+64] with a 32-bit
; displacement.
2204DB 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
2205 movdqa XMMWORD[rax],xmm1
2206 movdqa XMMWORD[32+r9*1+rax],xmm0
2207 movdqa XMMWORD[16+rax],xmm2
2208 movdqa XMMWORD[48+r9*1+rax],xmm0
2209 movdqa XMMWORD[32+rax],xmm3
2210 movdqa XMMWORD[48+rax],xmm4
2211 lea rax,[64+rax]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002212 sub r11,64
Adam Langleye9ada862015-05-11 17:20:37 -07002213 jnz NEAR $L$mul_by_1
Adam Langleyd9e397b2015-01-22 14:27:53 -08002214
; Stash values the helpers reload: the three DB lines encode
; movq xmm1,rdi / movq xmm2,rcx / movq xmm3,r10 (output pointer, modulus
; pointer, negated byte length).
2215DB 102,72,15,110,207
2216DB 102,72,15,110,209
; NOTE(review): lone 0x67 prefix, presumably alignment padding -- confirm.
Adam Langleye9ada862015-05-11 17:20:37 -07002217DB 0x67
Adam Langleyd9e397b2015-01-22 14:27:53 -08002218 mov rbp,rcx
2219DB 102,73,15,110,218
David Benjamin4969cc92016-04-22 15:02:23 -04002220 call __bn_sqr8x_reduction
2221 call __bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002222
2223 pxor xmm0,xmm0
Adam Langleye9ada862015-05-11 17:20:37 -07002224 lea rax,[48+rsp]
Adam Langleye9ada862015-05-11 17:20:37 -07002225 jmp NEAR $L$from_mont_zero
Adam Langleyd9e397b2015-01-22 14:27:53 -08002226
2227ALIGN 32
; Wipe the temporary: 64 bytes are cleared per iteration while r9 drops
; by 32, so 2 * byte-length bytes are zeroed in total. rsi is reloaded
; with the saved entry rsp on every pass.
Adam Langleye9ada862015-05-11 17:20:37 -07002228$L$from_mont_zero:
Robert Sloana94fe052017-02-21 08:49:28 -08002229 mov rsi,QWORD[40+rsp]
2230
Adam Langleye9ada862015-05-11 17:20:37 -07002231 movdqa XMMWORD[rax],xmm0
2232 movdqa XMMWORD[16+rax],xmm0
2233 movdqa XMMWORD[32+rax],xmm0
2234 movdqa XMMWORD[48+rax],xmm0
2235 lea rax,[64+rax]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002236 sub r9,32
Adam Langleye9ada862015-05-11 17:20:37 -07002237 jnz NEAR $L$from_mont_zero
Adam Langleyd9e397b2015-01-22 14:27:53 -08002238
; Return value 1; restore the six pushed registers relative to the entry
; stack pointer and switch back to it.
2239 mov rax,1
Adam Langleye9ada862015-05-11 17:20:37 -07002240 mov r15,QWORD[((-48))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08002241
Adam Langleye9ada862015-05-11 17:20:37 -07002242 mov r14,QWORD[((-40))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08002243
Adam Langleye9ada862015-05-11 17:20:37 -07002244 mov r13,QWORD[((-32))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08002245
Adam Langleye9ada862015-05-11 17:20:37 -07002246 mov r12,QWORD[((-24))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08002247
Adam Langleye9ada862015-05-11 17:20:37 -07002248 mov rbp,QWORD[((-16))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08002249
Adam Langleye9ada862015-05-11 17:20:37 -07002250 mov rbx,QWORD[((-8))+rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08002251
Adam Langleye9ada862015-05-11 17:20:37 -07002252 lea rsp,[rsi]
Robert Sloana94fe052017-02-21 08:49:28 -08002253
Adam Langleye9ada862015-05-11 17:20:37 -07002254$L$from_epilogue:
2255 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2256 mov rsi,QWORD[16+rsp]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002257 DB 0F3h,0C3h ;repret
Robert Sloana94fe052017-02-21 08:49:28 -08002258
Adam Langleye9ada862015-05-11 17:20:37 -07002259$L$SEH_end_bn_from_mont8x:
2260global bn_scatter5
Adam Langleyd9e397b2015-01-22 14:27:53 -08002261
2262ALIGN 16
;-----------------------------------------------------------------------
; bn_scatter5  (exported leaf; uses the Win64 argument registers as-is)
;
;   rcx  source: sequence of 64-bit words
;   edx  number of words to store; zero returns immediately
;   r8   table base
;   r9   slot index, scaled by 8 bytes
;
; Stores source word i at r8 + r9*8 + i*256, i.e. one qword into the
; chosen slot of each consecutive 256-byte row.
; Clobbers rax, rcx, edx, r8, flags. No stack use.
;-----------------------------------------------------------------------
Adam Langleye9ada862015-05-11 17:20:37 -07002263bn_scatter5:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002264 cmp edx,0
Adam Langleye9ada862015-05-11 17:20:37 -07002265 jz NEAR $L$scatter_epilogue
; r8 = address of the selected slot in the first row.
2266 lea r8,[r9*8+r8]
2267$L$scatter:
2268 mov rax,QWORD[rcx]
2269 lea rcx,[8+rcx]
2270 mov QWORD[r8],rax
; Same slot, next 256-byte row.
2271 lea r8,[256+r8]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002272 sub edx,1
Adam Langleye9ada862015-05-11 17:20:37 -07002273 jnz NEAR $L$scatter
2274$L$scatter_epilogue:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002275 DB 0F3h,0C3h ;repret
Adam Langleyd9e397b2015-01-22 14:27:53 -08002276
Adam Langleye9ada862015-05-11 17:20:37 -07002277
2278global bn_gather5
Adam Langleyd9e397b2015-01-22 14:27:53 -08002279
David Benjamin4969cc92016-04-22 15:02:23 -04002280ALIGN 32
;-----------------------------------------------------------------------
; bn_gather5  (exported; uses the Win64 argument registers as-is)
;
;   rcx  output: receives one 64-bit word per row
;   edx  number of rows
;   r8   table base; rows are 256 bytes and are read with movdqa, so the
;        table must be 16-byte aligned
;   r9d  slot index (compared against the counters 0..31)
;
; Inverse of bn_scatter5. Sixteen 128-bit masks (two qword lanes each)
; are built on the stack by comparing running counters with the index.
; Every row is then read in full, ANDed with the masks and ORed together,
; so the addresses touched do not depend on the index.
;
; NOTE(review): the loop is bottom-tested with no zero check, so edx = 0
; would not return promptly -- callers must pass edx >= 1.
;
; Stack: 0x108 bytes below the entry rsp, realigned to 16; r10 keeps the
; entry rsp and acts as the frame register named in the unwind data.
; Clobbers rax, rcx, edx, r10, r11, xmm0-xmm5, flags.
;-----------------------------------------------------------------------
Adam Langleye9ada862015-05-11 17:20:37 -07002281bn_gather5:
2282$L$SEH_begin_bn_gather5:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002283
; Prologue is emitted as raw bytes so its layout is fixed for the unwind
; codes in $L$SEH_info_bn_gather5:
;   4C 8D 14 24             lea r10,[rsp]
;   48 81 EC 08 01 00 00    sub rsp,0x108
David Benjamin4969cc92016-04-22 15:02:23 -04002284DB 0x4c,0x8d,0x14,0x24
2285DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00
2286 lea rax,[$L$inc]
2287 and rsp,-16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002288
; xmm5 = index broadcast to all four dwords (after the pshufd below),
; xmm0 = {0,0,1,1}, xmm1 = {2,2,2,2} (the step, also kept in xmm4).
; r11 and rax are biased by +128 so that all 256 bytes of a row / of the
; mask area are reachable with one-byte displacements.
David Benjamin4969cc92016-04-22 15:02:23 -04002289 movd xmm5,r9d
2290 movdqa xmm0,XMMWORD[rax]
2291 movdqa xmm1,XMMWORD[16+rax]
2292 lea r11,[128+r8]
2293 lea rax,[128+rsp]
2294
2295 pshufd xmm5,xmm5,0
2296 movdqa xmm4,xmm1
2297 movdqa xmm2,xmm1
; From here on the pattern repeats: next counter = previous + step,
; compare the previous counter with the index, store the resulting mask.
; Counter k has the form {2k,2k,2k+1,2k+1}, so each mask covers two
; 64-bit slots.
2298 paddd xmm1,xmm0
2299 pcmpeqd xmm0,xmm5
2300 movdqa xmm3,xmm4
2301
2302 paddd xmm2,xmm1
2303 pcmpeqd xmm1,xmm5
2304 movdqa XMMWORD[(-128)+rax],xmm0
2305 movdqa xmm0,xmm4
2306
2307 paddd xmm3,xmm2
2308 pcmpeqd xmm2,xmm5
2309 movdqa XMMWORD[(-112)+rax],xmm1
2310 movdqa xmm1,xmm4
2311
2312 paddd xmm0,xmm3
2313 pcmpeqd xmm3,xmm5
2314 movdqa XMMWORD[(-96)+rax],xmm2
2315 movdqa xmm2,xmm4
2316 paddd xmm1,xmm0
2317 pcmpeqd xmm0,xmm5
2318 movdqa XMMWORD[(-80)+rax],xmm3
2319 movdqa xmm3,xmm4
2320
2321 paddd xmm2,xmm1
2322 pcmpeqd xmm1,xmm5
2323 movdqa XMMWORD[(-64)+rax],xmm0
2324 movdqa xmm0,xmm4
2325
2326 paddd xmm3,xmm2
2327 pcmpeqd xmm2,xmm5
2328 movdqa XMMWORD[(-48)+rax],xmm1
2329 movdqa xmm1,xmm4
2330
2331 paddd xmm0,xmm3
2332 pcmpeqd xmm3,xmm5
2333 movdqa XMMWORD[(-32)+rax],xmm2
2334 movdqa xmm2,xmm4
2335 paddd xmm1,xmm0
2336 pcmpeqd xmm0,xmm5
2337 movdqa XMMWORD[(-16)+rax],xmm3
2338 movdqa xmm3,xmm4
2339
2340 paddd xmm2,xmm1
2341 pcmpeqd xmm1,xmm5
2342 movdqa XMMWORD[rax],xmm0
2343 movdqa xmm0,xmm4
2344
2345 paddd xmm3,xmm2
2346 pcmpeqd xmm2,xmm5
2347 movdqa XMMWORD[16+rax],xmm1
2348 movdqa xmm1,xmm4
2349
2350 paddd xmm0,xmm3
2351 pcmpeqd xmm3,xmm5
2352 movdqa XMMWORD[32+rax],xmm2
2353 movdqa xmm2,xmm4
2354 paddd xmm1,xmm0
2355 pcmpeqd xmm0,xmm5
2356 movdqa XMMWORD[48+rax],xmm3
2357 movdqa xmm3,xmm4
2358
2359 paddd xmm2,xmm1
2360 pcmpeqd xmm1,xmm5
2361 movdqa XMMWORD[64+rax],xmm0
2362 movdqa xmm0,xmm4
2363
2364 paddd xmm3,xmm2
2365 pcmpeqd xmm2,xmm5
2366 movdqa XMMWORD[80+rax],xmm1
2367 movdqa xmm1,xmm4
2368
2369 paddd xmm0,xmm3
2370 pcmpeqd xmm3,xmm5
2371 movdqa XMMWORD[96+rax],xmm2
2372 movdqa xmm2,xmm4
2373 movdqa XMMWORD[112+rax],xmm3
2374 jmp NEAR $L$gather
2375
2376ALIGN 32
; One iteration per row: xmm4 and xmm5 are two OR accumulators fed by
; alternating 16-byte chunks of the row, each chunk first ANDed with its
; mask from the stack.
2377$L$gather:
2378 pxor xmm4,xmm4
2379 pxor xmm5,xmm5
2380 movdqa xmm0,XMMWORD[((-128))+r11]
2381 movdqa xmm1,XMMWORD[((-112))+r11]
2382 movdqa xmm2,XMMWORD[((-96))+r11]
2383 pand xmm0,XMMWORD[((-128))+rax]
2384 movdqa xmm3,XMMWORD[((-80))+r11]
2385 pand xmm1,XMMWORD[((-112))+rax]
2386 por xmm4,xmm0
2387 pand xmm2,XMMWORD[((-96))+rax]
2388 por xmm5,xmm1
2389 pand xmm3,XMMWORD[((-80))+rax]
2390 por xmm4,xmm2
2391 por xmm5,xmm3
2392 movdqa xmm0,XMMWORD[((-64))+r11]
2393 movdqa xmm1,XMMWORD[((-48))+r11]
2394 movdqa xmm2,XMMWORD[((-32))+r11]
2395 pand xmm0,XMMWORD[((-64))+rax]
2396 movdqa xmm3,XMMWORD[((-16))+r11]
2397 pand xmm1,XMMWORD[((-48))+rax]
2398 por xmm4,xmm0
2399 pand xmm2,XMMWORD[((-32))+rax]
2400 por xmm5,xmm1
2401 pand xmm3,XMMWORD[((-16))+rax]
2402 por xmm4,xmm2
2403 por xmm5,xmm3
2404 movdqa xmm0,XMMWORD[r11]
2405 movdqa xmm1,XMMWORD[16+r11]
2406 movdqa xmm2,XMMWORD[32+r11]
2407 pand xmm0,XMMWORD[rax]
2408 movdqa xmm3,XMMWORD[48+r11]
2409 pand xmm1,XMMWORD[16+rax]
2410 por xmm4,xmm0
2411 pand xmm2,XMMWORD[32+rax]
2412 por xmm5,xmm1
2413 pand xmm3,XMMWORD[48+rax]
2414 por xmm4,xmm2
2415 por xmm5,xmm3
2416 movdqa xmm0,XMMWORD[64+r11]
2417 movdqa xmm1,XMMWORD[80+r11]
2418 movdqa xmm2,XMMWORD[96+r11]
2419 pand xmm0,XMMWORD[64+rax]
2420 movdqa xmm3,XMMWORD[112+r11]
2421 pand xmm1,XMMWORD[80+rax]
2422 por xmm4,xmm0
2423 pand xmm2,XMMWORD[96+rax]
2424 por xmm5,xmm1
2425 pand xmm3,XMMWORD[112+rax]
2426 por xmm4,xmm2
2427 por xmm5,xmm3
; Merge the two accumulators, then fold the high qword onto the low one
; (pshufd 0x4e swaps the two 64-bit halves) and store the low qword.
2428 por xmm4,xmm5
2429 lea r11,[256+r11]
2430 pshufd xmm0,xmm4,0x4e
2431 por xmm0,xmm4
Adam Langleye9ada862015-05-11 17:20:37 -07002432 movq QWORD[rcx],xmm0
2433 lea rcx,[8+rcx]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002434 sub edx,1
Adam Langleye9ada862015-05-11 17:20:37 -07002435 jnz NEAR $L$gather
David Benjamin4969cc92016-04-22 15:02:23 -04002436
; Drop the frame by restoring the entry rsp kept in r10.
2437 lea rsp,[r10]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002438 DB 0F3h,0C3h ;repret
Adam Langleye9ada862015-05-11 17:20:37 -07002439$L$SEH_end_bn_gather5:
2440
Adam Langleyd9e397b2015-01-22 14:27:53 -08002441ALIGN 64
; $L$inc: constants for bn_gather5's mask generation. The first vector is
; the initial counter {0,0,1,1}; the second is the per-step increment
; {2,2,2,2}. bn_gather5 loads both with movdqa, which the ALIGN 64 above
; satisfies.
David Benjamin4969cc92016-04-22 15:02:23 -04002442$L$inc:
2443 DD 0,0,1,1
2444 DD 2,2,2,2
; NUL-terminated ASCII identification string:
; "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS
;  by <appro@openssl.org>"
Adam Langleyd9e397b2015-01-22 14:27:53 -08002445DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
2446DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
2447DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
2448DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
2449DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
2450DB 112,101,110,115,115,108,46,111,114,103,62,0
Adam Langleye9ada862015-05-11 17:20:37 -07002451EXTERN __imp_RtlVirtualUnwind
Adam Langleyd9e397b2015-01-22 14:27:53 -08002452
2453ALIGN 16
;-----------------------------------------------------------------------
; mul_handler
;
; Win64 language-specific exception handler named by the .xdata records
; that start with DB 9,0,0,0. It repairs the interrupted function's
; CONTEXT so that RtlVirtualUnwind can step out of it, then returns 1.
;
; Arguments used: r8 = CONTEXT being unwound, r9 = DISPATCHER_CONTEXT.
; Offsets below follow the Windows x64 layouts of those structures
; (CONTEXT: Rax 120, Rbx 144, Rsp 152, Rbp 160, Rsi 168, Rdi 176, R9 192,
; R12-R15 216-240, Rip 248; DISPATCHER_CONTEXT: ControlPc 0, ImageBase 8,
; FunctionEntry 16, EstablisherFrame 24, ContextRecord 40,
; HandlerData 56).
;
; HandlerData holds three image-relative addresses per function:
; [0] end of the register pushes, [1] start of the body, [2] epilogue.
;   Rip <  [0]       nothing pushed yet; context Rax is taken as the
;                    entry rsp.
;   Rip <  [1]       registers pushed; context Rax is still the entry rsp.
;   Rip >= [2]       rsp already restored; context Rsp is used as-is.
;   otherwise        the entry rsp is fetched from the function's frame.
;-----------------------------------------------------------------------
Adam Langleye9ada862015-05-11 17:20:37 -07002454mul_handler:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002455 push rsi
2456 push rdi
2457 push rbx
2458 push rbp
2459 push r12
2460 push r13
2461 push r14
2462 push r15
2463 pushfq
; Room for the outgoing call: 32 bytes of shadow space plus four stack
; arguments.
2464 sub rsp,64
2465
; rax = context Rax, rbx = context Rip.
Adam Langleye9ada862015-05-11 17:20:37 -07002466 mov rax,QWORD[120+r8]
2467 mov rbx,QWORD[248+r8]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002468
; rsi = image base, r11 -> HandlerData.
Adam Langleye9ada862015-05-11 17:20:37 -07002469 mov rsi,QWORD[8+r9]
2470 mov r11,QWORD[56+r9]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002471
; Before HandlerData[0]: skip register recovery entirely.
Adam Langleye9ada862015-05-11 17:20:37 -07002472 mov r10d,DWORD[r11]
2473 lea r10,[r10*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002474 cmp rbx,r10
Adam Langleye9ada862015-05-11 17:20:37 -07002475 jb NEAR $L$common_seh_tail
Adam Langleyd9e397b2015-01-22 14:27:53 -08002476
; Before HandlerData[1]: pushed registers are found relative to rax.
Robert Sloana94fe052017-02-21 08:49:28 -08002477 mov r10d,DWORD[4+r11]
2478 lea r10,[r10*1+rsi]
2479 cmp rbx,r10
2480 jb NEAR $L$common_pop_regs
2481
; From here on work from the context Rsp instead.
Adam Langleye9ada862015-05-11 17:20:37 -07002482 mov rax,QWORD[152+r8]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002483
; At or past HandlerData[2]: registers were already popped.
Robert Sloana94fe052017-02-21 08:49:28 -08002484 mov r10d,DWORD[8+r11]
Adam Langleye9ada862015-05-11 17:20:37 -07002485 lea r10,[r10*1+rsi]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002486 cmp rbx,r10
Adam Langleye9ada862015-05-11 17:20:37 -07002487 jae NEAR $L$common_seh_tail
Adam Langleyd9e397b2015-01-22 14:27:53 -08002488
; Inside a body. Code at or below $L$mul_epilogue keeps the entry rsp at
; [rsp + 8 + R9*8] (R9 taken from the context; presumably still the limb
; count there -- TODO confirm). Everything above it keeps the entry rsp
; at [rsp + 40], as bn_from_mont8x does.
Adam Langleye9ada862015-05-11 17:20:37 -07002489 lea r10,[$L$mul_epilogue]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002490 cmp rbx,r10
David Benjamin4969cc92016-04-22 15:02:23 -04002491 ja NEAR $L$body_40
Adam Langleyd9e397b2015-01-22 14:27:53 -08002492
Adam Langleye9ada862015-05-11 17:20:37 -07002493 mov r10,QWORD[192+r8]
2494 mov rax,QWORD[8+r10*8+rax]
David Benjamin4969cc92016-04-22 15:02:23 -04002495
Robert Sloana94fe052017-02-21 08:49:28 -08002496 jmp NEAR $L$common_pop_regs
Adam Langleyd9e397b2015-01-22 14:27:53 -08002497
Adam Langleye9ada862015-05-11 17:20:37 -07002498$L$body_40:
2499 mov rax,QWORD[40+rax]
; rax = entry rsp of the interrupted function: copy the six registers it
; pushed back into the context record.
Robert Sloana94fe052017-02-21 08:49:28 -08002500$L$common_pop_regs:
Adam Langleye9ada862015-05-11 17:20:37 -07002501 mov rbx,QWORD[((-8))+rax]
2502 mov rbp,QWORD[((-16))+rax]
2503 mov r12,QWORD[((-24))+rax]
2504 mov r13,QWORD[((-32))+rax]
2505 mov r14,QWORD[((-40))+rax]
2506 mov r15,QWORD[((-48))+rax]
2507 mov QWORD[144+r8],rbx
2508 mov QWORD[160+r8],rbp
2509 mov QWORD[216+r8],r12
2510 mov QWORD[224+r8],r13
2511 mov QWORD[232+r8],r14
2512 mov QWORD[240+r8],r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002513
; rdi/rsi were saved by the WIN64 prologue at [entry rsp + 8] and
; [entry rsp + 16]; write them and the recovered rsp into the context.
Adam Langleye9ada862015-05-11 17:20:37 -07002514$L$common_seh_tail:
2515 mov rdi,QWORD[8+rax]
2516 mov rsi,QWORD[16+rax]
2517 mov QWORD[152+r8],rax
2518 mov QWORD[168+r8],rsi
2519 mov QWORD[176+r8],rdi
Adam Langleyd9e397b2015-01-22 14:27:53 -08002520
; Copy the patched CONTEXT (154 qwords = 1232 bytes) into the dispatcher's
; ContextRecord. The DD encodes the bytes FC F3 48 A5 = cld / rep movsq.
Adam Langleye9ada862015-05-11 17:20:37 -07002521 mov rdi,QWORD[40+r9]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002522 mov rsi,r8
2523 mov ecx,154
Adam Langleye9ada862015-05-11 17:20:37 -07002524 DD 0xa548f3fc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002525
; RtlVirtualUnwind(rcx = 0 handler type, rdx = ImageBase, r8 = ControlPc,
;   r9 = FunctionEntry, [rsp+32] = ContextRecord, [rsp+40] = &HandlerData,
;   [rsp+48] = &EstablisherFrame, [rsp+56] = NULL context pointers)
2526 mov rsi,r9
2527 xor rcx,rcx
Adam Langleye9ada862015-05-11 17:20:37 -07002528 mov rdx,QWORD[8+rsi]
2529 mov r8,QWORD[rsi]
2530 mov r9,QWORD[16+rsi]
2531 mov r10,QWORD[40+rsi]
2532 lea r11,[56+rsi]
2533 lea r12,[24+rsi]
2534 mov QWORD[32+rsp],r10
2535 mov QWORD[40+rsp],r11
2536 mov QWORD[48+rsp],r12
2537 mov QWORD[56+rsp],rcx
2538 call QWORD[__imp_RtlVirtualUnwind]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002539
; Return 1 (ExceptionContinueSearch in the Windows disposition enum).
2540 mov eax,1
2541 add rsp,64
2542 popfq
2543 pop r15
2544 pop r14
2545 pop r13
2546 pop r12
2547 pop rbp
2548 pop rbx
2549 pop rdi
2550 pop rsi
2551 DB 0F3h,0C3h ;repret
Adam Langleyd9e397b2015-01-22 14:27:53 -08002552
Adam Langleye9ada862015-05-11 17:20:37 -07002553
; .pdata: Win64 function table. Each entry is three image-relative
; addresses: start of the function's code range, end of the range, and
; the matching unwind-info record in .xdata.
2554section .pdata rdata align=4
Adam Langleyd9e397b2015-01-22 14:27:53 -08002555ALIGN 4
Adam Langleye9ada862015-05-11 17:20:37 -07002556 DD $L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
2557 DD $L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
2558 DD $L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002559
Adam Langleye9ada862015-05-11 17:20:37 -07002560 DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
2561 DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
2562 DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002563
Adam Langleye9ada862015-05-11 17:20:37 -07002564 DD $L$SEH_begin_bn_power5 wrt ..imagebase
2565 DD $L$SEH_end_bn_power5 wrt ..imagebase
2566 DD $L$SEH_info_bn_power5 wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002567
Adam Langleye9ada862015-05-11 17:20:37 -07002568 DD $L$SEH_begin_bn_from_mont8x wrt ..imagebase
2569 DD $L$SEH_end_bn_from_mont8x wrt ..imagebase
2570 DD $L$SEH_info_bn_from_mont8x wrt ..imagebase
2571 DD $L$SEH_begin_bn_gather5 wrt ..imagebase
2572 DD $L$SEH_end_bn_gather5 wrt ..imagebase
2573 DD $L$SEH_info_bn_gather5 wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002574
; .xdata: unwind-info records referenced from .pdata.
;
; The first four records share one shape:
;   DB 9,0,0,0   header byte 9 = version 1 with the exception-handler
;                flag set; prologue size, unwind-code count and frame
;                register are all zero
;   DD handler   image-relative address of mul_handler
;   DD a,b,c     handler data read by mul_handler at offsets 0, 4 and 8:
;                end of register pushes, start of body, epilogue label
Adam Langleye9ada862015-05-11 17:20:37 -07002575section .xdata rdata align=8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002576ALIGN 8
; Note: the first two handler-data entries are both $L$mul_body here.
Adam Langleye9ada862015-05-11 17:20:37 -07002577$L$SEH_info_bn_mul_mont_gather5:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002578DB 9,0,0,0
Adam Langleye9ada862015-05-11 17:20:37 -07002579 DD mul_handler wrt ..imagebase
Robert Sloana94fe052017-02-21 08:49:28 -08002580 DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002581ALIGN 8
Adam Langleye9ada862015-05-11 17:20:37 -07002582$L$SEH_info_bn_mul4x_mont_gather5:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002583DB 9,0,0,0
Adam Langleye9ada862015-05-11 17:20:37 -07002584 DD mul_handler wrt ..imagebase
Robert Sloana94fe052017-02-21 08:49:28 -08002585 DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002586ALIGN 8
Adam Langleye9ada862015-05-11 17:20:37 -07002587$L$SEH_info_bn_power5:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002588DB 9,0,0,0
Adam Langleye9ada862015-05-11 17:20:37 -07002589 DD mul_handler wrt ..imagebase
Robert Sloana94fe052017-02-21 08:49:28 -08002590 DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002591ALIGN 8
Adam Langleye9ada862015-05-11 17:20:37 -07002592$L$SEH_info_bn_from_mont8x:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002593DB 9,0,0,0
Adam Langleye9ada862015-05-11 17:20:37 -07002594 DD mul_handler wrt ..imagebase
Robert Sloana94fe052017-02-21 08:49:28 -08002595 DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
Adam Langleyd9e397b2015-01-22 14:27:53 -08002596ALIGN 8
; bn_gather5 uses plain unwind codes instead of a handler; they describe
; its DB-encoded prologue byte for byte:
;   0x01,0x0b,0x03,0x0a  version 1 / no flags, prologue is 11 bytes,
;                        3 code slots, frame register 10 (r10), offset 0
;   0x0b,0x01,0x21,0x00  at prologue offset 11: large stack allocation of
;                        0x21 * 8 = 0x108 bytes (sub rsp,0x108)
;   0x04,0xa3,0x00,0x00  at prologue offset 4: establish the frame
;                        register (lea r10,[rsp]); the last two bytes
;                        pad the code array to an even slot count
Adam Langleye9ada862015-05-11 17:20:37 -07002597$L$SEH_info_bn_gather5:
David Benjamin4969cc92016-04-22 15:02:23 -04002598DB 0x01,0x0b,0x03,0x0a
2599DB 0x0b,0x01,0x21,0x00
2600DB 0x04,0xa3,0x00,0x00
Adam Langleyd9e397b2015-01-22 14:27:53 -08002601ALIGN 8