; blob: b1a4d59429df194200183b10044e821fb8dc4ad1
; Section selection, keyed on the assembler's output format:
;   obj   - segment "code", use32, class=code, 64-byte aligned
;   win32 - .text aligned to 64 bytes; Yasm older than 1.1.0 is rejected,
;           and a non-Yasm assembler additionally defines $@feat.00
;   other - plain .text code section
%ifidn __OUTPUT_FORMAT__,obj
section code use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes @feat.00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
$@feat.00 equ 1
%endif
section .text code align=64
%else
section .text code
%endif
; The capability vector is declared with a `common` directive in this file
; rather than imported, so the extern stays commented out.
;extern _OPENSSL_ia32cap_P
;-----------------------------------------------------------------------
; _bn_mul_mont - word-serial Montgomery multiplication, 32-bit x86.
;
; ABI:  all six arguments on the stack, caller cleans up (plain `ret`).
;       ebp/ebx/esi/edi are saved on entry and restored on exit.
; In (offsets are relative to esp after the four pushes):
;       [esp+20] arg1  result vector, written by the tail
;       [esp+24] arg2  first multiplicand
;       [esp+28] arg3  second multiplicand
;       [esp+32] arg4  vector subtracted in the tail (the modulus)
;       [esp+36] arg5  pointer; only the dword it points at is used
;       [esp+40] arg6  word count ("num")
;       NOTE(review): presumably the C prototype is
;       bn_mul_mont(rp, ap, bp, np, n0, num) - confirm against the
;       caller's header. The comments below use those names.
; Out:  eax = 0 if num < 4 (nothing is computed), otherwise eax = 1.
; Clobbers: eax, ecx, edx, flags; mm0-mm7 on the SSE2 path (emms is
;       issued before leaving that path).
;
; Private frame (esp after allocation, 64-byte aligned):
;       [esp+ 0]  num-1 (squaring path only)
;       [esp+ 4]  rp        [esp+ 8]  ap
;       [esp+12]  bp, later the running bp pointer (mul path) or the
;                 outer index i (squaring path)
;       [esp+16]  np        [esp+20]  n0 value (*arg5)
;       [esp+24]  stack pointer to restore on exit
;       [esp+28]  &bp[num], end marker for the integer mul path
;       [esp+32]  tp[0 .. num+1], the temporary accumulator
;-----------------------------------------------------------------------
global _bn_mul_mont
align 16
_bn_mul_mont:
L$_bn_mul_mont_begin:
        push    ebp
        push    ebx
        push    esi
        push    edi
        xor     eax,eax                 ; return value for the early exit
        mov     edi,DWORD [40+esp]      ; edi = num
        cmp     edi,4
        jl      NEAR L$000just_leave    ; signed compare: num < 4 -> return 0
        lea     esi,[20+esp]            ; esi = address of the argument block
        lea     edx,[24+esp]            ; edx = address of the arg2 slot
        ; ebp = esp - 32 - 4*(num+2): room for the fixed slots plus tp[].
        add     edi,2
        neg     edi
        lea     ebp,[edi*4+esp-32]
        neg     edi                     ; edi = num+2 again
        ; Make ebp congruent to edx modulo 2048 ...
        mov     eax,ebp
        sub     eax,edx
        and     eax,2047
        sub     ebp,eax
        ; ... then force bit 11 of ebp to differ from bit 11 of edx, so the
        ; two addresses sit 2048 apart modulo 4096.
        ; NOTE(review): presumably a cache-placement measure - confirm.
        xor     edx,ebp
        and     edx,2048
        xor     edx,2048
        sub     ebp,edx
        and     ebp,-64                 ; 64-byte align the frame
        ; Move esp down to ebp in 4096-byte steps, reading one dword at
        ; each step so every page in between is touched in order.
        mov     eax,esp
        sub     eax,ebp
        and     eax,-4096               ; whole pages between esp and ebp
        mov     edx,esp                 ; edx = stack pointer to restore later
        lea     esp,[eax*1+ebp]
        mov     eax,DWORD [esp]         ; probe
        cmp     esp,ebp
        ja      NEAR L$001page_walk
        jmp     NEAR L$002page_walk_done
align 16
L$001page_walk:
        lea     esp,[esp-4096]
        mov     eax,DWORD [esp]         ; probe
        cmp     esp,ebp
        ja      NEAR L$001page_walk
L$002page_walk_done:
        ; Copy the arguments into the private frame.
        mov     eax,DWORD [esi]         ; rp
        mov     ebx,DWORD [4+esi]       ; ap
        mov     ecx,DWORD [8+esi]       ; bp
        mov     ebp,DWORD [12+esi]      ; np
        mov     esi,DWORD [16+esi]      ; arg5 pointer
        mov     esi,DWORD [esi]         ; n0 = *arg5
        mov     DWORD [4+esp],eax
        mov     DWORD [8+esp],ebx
        mov     DWORD [12+esp],ecx
        mov     DWORD [16+esp],ebp
        mov     DWORD [20+esp],esi
        lea     ebx,[edi-3]             ; ebx = num-1 (edi is still num+2)
        mov     DWORD [24+esp],edx      ; saved stack pointer
        ; Dispatch on bit 26 of the first capability dword: clear selects
        ; the integer-only code, set selects the pmuludq/paddq code.
        lea     eax,[_OPENSSL_ia32cap_P]
        bt      DWORD [eax],26
        jnc     NEAR L$003non_sse2

        ;---------------------------------------------------------------
        ; SSE2 path. mm7 = low-dword mask, mm4 = bp[i], mm5 = m (the
        ; per-row reduction multiplier), mm2 = ap*bp carry chain,
        ; mm3 = np*m carry chain. ecx = j, edx = i.
        ;---------------------------------------------------------------
        mov     eax,-1
        movd    mm7,eax                 ; mm7 = 0x00000000FFFFFFFF
        mov     esi,DWORD [8+esp]       ; esi = ap
        mov     edi,DWORD [12+esp]      ; edi = bp
        mov     ebp,DWORD [16+esp]      ; ebp = np
        xor     edx,edx                 ; i = 0
        xor     ecx,ecx                 ; j = 0
        movd    mm4,DWORD [edi]         ; bp[0]
        movd    mm5,DWORD [esi]         ; ap[0]
        movd    mm3,DWORD [ebp]         ; np[0]
        pmuludq mm5,mm4                 ; ap[0]*bp[0]
        movq    mm2,mm5
        movq    mm0,mm5
        pand    mm0,mm7                 ; low word of the product
        pmuludq mm5,[20+esp]            ; m = low word * n0
        pmuludq mm3,mm5                 ; np[0]*m
        paddq   mm3,mm0
        movd    mm1,DWORD [4+ebp]       ; np[1]
        movd    mm0,DWORD [4+esi]       ; ap[1]
        psrlq   mm2,32                  ; keep only the carries
        psrlq   mm3,32
        inc     ecx
align 16
L$0041st:
        ; First row: tp[j-1] = low(ap[j]*bp[0] + np[j]*m + carries).
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        pand    mm0,mm7
        movd    mm1,DWORD [4+ecx*4+ebp] ; np[j+1]
        paddq   mm3,mm0
        movd    mm0,DWORD [4+ecx*4+esi] ; ap[j+1]
        psrlq   mm2,32
        movd    DWORD [28+ecx*4+esp],mm3
        psrlq   mm3,32
        lea     ecx,[1+ecx]
        cmp     ecx,ebx
        jl      NEAR L$0041st
        ; Last word of the first row (j = num-1), then the two top words.
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        pand    mm0,mm7
        paddq   mm3,mm0
        movd    DWORD [28+ecx*4+esp],mm3
        psrlq   mm2,32
        psrlq   mm3,32
        paddq   mm3,mm2
        movq    [32+ebx*4+esp],mm3      ; tp[num-1], tp[num] in one 64-bit store
        inc     edx                     ; i = 1
L$005outer:
        ; Row i: same as the first row, but tp[] is added in (via mm6).
        xor     ecx,ecx
        movd    mm4,DWORD [edx*4+edi]   ; bp[i]
        movd    mm5,DWORD [esi]         ; ap[0]
        movd    mm6,DWORD [32+esp]      ; tp[0]
        movd    mm3,DWORD [ebp]         ; np[0]
        pmuludq mm5,mm4
        paddq   mm5,mm6                 ; ap[0]*bp[i] + tp[0]
        movq    mm0,mm5
        movq    mm2,mm5
        pand    mm0,mm7
        pmuludq mm5,[20+esp]            ; m for this row
        pmuludq mm3,mm5
        paddq   mm3,mm0
        movd    mm6,DWORD [36+esp]      ; tp[1]
        movd    mm1,DWORD [4+ebp]       ; np[1]
        movd    mm0,DWORD [4+esi]       ; ap[1]
        psrlq   mm2,32
        psrlq   mm3,32
        paddq   mm2,mm6
        inc     ecx
        dec     ebx                     ; ebx becomes the inner countdown
L$006inner:
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        movd    mm6,DWORD [36+ecx*4+esp] ; tp[j+1]
        pand    mm0,mm7
        movd    mm1,DWORD [4+ecx*4+ebp] ; np[j+1]
        paddq   mm3,mm0
        movd    mm0,DWORD [4+ecx*4+esi] ; ap[j+1]
        psrlq   mm2,32
        movd    DWORD [28+ecx*4+esp],mm3 ; tp[j-1]
        psrlq   mm3,32
        paddq   mm2,mm6
        dec     ebx
        lea     ecx,[1+ecx]             ; lea leaves the dec flags intact
        jnz     NEAR L$006inner
        mov     ebx,ecx                 ; restore ebx = num-1
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        pand    mm0,mm7
        paddq   mm3,mm0
        movd    DWORD [28+ecx*4+esp],mm3
        psrlq   mm2,32
        psrlq   mm3,32
        movd    mm6,DWORD [36+ebx*4+esp] ; previous tp[num]
        paddq   mm3,mm2
        paddq   mm3,mm6
        movq    [32+ebx*4+esp],mm3      ; tp[num-1], tp[num]
        lea     edx,[1+edx]
        cmp     edx,ebx
        jle     NEAR L$005outer         ; while i <= num-1
        emms                            ; leave MMX state clean
        jmp     NEAR L$007common_tail

        ;---------------------------------------------------------------
        ; Integer path. Squaring code is used only when ap == bp and num
        ; is even; otherwise the general multiply code runs.
        ;---------------------------------------------------------------
align 16
L$003non_sse2:
        mov     esi,DWORD [8+esp]       ; esi = ap
        lea     ebp,[1+ebx]             ; ebp = num
        mov     edi,DWORD [12+esp]      ; edi = bp
        xor     ecx,ecx                 ; j = 0
        mov     edx,esi
        and     ebp,1                   ; num & 1
        sub     edx,edi                 ; ap - bp
        lea     eax,[4+ebx*4+edi]       ; eax = &bp[num]
        or      ebp,edx                 ; ZF set iff num even and ap == bp
        mov     edi,DWORD [edi]         ; edi = bp[0] (mov keeps the flags)
        jz      NEAR L$008bn_sqr_mont
        mov     DWORD [28+esp],eax      ; end marker for the bp walk
        mov     eax,DWORD [esi]         ; ap[0]
        xor     edx,edx                 ; carry = 0
align 16
L$009mull:
        ; First row: tp[j] = low(ap[j]*bp[0] + carry).
        mov     ebp,edx
        mul     edi
        add     ebp,eax
        lea     ecx,[1+ecx]
        adc     edx,0
        mov     eax,DWORD [ecx*4+esi]
        cmp     ecx,ebx
        mov     DWORD [28+ecx*4+esp],ebp
        jl      NEAR L$009mull
        ; Last word, then start the reduction: m = n0 * tp[0].
        mov     ebp,edx
        mul     edi
        mov     edi,DWORD [20+esp]      ; n0
        add     eax,ebp
        mov     esi,DWORD [16+esp]      ; esi = np
        adc     edx,0
        imul    edi,DWORD [32+esp]      ; edi = m
        mov     DWORD [32+ebx*4+esp],eax ; tp[num-1]
        xor     ecx,ecx
        mov     DWORD [36+ebx*4+esp],edx ; tp[num]
        mov     DWORD [40+ebx*4+esp],ecx ; tp[num+1] = 0
        mov     eax,DWORD [esi]         ; np[0]
        mul     edi
        add     eax,DWORD [32+esp]      ; only the carry out is kept
        mov     eax,DWORD [4+esi]       ; np[1]
        adc     edx,0
        inc     ecx
        jmp     NEAR L$0102ndmadd
align 16
L$0111stmadd:
        ; tp[j] = low(tp[j] + ap[j]*bp[i] + carry).
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ecx*4+esp]
        lea     ecx,[1+ecx]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [ecx*4+esi]
        adc     edx,0
        cmp     ecx,ebx
        mov     DWORD [28+ecx*4+esp],ebp
        jl      NEAR L$0111stmadd
        ; Top words of the row, then m = n0 * tp[0] for the reduction.
        mov     ebp,edx
        mul     edi
        add     eax,DWORD [32+ebx*4+esp]
        mov     edi,DWORD [20+esp]      ; n0
        adc     edx,0
        mov     esi,DWORD [16+esp]      ; esi = np
        add     ebp,eax
        adc     edx,0
        imul    edi,DWORD [32+esp]      ; edi = m
        xor     ecx,ecx
        add     edx,DWORD [36+ebx*4+esp]
        mov     DWORD [32+ebx*4+esp],ebp ; tp[num-1]
        adc     ecx,0
        mov     eax,DWORD [esi]         ; np[0]
        mov     DWORD [36+ebx*4+esp],edx ; tp[num]
        mov     DWORD [40+ebx*4+esp],ecx ; tp[num+1] = carry
        mul     edi
        add     eax,DWORD [32+esp]      ; only the carry out is kept
        mov     eax,DWORD [4+esi]       ; np[1]
        adc     edx,0
        mov     ecx,1
align 16
L$0102ndmadd:
        ; Reduction: tp[j-1] = low(tp[j] + np[j]*m + carry), which also
        ; shifts tp[] down by one word.
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ecx*4+esp]
        lea     ecx,[1+ecx]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [ecx*4+esi]
        adc     edx,0
        cmp     ecx,ebx
        mov     DWORD [24+ecx*4+esp],ebp
        jl      NEAR L$0102ndmadd
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ebx*4+esp]
        adc     edx,0
        add     ebp,eax
        adc     edx,0
        mov     DWORD [28+ebx*4+esp],ebp ; tp[num-2]
        xor     eax,eax
        mov     ecx,DWORD [12+esp]      ; current bp pointer
        add     edx,DWORD [36+ebx*4+esp]
        adc     eax,DWORD [40+ebx*4+esp]
        lea     ecx,[4+ecx]             ; advance to the next bp word
        mov     DWORD [32+ebx*4+esp],edx ; tp[num-1]
        cmp     ecx,DWORD [28+esp]      ; reached &bp[num]?
        mov     DWORD [36+ebx*4+esp],eax ; tp[num]
        je      NEAR L$007common_tail
        mov     edi,DWORD [ecx]         ; edi = next bp word
        mov     esi,DWORD [8+esp]       ; esi = ap
        mov     DWORD [12+esp],ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     eax,DWORD [esi]         ; ap[0]
        jmp     NEAR L$0111stmadd

        ;---------------------------------------------------------------
        ; Squaring path (ap == bp, num even). Cross products are computed
        ; once and doubled; ebx carries the bit shifted out by doubling.
        ;---------------------------------------------------------------
align 16
L$008bn_sqr_mont:
        mov     DWORD [esp],ebx         ; [esp] = num-1
        mov     DWORD [12+esp],ecx      ; outer index i = 0
        mov     eax,edi                 ; edi = ap[0] (loaded through bp)
        mul     edi                     ; ap[0]^2
        mov     DWORD [32+esp],eax      ; tp[0]
        mov     ebx,edx
        shr     edx,1                   ; halve the carry; it is doubled below
        and     ebx,1                   ; bit dropped by the halving
        inc     ecx
align 16
L$012sqr:
        ; tp[j] = low(2*(ap[j]*ap[0] + carry)) + ebx.
        mov     eax,DWORD [ecx*4+esi]
        mov     ebp,edx
        mul     edi
        add     eax,ebp
        lea     ecx,[1+ecx]
        adc     edx,0
        lea     ebp,[eax*2+ebx]
        shr     eax,31                  ; bit shifted out by the doubling
        cmp     ecx,DWORD [esp]
        mov     ebx,eax
        mov     DWORD [28+ecx*4+esp],ebp
        jl      NEAR L$012sqr
        ; Last word of the row, top words, and m = n0 * tp[0].
        mov     eax,DWORD [ecx*4+esi]
        mov     ebp,edx
        mul     edi
        add     eax,ebp
        mov     edi,DWORD [20+esp]      ; n0
        adc     edx,0
        mov     esi,DWORD [16+esp]      ; esi = np
        lea     ebp,[eax*2+ebx]
        imul    edi,DWORD [32+esp]      ; edi = m
        shr     eax,31
        mov     DWORD [32+ecx*4+esp],ebp ; tp[num-1]
        lea     ebp,[edx*2+eax]
        mov     eax,DWORD [esi]         ; np[0]
        shr     edx,31
        mov     DWORD [36+ecx*4+esp],ebp ; tp[num]
        mov     DWORD [40+ecx*4+esp],edx ; tp[num+1]
        mul     edi
        add     eax,DWORD [32+esp]      ; only the carry out is kept
        mov     ebx,ecx                 ; ebx = num-1
        adc     edx,0
        mov     eax,DWORD [4+esi]       ; np[1]
        mov     ecx,1
align 16
L$0133rdmadd:
        ; Reduction, two words per iteration; results land one word lower.
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ecx*4+esp]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [4+ecx*4+esi]
        adc     edx,0
        mov     DWORD [28+ecx*4+esp],ebp
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [36+ecx*4+esp]
        lea     ecx,[2+ecx]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [ecx*4+esi]
        adc     edx,0
        cmp     ecx,ebx
        mov     DWORD [24+ecx*4+esp],ebp
        jl      NEAR L$0133rdmadd
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ebx*4+esp]
        adc     edx,0
        add     ebp,eax
        adc     edx,0
        mov     DWORD [28+ebx*4+esp],ebp ; tp[num-2]
        mov     ecx,DWORD [12+esp]      ; ecx = i
        xor     eax,eax
        mov     esi,DWORD [8+esp]       ; esi = ap
        add     edx,DWORD [36+ebx*4+esp]
        adc     eax,DWORD [40+ebx*4+esp]
        mov     DWORD [32+ebx*4+esp],edx ; tp[num-1]
        cmp     ecx,ebx                 ; i == num-1 -> all rows done
        mov     DWORD [36+ebx*4+esp],eax ; tp[num]
        je      NEAR L$007common_tail
        ; Next row: add the diagonal term ap[i]^2 into tp[i].
        mov     edi,DWORD [4+ecx*4+esi] ; edi = ap[i+1]
        lea     ecx,[1+ecx]             ; i += 1
        mov     eax,edi
        mov     DWORD [12+esp],ecx
        mul     edi
        add     eax,DWORD [32+ecx*4+esp]
        adc     edx,0
        mov     DWORD [32+ecx*4+esp],eax
        xor     ebp,ebp
        cmp     ecx,ebx
        lea     ecx,[1+ecx]             ; lea keeps the cmp flags
        je      NEAR L$014sqrlast       ; last row has no cross products
        mov     ebx,edx
        shr     edx,1
        and     ebx,1
align 16
L$015sqradd:
        ; tp[j] += 2*(ap[j]*ap[i] + carry) + ebx, with eax collecting the
        ; carries out of the doubled word.
        mov     eax,DWORD [ecx*4+esi]
        mov     ebp,edx
        mul     edi
        add     eax,ebp
        lea     ebp,[eax*1+eax]
        adc     edx,0
        shr     eax,31
        add     ebp,DWORD [32+ecx*4+esp]
        lea     ecx,[1+ecx]
        adc     eax,0
        add     ebp,ebx
        adc     eax,0
        cmp     ecx,DWORD [esp]
        mov     DWORD [28+ecx*4+esp],ebp
        mov     ebx,eax
        jle     NEAR L$015sqradd
        mov     ebp,edx
        add     edx,edx                 ; double the final carry
        shr     ebp,31
        add     edx,ebx
        adc     ebp,0
L$014sqrlast:
        ; Fold edx:ebp into the two top words, then start the reduction.
        mov     edi,DWORD [20+esp]      ; n0
        mov     esi,DWORD [16+esp]      ; esi = np
        imul    edi,DWORD [32+esp]      ; edi = m
        add     edx,DWORD [32+ecx*4+esp]
        mov     eax,DWORD [esi]         ; np[0]
        adc     ebp,0
        mov     DWORD [32+ecx*4+esp],edx
        mov     DWORD [36+ecx*4+esp],ebp
        mul     edi
        add     eax,DWORD [32+esp]      ; only the carry out is kept
        lea     ebx,[ecx-1]
        adc     edx,0
        mov     ecx,1
        mov     eax,DWORD [4+esi]       ; np[1]
        jmp     NEAR L$0133rdmadd

        ;---------------------------------------------------------------
        ; Common tail: rp = tp - np, then pick tp or rp as the copy
        ; source with a mask (no branch), copy it into rp and overwrite
        ; tp[].
        ;---------------------------------------------------------------
align 16
L$007common_tail:
        mov     ebp,DWORD [16+esp]      ; ebp = np
        mov     edi,DWORD [4+esp]       ; edi = rp
        lea     esi,[32+esp]            ; esi = tp
        mov     eax,DWORD [esi]         ; tp[0]
        mov     ecx,ebx                 ; ecx = num-1
        xor     edx,edx                 ; index = 0, and CF = 0 for the first sbb
align 16
L$016sub:
        sbb     eax,DWORD [edx*4+ebp]
        mov     DWORD [edx*4+edi],eax
        dec     ecx                     ; dec does not modify CF
        mov     eax,DWORD [4+edx*4+esi]
        lea     edx,[1+edx]
        jge     NEAR L$016sub
        sbb     eax,0                   ; eax = tp[num] - borrow, used as a mask
                                        ; (assumes the result is 0 or -1)
        and     esi,eax
        not     eax
        mov     ebp,edi
        and     ebp,eax
        or      esi,ebp                 ; esi = mask ? tp : rp
align 16
L$017copy:
        mov     eax,DWORD [ebx*4+esi]
        mov     DWORD [ebx*4+edi],eax
        mov     DWORD [32+ebx*4+esp],ecx ; overwrite tp[ebx] with ecx
        dec     ebx
        jge     NEAR L$017copy
        mov     esp,DWORD [24+esp]      ; drop the private frame
        mov     eax,1
L$000just_leave:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret
; NUL-terminated ASCII banner:
; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
db 111,114,103,62,0
; Capability vector declared as a 16-byte common symbol; _bn_mul_mont tests
; bit 26 of its first dword to choose between its two code paths.
segment .bss
common _OPENSSL_ia32cap_P 16