/* Web-viewer header residue: source blob 5de98a3d61589cd32843673c4f2d0497f7a36863. */
/* Assemble the contents of this file only for 32-bit x86 targets. */
#if defined(__i386__)
.file	"chacha-x86.S"
.text
/*
 * _ChaCha20_ctr32 -- scalar (integer-register) ChaCha20 path plus the
 * dispatcher to the SSSE3 path.
 *
 * Stack arguments as read by the code (offsets are relative to %esp after
 * the four register pushes in the prologue):
 *   20(%esp)  destination pointer   (bytes are stored through it)
 *   24(%esp)  source pointer        (bytes are loaded through it)
 *   28(%esp)  length in bytes       (zero returns immediately)
 *   32(%esp)  pointer to 8 dwords   (copied to 80..108(%esp); presumably the key)
 *   36(%esp)  pointer to 4 dwords   (copied to 112..124(%esp); the first dword
 *                                    is incremented once per 64-byte block)
 *
 * %ebp, %ebx, %esi and %edi are saved and restored. %eax, %ecx, %edx and
 * the flags are clobbered.
 *
 * Frame layout of the scalar path after "subl $132,%esp":
 *     0..60   16-dword working state
 *    80..108  copy of the 8 dwords from argument 3
 *   112..124  copy of the 4 dwords from argument 4 (block counter first)
 *   128       round-loop counter
 *   152/156/160  the caller's destination/source/length argument slots,
 *                reused as running destination/source/remaining-length.
 *
 * The immediates 1634760805, 857760878, 2036477234 and 1797285236 are
 * 0x61707865, 0x3320646e, 0x79622d32 and 0x6b206574, i.e. the ASCII text
 * "expand 32-byte k" read as little-endian dwords.
 */
.globl	_ChaCha20_ctr32
.private_extern	_ChaCha20_ctr32
.align	4
_ChaCha20_ctr32:
L_ChaCha20_ctr32_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	/* Nothing to do when the length argument is zero. */
	xorl	%eax,%eax
	cmpl	28(%esp),%eax
	je	L000no_data
	/* call/pop leaves the address of Lpic_point in %eax for PIC addressing. */
	call	Lpic_point
Lpic_point:
	popl	%eax
	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-Lpic_point(%eax),%ebp
	/*
	 * Take the SSSE3 path only when bit 24 of capability word 0 and bit 9
	 * of capability word 1 are both set (presumably FXSR and SSSE3 --
	 * confirm against the OPENSSL_ia32cap_P layout). %eax must still hold
	 * the address of Lpic_point when jumping to Lssse3_shortcut.
	 */
	testl	$16777216,(%ebp)
	jz	L001x86
	testl	$512,4(%ebp)
	jz	L001x86
	jmp	Lssse3_shortcut
L001x86:
	movl	32(%esp),%esi
	movl	36(%esp),%edi
	subl	$132,%esp
	/* Copy the 8 dwords behind argument 3 to 80..108(%esp). */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	%eax,80(%esp)
	movl	%ebx,84(%esp)
	movl	%ecx,88(%esp)
	movl	%edx,92(%esp)
	movl	16(%esi),%eax
	movl	20(%esi),%ebx
	movl	24(%esi),%ecx
	movl	28(%esi),%edx
	movl	%eax,96(%esp)
	movl	%ebx,100(%esp)
	movl	%ecx,104(%esp)
	movl	%edx,108(%esp)
	/*
	 * Copy the 4 dwords behind argument 4 to 112..124(%esp). The first
	 * dword is stored minus one because L002entry adds one before use.
	 */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%edx
	subl	$1,%eax
	movl	%eax,112(%esp)
	movl	%ebx,116(%esp)
	movl	%ecx,120(%esp)
	movl	%edx,124(%esp)
	jmp	L002entry
.align	4,0x90
L003outer_loop:
	/* Save the advanced source/destination pointers and remaining length. */
	movl	%ebx,156(%esp)
	movl	%eax,152(%esp)
	movl	%ecx,160(%esp)
L002entry:
	/*
	 * Build the working state for one block. Words that the round loop
	 * keeps in registers at entry (%eax, %ebp, %ecx, %esi, %edx) are not
	 * written to the working area here.
	 */
	movl	$1634760805,%eax
	movl	$857760878,4(%esp)
	movl	$2036477234,8(%esp)
	movl	$1797285236,12(%esp)
	movl	84(%esp),%ebx
	movl	88(%esp),%ebp
	movl	104(%esp),%ecx
	movl	108(%esp),%esi
	movl	116(%esp),%edx
	movl	120(%esp),%edi
	movl	%ebx,20(%esp)
	movl	%ebp,24(%esp)
	movl	%ecx,40(%esp)
	movl	%esi,44(%esp)
	movl	%edx,52(%esp)
	movl	%edi,56(%esp)
	movl	92(%esp),%ebx
	movl	124(%esp),%edi
	movl	112(%esp),%edx
	movl	80(%esp),%ebp
	movl	96(%esp),%ecx
	movl	100(%esp),%esi
	/* Advance the block counter and write it back for the next block. */
	addl	$1,%edx
	movl	%ebx,28(%esp)
	movl	%edi,60(%esp)
	movl	%edx,112(%esp)
	/* 10 loop iterations; each one contains eight add/xor/rotate groups. */
	movl	$10,%ebx
	jmp	L004loop
.align	4,0x90
L004loop:
	/*
	 * The instruction order below is hand-scheduled: loads and spills of
	 * the next group are interleaved with the current one. %ebx is spilled
	 * to 128(%esp) as the loop counter and reused as a state register.
	 * Rotation amounts are 16, 12, 8 and 7 within each group.
	 */
	addl	%ebp,%eax
	movl	%ebx,128(%esp)
	movl	%ebp,%ebx
	xorl	%eax,%edx
	roll	$16,%edx
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	52(%esp),%edi
	roll	$12,%ebx
	movl	20(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,48(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,32(%esp)
	roll	$16,%edi
	movl	%ebx,16(%esp)
	addl	%edi,%esi
	movl	40(%esp),%ecx
	xorl	%esi,%ebp
	movl	56(%esp),%edx
	roll	$12,%ebp
	movl	24(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,52(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,36(%esp)
	roll	$16,%edx
	movl	%ebp,20(%esp)
	addl	%edx,%ecx
	movl	44(%esp),%esi
	xorl	%ecx,%ebx
	movl	60(%esp),%edi
	roll	$12,%ebx
	movl	28(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,56(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,24(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	roll	$12,%ebp
	movl	20(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,%edx
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	roll	$16,%edx
	movl	%ebp,28(%esp)
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	48(%esp),%edi
	roll	$12,%ebx
	movl	24(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,60(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,40(%esp)
	roll	$16,%edi
	movl	%ebx,20(%esp)
	addl	%edi,%esi
	movl	32(%esp),%ecx
	xorl	%esi,%ebp
	movl	52(%esp),%edx
	roll	$12,%ebp
	movl	28(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,48(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,44(%esp)
	roll	$16,%edx
	movl	%ebp,24(%esp)
	addl	%edx,%ecx
	movl	36(%esp),%esi
	xorl	%ecx,%ebx
	movl	56(%esp),%edi
	roll	$12,%ebx
	movl	16(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,52(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,28(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	movl	48(%esp),%edx
	roll	$12,%ebp
	movl	128(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,56(%esp)
	xorl	%esi,%ebp
	roll	$7,%ebp
	decl	%ebx
	jnz	L004loop
	/*
	 * Rounds done. Add the input state back in, starting with the words
	 * still held in registers (words 0, 4, 8, 9), and branch to the
	 * byte-wise tail when fewer than 64 bytes remain.
	 */
	movl	160(%esp),%ebx
	addl	$1634760805,%eax
	addl	80(%esp),%ebp
	addl	96(%esp),%ecx
	addl	100(%esp),%esi
	cmpl	$64,%ebx
	jb	L005tail
	/* Full block: %ebx = source, %eax = destination; XOR and store. */
	movl	156(%esp),%ebx
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	xorl	(%ebx),%eax
	xorl	16(%ebx),%ebp
	movl	%eax,(%esp)
	movl	152(%esp),%eax
	xorl	32(%ebx),%ecx
	xorl	36(%ebx),%esi
	xorl	48(%ebx),%edx
	xorl	56(%ebx),%edi
	movl	%ebp,16(%eax)
	movl	%ecx,32(%eax)
	movl	%esi,36(%eax)
	movl	%edx,48(%eax)
	movl	%edi,56(%eax)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	xorl	4(%ebx),%ebp
	xorl	8(%ebx),%ecx
	xorl	12(%ebx),%esi
	xorl	20(%ebx),%edx
	xorl	24(%ebx),%edi
	movl	%ebp,4(%eax)
	movl	%ecx,8(%eax)
	movl	%esi,12(%eax)
	movl	%edx,20(%eax)
	movl	%edi,24(%eax)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	xorl	28(%ebx),%ebp
	xorl	40(%ebx),%ecx
	xorl	44(%ebx),%esi
	xorl	52(%ebx),%edx
	xorl	60(%ebx),%edi
	leal	64(%ebx),%ebx
	movl	%ebp,28(%eax)
	movl	(%esp),%ebp
	movl	%ecx,40(%eax)
	movl	160(%esp),%ecx
	movl	%esi,44(%eax)
	movl	%edx,52(%eax)
	movl	%edi,60(%eax)
	movl	%ebp,(%eax)
	leal	64(%eax),%eax
	/* 64 bytes consumed; loop while any input remains. */
	subl	$64,%ecx
	jnz	L003outer_loop
	jmp	L006done
L005tail:
	/*
	 * Fewer than 64 bytes remain (%ebx = remaining count). Finish the
	 * feed-forward additions and store the complete 64-byte keystream
	 * block at 0..60(%esp) so it can be consumed one byte at a time.
	 */
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	movl	%eax,(%esp)
	movl	%ebp,16(%esp)
	movl	%ecx,32(%esp)
	movl	%esi,36(%esp)
	movl	%edx,48(%esp)
	movl	%edi,56(%esp)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	movl	%ebp,4(%esp)
	movl	%ecx,8(%esp)
	movl	%esi,12(%esp)
	movl	%edx,20(%esp)
	movl	%edi,24(%esp)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	movl	%ebp,28(%esp)
	movl	156(%esp),%ebp
	movl	%ecx,40(%esp)
	movl	152(%esp),%ecx
	movl	%esi,44(%esp)
	xorl	%esi,%esi
	movl	%edx,52(%esp)
	movl	%edi,60(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
L007tail_loop:
	/* %ebp = source, %ecx = destination, %esi = byte index, %ebx = count. */
	movb	(%esi,%ebp,1),%al
	movb	(%esp,%esi,1),%dl
	leal	1(%esi),%esi
	xorb	%dl,%al
	movb	%al,-1(%ecx,%esi,1)
	decl	%ebx
	jnz	L007tail_loop
L006done:
	addl	$132,%esp
L000no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
/*
 * _ChaCha20_ssse3 -- SSE2/SSSE3 ChaCha20 path.
 *
 * Reads the same five stack arguments as _ChaCha20_ctr32 (after the four
 * pushes: 20 = destination, 24 = source, 28 = length, 32 = pointer to 32
 * bytes, 36 = pointer to 16 bytes). _ChaCha20_ctr32 jumps to
 * Lssse3_shortcut after doing its own four pushes.
 *
 * NOTE(review): the code after Lssse3_shortcut addresses Lssse3_data
 * relative to Lpic_point through %eax, and nothing between the
 * _ChaCha20_ssse3 label and that use sets %eax. Entering through this
 * symbol therefore appears to rely on the caller having left the address
 * of Lpic_point in %eax -- confirm before calling it directly.
 *
 * Frame (64-byte aligned %esp after the prologue):
 *   0..255    working state for the 4-block loop, addressed as -128..112(%ebx)
 *             with %ebx = %esp+128; the 1-block loop reuses 0..63(%esp)
 *   256..511  per-word broadcast copy of the input state, addressed as
 *             -128..112(%ebp) with %ebp = %esp+384
 *   512       caller's %esp, restored at L011done
 *   516, 520  saved argument 3 and argument 4 pointers
 *
 * Inputs of 256 bytes or more are processed four 64-byte blocks at a time
 * (L009outer_loop); whatever is left, or any shorter input, goes through
 * the one-block loop (L0081x onwards) and a byte-wise tail.
 */
.globl	_ChaCha20_ssse3
.private_extern	_ChaCha20_ssse3
.align	4
_ChaCha20_ssse3:
L_ChaCha20_ssse3_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
Lssse3_shortcut:
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	32(%esp),%edx
	movl	36(%esp),%ebx
	/* Keep the incoming %esp so the frame can be dropped in one move. */
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp
	movl	%ebp,512(%esp)
	/* %eax = address of Lssse3_data (requires %eax == Lpic_point here). */
	leal	Lssse3_data-Lpic_point(%eax),%eax
	movdqu	(%ebx),%xmm3
	cmpl	$256,%ecx
	jb	L0081x
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	/* Bias the length by -256 so the loop can test the borrow flag. */
	subl	$256,%ecx
	leal	384(%esp),%ebp
	/*
	 * Broadcast every state dword across the four lanes of an XMM register
	 * (pshufd $0/$85/$170/$255) and park the results around %ebp. The
	 * counter lanes get +0,+1,+2,+3 from 48(%eax) and -4 from 64(%eax);
	 * the loop adds 4 back at the top of every pass.
	 */
	movdqu	(%edx),%xmm7
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	paddd	48(%eax),%xmm0
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	psubd	64(%eax),%xmm0
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,64(%ebp)
	movdqa	%xmm1,80(%ebp)
	movdqa	%xmm2,96(%ebp)
	movdqa	%xmm3,112(%ebp)
	movdqu	16(%edx),%xmm3
	movdqa	%xmm4,-64(%ebp)
	movdqa	%xmm5,-48(%ebp)
	movdqa	%xmm6,-32(%ebp)
	movdqa	%xmm7,-16(%ebp)
	movdqa	32(%eax),%xmm7
	leal	128(%esp),%ebx
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,(%ebp)
	movdqa	%xmm1,16(%ebp)
	movdqa	%xmm2,32(%ebp)
	movdqa	%xmm3,48(%ebp)
	movdqa	%xmm4,-128(%ebp)
	movdqa	%xmm5,-112(%ebp)
	movdqa	%xmm6,-96(%ebp)
	movdqa	%xmm7,-80(%ebp)
	/* Bias both data pointers by +128 so -128/-64/0/64 displacements fit. */
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	L009outer_loop
.align	4,0x90
L009outer_loop:
	/*
	 * Copy the broadcast input state from the %ebp area into the %ebx
	 * working area. Rows that start the round loop in registers
	 * (%xmm0, %xmm3, %xmm4, %xmm5, %xmm6) are loaded instead of copied.
	 */
	movdqa	-112(%ebp),%xmm1
	movdqa	-96(%ebp),%xmm2
	movdqa	-80(%ebp),%xmm3
	movdqa	-48(%ebp),%xmm5
	movdqa	-32(%ebp),%xmm6
	movdqa	-16(%ebp),%xmm7
	movdqa	%xmm1,-112(%ebx)
	movdqa	%xmm2,-96(%ebx)
	movdqa	%xmm3,-80(%ebx)
	movdqa	%xmm5,-48(%ebx)
	movdqa	%xmm6,-32(%ebx)
	movdqa	%xmm7,-16(%ebx)
	movdqa	32(%ebp),%xmm2
	movdqa	48(%ebp),%xmm3
	movdqa	64(%ebp),%xmm4
	movdqa	80(%ebp),%xmm5
	movdqa	96(%ebp),%xmm6
	movdqa	112(%ebp),%xmm7
	/* Advance the four per-lane block counters by 4 and save them. */
	paddd	64(%eax),%xmm4
	movdqa	%xmm2,32(%ebx)
	movdqa	%xmm3,48(%ebx)
	movdqa	%xmm4,64(%ebx)
	movdqa	%xmm5,80(%ebx)
	movdqa	%xmm6,96(%ebx)
	movdqa	%xmm7,112(%ebx)
	movdqa	%xmm4,64(%ebp)
	movdqa	-128(%ebp),%xmm0
	movdqa	%xmm4,%xmm6
	movdqa	-64(%ebp),%xmm3
	movdqa	(%ebp),%xmm4
	movdqa	16(%ebp),%xmm5
	movl	$10,%edx
	nop
.align	4,0x90
L010loop:
	/*
	 * 10 iterations. Rotations by 16 and 8 use pshufb with the byte masks
	 * at 0(%eax) and 16(%eax); rotations by 12 and 7 use a shift pair
	 * combined with por. Loads/stores of neighbouring rows are interleaved
	 * with the arithmetic, so the instruction order matters.
	 */
	paddd	%xmm3,%xmm0
	movdqa	%xmm3,%xmm2
	pxor	%xmm0,%xmm6
	pshufb	(%eax),%xmm6
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-48(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	80(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,64(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-64(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	32(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-32(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	96(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,80(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,16(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-48(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	48(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-16(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	112(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,96(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-32(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	-48(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,%xmm6
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-16(%ebx)
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-32(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	64(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,112(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,32(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-48(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-16(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	80(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,64(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,48(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-32(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	16(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-64(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	96(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,80(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-16(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	64(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,96(%ebx)
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	por	%xmm1,%xmm3
	decl	%edx
	jnz	L010loop
	/* Spill the rows still held in registers back to the working area. */
	movdqa	%xmm3,-64(%ebx)
	movdqa	%xmm4,(%ebx)
	movdqa	%xmm5,16(%ebx)
	movdqa	%xmm6,64(%ebx)
	movdqa	%xmm7,96(%ebx)
	/*
	 * Output, four state rows at a time: add the saved input state,
	 * transpose the 4x4 dword matrix with punpck{l,h}dq / punpck{l,h}qdq
	 * so each register holds 16 consecutive bytes of one block, XOR with
	 * the source and store. The -128/-64/0/64 displacements select the
	 * four 64-byte blocks; both pointers step by 16 after each group.
	 */
	movdqa	-112(%ebx),%xmm1
	movdqa	-96(%ebx),%xmm2
	movdqa	-80(%ebx),%xmm3
	paddd	-128(%ebp),%xmm0
	paddd	-112(%ebp),%xmm1
	paddd	-96(%ebp),%xmm2
	paddd	-80(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	-64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	-48(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	-32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	-16(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	-64(%ebp),%xmm0
	paddd	-48(%ebp),%xmm1
	paddd	-32(%ebp),%xmm2
	paddd	-16(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	16(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	48(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	(%ebp),%xmm0
	paddd	16(%ebp),%xmm1
	paddd	32(%ebp),%xmm2
	paddd	48(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	80(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	96(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	112(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	64(%ebp),%xmm0
	paddd	80(%ebp),%xmm1
	paddd	96(%ebp),%xmm2
	paddd	112(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	/* 3*16 + 208 = 256: pointers now address the next 256-byte chunk. */
	leal	208(%esi),%esi
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm5
	pxor	%xmm2,%xmm6
	pxor	%xmm3,%xmm7
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	L009outer_loop
	/* Undo the length bias; finish if the input was a multiple of 256. */
	addl	$256,%ecx
	jz	L011done
	/*
	 * Prepare the one-block path for the leftover bytes: remove the +128
	 * pointer bias and rebuild the 16-byte counter row as (lane-0 counter
	 * of the last pass + 4) combined with the upper three dwords of the
	 * original argument-4 block (mask at 112(%eax)).
	 */
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	movd	64(%ebp),%xmm2
	movdqu	(%ebx),%xmm3
	paddd	96(%eax),%xmm2
	pand	112(%eax),%xmm3
	por	%xmm2,%xmm3
L0081x:
	/*
	 * One block at a time. Rows: %xmm0 = constants, %xmm1/%xmm2 = the 32
	 * bytes behind argument 3, %xmm3 = counter row; %xmm6/%xmm7 hold the
	 * pshufb masks for rotate-by-16 / rotate-by-8. The input state is kept
	 * at 0..63(%esp) for the final addition.
	 */
	movdqa	32(%eax),%xmm0
	movdqu	(%edx),%xmm1
	movdqu	16(%edx),%xmm2
	movdqa	(%eax),%xmm6
	movdqa	16(%eax),%xmm7
	/* NOTE(review): this store is overwritten by the movdqa to 48(%esp)
	   four instructions below before anything reads it. */
	movl	%ebp,48(%esp)
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	L012loop1x
.align	4,0x90
L013outer1x:
	/* Next block: reload the saved state and add 1 to the counter dword. */
	movdqa	80(%eax),%xmm3
	movdqa	(%esp),%xmm0
	movdqa	16(%esp),%xmm1
	movdqa	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	movl	$10,%edx
	movdqa	%xmm3,48(%esp)
	jmp	L012loop1x
.align	4,0x90
L012loop1x:
	/*
	 * The .byte sequences encode pshufb %xmm6,%xmm3 (…,222) and
	 * pshufb %xmm7,%xmm3 (…,223). The pshufd triples between the two
	 * halves rotate the rows of the state and then rotate them back.
	 */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	L012loop1x
	/* Add the saved input state to obtain the keystream block. */
	paddd	(%esp),%xmm0
	paddd	16(%esp),%xmm1
	paddd	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	cmpl	$64,%ecx
	jb	L014tail
	movdqu	(%esi),%xmm4
	movdqu	16(%esi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%esi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%esi),%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3
	leal	64(%esi),%esi
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	L013outer1x
	jmp	L011done
L014tail:
	/* Fewer than 64 bytes left: spill the keystream and XOR byte by byte. */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
L015tail_loop:
	/* %ebp = byte index, %ecx = bytes remaining. */
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	L015tail_loop
L011done:
	/* Restore the caller's stack pointer saved in the prologue. */
	movl	512(%esp),%esp
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
/*
 * Constant table for the SSSE3 code, 64-byte aligned and addressed as
 * offsets from Lssse3_data:
 *     0  pshufb mask rotating every dword left by 16 bits
 *    16  pshufb mask rotating every dword left by 8 bits
 *    32  the four constant dwords ("expand 32-byte k")
 *    48  0,1,2,3   per-lane counter offsets for the 4-block setup
 *    64  4,4,4,4   counter step for the 4-block loop
 *    80  1,0,0,0   counter step for the 1-block loop
 *    96  4,0,0,0   counter fix-up when leaving the 4-block loop
 *   112  0,-1,-1,-1  mask keeping the upper three dwords of the counter row
 * The bytes after the second alignment are the NUL-terminated ASCII string
 * "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>".
 */
.align	6,0x90
Lssse3_data:
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
.long	1634760805,857760878,2036477234,1797285236
.long	0,1,2,3
.long	4,4,4,4
.long	1,0,0,0
.long	4,0,0,0
.long	0,-1,-1,-1
.align	6,0x90
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0
/*
 * Non-lazy symbol pointer for _OPENSSL_ia32cap_P. _ChaCha20_ctr32 loads
 * this slot PC-relatively to reach the capability words it tests; the
 * .long 0 placeholder is expected to be filled in at load time.
 */
.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_OPENSSL_ia32cap_P$non_lazy_ptr:
.indirect_symbol	_OPENSSL_ia32cap_P
.long	0
#endif
969#endif