/* ghash-x86.S — generated GHASH (GCM) acceleration for 32-bit x86.
 * GAS/AT&T syntax, ELF, position-independent (call/pop PIC pattern).
 * Provides 4-bit MMX and PCLMULQDQ ("clmul") implementations. */
#if defined(__i386__)
.text
/*
 * void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16])
 * ABI:   i386 cdecl; AT&T syntax.
 * In:    20(%esp) = Xi (16-byte GHASH state, big-endian),
 *        24(%esp) = Htable (4-bit multiplication table for H).
 * Out:   Xi  = Xi * H in GF(2^128), written back in place.
 * Clobb: eax, ecx, edx, mm0-mm2, flags (callee-saved regs push/popped).
 * Uses the .Lrem_4bit reduction table, located PC-relatively for PIC.
 */
.globl gcm_gmult_4bit_mmx
.hidden gcm_gmult_4bit_mmx
.type gcm_gmult_4bit_mmx,@function
.align 16
gcm_gmult_4bit_mmx:
.L_gcm_gmult_4bit_mmx_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	/* PIC: compute address of .Lrem_4bit via call/pop. */
	call .L000pic_point
.L000pic_point:
	popl %eax
	leal .Lrem_4bit-.L000pic_point(%eax),%eax
	/* Start from the last byte of Xi; split into low/high nibbles. */
	movzbl 15(%edi),%ebx
	xorl %ecx,%ecx
	movl %ebx,%edx
	movb %dl,%cl
	movl $14,%ebp
	shlb $4,%cl
	andl $240,%edx
	movq 8(%esi,%ecx,1),%mm0
	movq (%esi,%ecx,1),%mm1
	movd %mm0,%ebx
	jmp .L001mmx_loop
.align 16
.L001mmx_loop:
	/* Two table lookups (one per nibble) per input byte, with
	 * 4-bit shift/reduce between them; ebp counts bytes 14..0. */
	psrlq $4,%mm0
	andl $15,%ebx
	movq %mm1,%mm2
	psrlq $4,%mm1
	pxor 8(%esi,%edx,1),%mm0
	movb (%edi,%ebp,1),%cl
	psllq $60,%mm2
	pxor (%eax,%ebx,8),%mm1
	decl %ebp
	movd %mm0,%ebx
	pxor (%esi,%edx,1),%mm1
	movl %ecx,%edx
	pxor %mm2,%mm0
	js .L002mmx_break
	shlb $4,%cl
	andl $15,%ebx
	psrlq $4,%mm0
	andl $240,%edx
	movq %mm1,%mm2
	psrlq $4,%mm1
	pxor 8(%esi,%ecx,1),%mm0
	psllq $60,%mm2
	pxor (%eax,%ebx,8),%mm1
	movd %mm0,%ebx
	pxor (%esi,%ecx,1),%mm1
	pxor %mm2,%mm0
	jmp .L001mmx_loop
.align 16
.L002mmx_break:
	/* Final two nibbles, then byte-swap the result back into Xi. */
	shlb $4,%cl
	andl $15,%ebx
	psrlq $4,%mm0
	andl $240,%edx
	movq %mm1,%mm2
	psrlq $4,%mm1
	pxor 8(%esi,%ecx,1),%mm0
	psllq $60,%mm2
	pxor (%eax,%ebx,8),%mm1
	movd %mm0,%ebx
	pxor (%esi,%ecx,1),%mm1
	pxor %mm2,%mm0
	psrlq $4,%mm0
	andl $15,%ebx
	movq %mm1,%mm2
	psrlq $4,%mm1
	pxor 8(%esi,%edx,1),%mm0
	psllq $60,%mm2
	pxor (%eax,%ebx,8),%mm1
	movd %mm0,%ebx
	pxor (%esi,%edx,1),%mm1
	pxor %mm2,%mm0
	psrlq $32,%mm0
	movd %mm1,%edx
	psrlq $32,%mm1
	movd %mm0,%ecx
	movd %mm1,%ebp
	bswap %ebx
	bswap %edx
	bswap %ecx
	bswap %ebp
	emms
	movl %ebx,12(%edi)
	movl %edx,4(%edi)
	movl %ecx,8(%edi)
	movl %ebp,(%edi)
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size gcm_gmult_4bit_mmx,.-.L_gcm_gmult_4bit_mmx_begin
/*
 * void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16],
 *                         const u8 *inp, size_t len)
 * ABI:   i386 cdecl; AT&T syntax.
 * In:    20(%esp) = Xi, 24(%esp) = Htable, 28(%esp) = inp,
 *        32(%esp) = len (multiple of 16).
 * Out:   Xi = (...((Xi ^ inp[0]) * H ^ inp[1]) * H ...) * H, in place.
 * Clobb: eax-edx, mm0-mm7, flags; saved esp restored from the frame.
 * Builds a 64-byte-aligned on-stack copy of Htable (pre-shifted by 4 bits)
 * plus a nibble table, then processes one 16-byte block per .L004outer
 * iteration using the .Lrem_8bit reduction table (PIC via call/pop).
 */
.globl gcm_ghash_4bit_mmx
.hidden gcm_ghash_4bit_mmx
.type gcm_ghash_4bit_mmx,@function
.align 16
gcm_ghash_4bit_mmx:
.L_gcm_ghash_4bit_mmx_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%eax
	movl 24(%esp),%ebx
	movl 28(%esp),%ecx
	movl 32(%esp),%edx
	movl %esp,%ebp
	call .L003pic_point
.L003pic_point:
	popl %esi
	leal .Lrem_8bit-.L003pic_point(%esi),%esi
	/* Carve out a 64-byte-aligned scratch frame; stash Xi, end
	 * pointer and the original esp above it. */
	subl $544,%esp
	andl $-64,%esp
	subl $16,%esp
	addl %ecx,%edx
	movl %eax,544(%esp)
	movl %edx,552(%esp)
	movl %ebp,556(%esp)
	/* Expand Htable into two on-stack tables (at 144(%esp) and
	 * 400(%esp)) plus a 16-byte nibble table at (%esp). */
	addl $128,%ebx
	leal 144(%esp),%edi
	leal 400(%esp),%ebp
	movl -120(%ebx),%edx
	movq -120(%ebx),%mm0
	movq -128(%ebx),%mm3
	shll $4,%edx
	movb %dl,(%esp)
	movl -104(%ebx),%edx
	movq -104(%ebx),%mm2
	movq -112(%ebx),%mm5
	movq %mm0,-128(%edi)
	psrlq $4,%mm0
	movq %mm3,(%edi)
	movq %mm3,%mm7
	psrlq $4,%mm3
	shll $4,%edx
	movb %dl,1(%esp)
	movl -88(%ebx),%edx
	movq -88(%ebx),%mm1
	psllq $60,%mm7
	movq -96(%ebx),%mm4
	por %mm7,%mm0
	movq %mm2,-120(%edi)
	psrlq $4,%mm2
	movq %mm5,8(%edi)
	movq %mm5,%mm6
	movq %mm0,-128(%ebp)
	psrlq $4,%mm5
	movq %mm3,(%ebp)
	shll $4,%edx
	movb %dl,2(%esp)
	movl -72(%ebx),%edx
	movq -72(%ebx),%mm0
	psllq $60,%mm6
	movq -80(%ebx),%mm3
	por %mm6,%mm2
	movq %mm1,-112(%edi)
	psrlq $4,%mm1
	movq %mm4,16(%edi)
	movq %mm4,%mm7
	movq %mm2,-120(%ebp)
	psrlq $4,%mm4
	movq %mm5,8(%ebp)
	shll $4,%edx
	movb %dl,3(%esp)
	movl -56(%ebx),%edx
	movq -56(%ebx),%mm2
	psllq $60,%mm7
	movq -64(%ebx),%mm5
	por %mm7,%mm1
	movq %mm0,-104(%edi)
	psrlq $4,%mm0
	movq %mm3,24(%edi)
	movq %mm3,%mm6
	movq %mm1,-112(%ebp)
	psrlq $4,%mm3
	movq %mm4,16(%ebp)
	shll $4,%edx
	movb %dl,4(%esp)
	movl -40(%ebx),%edx
	movq -40(%ebx),%mm1
	psllq $60,%mm6
	movq -48(%ebx),%mm4
	por %mm6,%mm0
	movq %mm2,-96(%edi)
	psrlq $4,%mm2
	movq %mm5,32(%edi)
	movq %mm5,%mm7
	movq %mm0,-104(%ebp)
	psrlq $4,%mm5
	movq %mm3,24(%ebp)
	shll $4,%edx
	movb %dl,5(%esp)
	movl -24(%ebx),%edx
	movq -24(%ebx),%mm0
	psllq $60,%mm7
	movq -32(%ebx),%mm3
	por %mm7,%mm2
	movq %mm1,-88(%edi)
	psrlq $4,%mm1
	movq %mm4,40(%edi)
	movq %mm4,%mm6
	movq %mm2,-96(%ebp)
	psrlq $4,%mm4
	movq %mm5,32(%ebp)
	shll $4,%edx
	movb %dl,6(%esp)
	movl -8(%ebx),%edx
	movq -8(%ebx),%mm2
	psllq $60,%mm6
	movq -16(%ebx),%mm5
	por %mm6,%mm1
	movq %mm0,-80(%edi)
	psrlq $4,%mm0
	movq %mm3,48(%edi)
	movq %mm3,%mm7
	movq %mm1,-88(%ebp)
	psrlq $4,%mm3
	movq %mm4,40(%ebp)
	shll $4,%edx
	movb %dl,7(%esp)
	movl 8(%ebx),%edx
	movq 8(%ebx),%mm1
	psllq $60,%mm7
	movq (%ebx),%mm4
	por %mm7,%mm0
	movq %mm2,-72(%edi)
	psrlq $4,%mm2
	movq %mm5,56(%edi)
	movq %mm5,%mm6
	movq %mm0,-80(%ebp)
	psrlq $4,%mm5
	movq %mm3,48(%ebp)
	shll $4,%edx
	movb %dl,8(%esp)
	movl 24(%ebx),%edx
	movq 24(%ebx),%mm0
	psllq $60,%mm6
	movq 16(%ebx),%mm3
	por %mm6,%mm2
	movq %mm1,-64(%edi)
	psrlq $4,%mm1
	movq %mm4,64(%edi)
	movq %mm4,%mm7
	movq %mm2,-72(%ebp)
	psrlq $4,%mm4
	movq %mm5,56(%ebp)
	shll $4,%edx
	movb %dl,9(%esp)
	movl 40(%ebx),%edx
	movq 40(%ebx),%mm2
	psllq $60,%mm7
	movq 32(%ebx),%mm5
	por %mm7,%mm1
	movq %mm0,-56(%edi)
	psrlq $4,%mm0
	movq %mm3,72(%edi)
	movq %mm3,%mm6
	movq %mm1,-64(%ebp)
	psrlq $4,%mm3
	movq %mm4,64(%ebp)
	shll $4,%edx
	movb %dl,10(%esp)
	movl 56(%ebx),%edx
	movq 56(%ebx),%mm1
	psllq $60,%mm6
	movq 48(%ebx),%mm4
	por %mm6,%mm0
	movq %mm2,-48(%edi)
	psrlq $4,%mm2
	movq %mm5,80(%edi)
	movq %mm5,%mm7
	movq %mm0,-56(%ebp)
	psrlq $4,%mm5
	movq %mm3,72(%ebp)
	shll $4,%edx
	movb %dl,11(%esp)
	movl 72(%ebx),%edx
	movq 72(%ebx),%mm0
	psllq $60,%mm7
	movq 64(%ebx),%mm3
	por %mm7,%mm2
	movq %mm1,-40(%edi)
	psrlq $4,%mm1
	movq %mm4,88(%edi)
	movq %mm4,%mm6
	movq %mm2,-48(%ebp)
	psrlq $4,%mm4
	movq %mm5,80(%ebp)
	shll $4,%edx
	movb %dl,12(%esp)
	movl 88(%ebx),%edx
	movq 88(%ebx),%mm2
	psllq $60,%mm6
	movq 80(%ebx),%mm5
	por %mm6,%mm1
	movq %mm0,-32(%edi)
	psrlq $4,%mm0
	movq %mm3,96(%edi)
	movq %mm3,%mm7
	movq %mm1,-40(%ebp)
	psrlq $4,%mm3
	movq %mm4,88(%ebp)
	shll $4,%edx
	movb %dl,13(%esp)
	movl 104(%ebx),%edx
	movq 104(%ebx),%mm1
	psllq $60,%mm7
	movq 96(%ebx),%mm4
	por %mm7,%mm0
	movq %mm2,-24(%edi)
	psrlq $4,%mm2
	movq %mm5,104(%edi)
	movq %mm5,%mm6
	movq %mm0,-32(%ebp)
	psrlq $4,%mm5
	movq %mm3,96(%ebp)
	shll $4,%edx
	movb %dl,14(%esp)
	movl 120(%ebx),%edx
	movq 120(%ebx),%mm0
	psllq $60,%mm6
	movq 112(%ebx),%mm3
	por %mm6,%mm2
	movq %mm1,-16(%edi)
	psrlq $4,%mm1
	movq %mm4,112(%edi)
	movq %mm4,%mm7
	movq %mm2,-24(%ebp)
	psrlq $4,%mm4
	movq %mm5,104(%ebp)
	shll $4,%edx
	movb %dl,15(%esp)
	psllq $60,%mm7
	por %mm7,%mm1
	movq %mm0,-8(%edi)
	psrlq $4,%mm0
	movq %mm3,120(%edi)
	movq %mm3,%mm6
	movq %mm1,-16(%ebp)
	psrlq $4,%mm3
	movq %mm4,112(%ebp)
	psllq $60,%mm6
	por %mm6,%mm0
	movq %mm0,-8(%ebp)
	movq %mm3,120(%ebp)
	/* Load current Xi: mm6 = low half, ebx:edx = high half. */
	movq (%eax),%mm6
	movl 8(%eax),%ebx
	movl 12(%eax),%edx
.align 16
.L004outer:
	/* Xi ^= next 16-byte input block; then multiply by H one byte
	 * (two nibbles) at a time, reducing via .Lrem_8bit (%esi). */
	xorl 12(%ecx),%edx
	xorl 8(%ecx),%ebx
	pxor (%ecx),%mm6
	leal 16(%ecx),%ecx
	movl %ebx,536(%esp)
	movq %mm6,528(%esp)
	movl %ecx,548(%esp)
	xorl %eax,%eax
	roll $8,%edx
	movb %dl,%al
	movl %eax,%ebp
	andb $15,%al
	shrl $4,%ebp
	pxor %mm0,%mm0
	roll $8,%edx
	pxor %mm1,%mm1
	pxor %mm2,%mm2
	movq 16(%esp,%eax,8),%mm7
	movq 144(%esp,%eax,8),%mm6
	movb %dl,%al
	movd %mm7,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	shrl $4,%edi
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	xorb (%esp,%ebp,1),%bl
	movb %dl,%al
	movd %mm7,%ecx
	movzbl %bl,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%ebp
	psrlq $8,%mm6
	pxor 272(%esp,%edi,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	shrl $4,%ebp
	pinsrw $2,(%esi,%ebx,2),%mm2
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%edi,8),%mm6
	xorb (%esp,%edi,1),%cl
	movb %dl,%al
	movl 536(%esp),%edx
	movd %mm7,%ebx
	movzbl %cl,%ecx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm2,%mm6
	shrl $4,%edi
	pinsrw $2,(%esi,%ecx,2),%mm1
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	xorb (%esp,%ebp,1),%bl
	movb %dl,%al
	movd %mm7,%ecx
	movzbl %bl,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%ebp
	psrlq $8,%mm6
	pxor 272(%esp,%edi,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm1,%mm6
	shrl $4,%ebp
	pinsrw $2,(%esi,%ebx,2),%mm0
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%edi,8),%mm6
	xorb (%esp,%edi,1),%cl
	movb %dl,%al
	movd %mm7,%ebx
	movzbl %cl,%ecx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm0,%mm6
	shrl $4,%edi
	pinsrw $2,(%esi,%ecx,2),%mm2
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	xorb (%esp,%ebp,1),%bl
	movb %dl,%al
	movd %mm7,%ecx
	movzbl %bl,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%ebp
	psrlq $8,%mm6
	pxor 272(%esp,%edi,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm2,%mm6
	shrl $4,%ebp
	pinsrw $2,(%esi,%ebx,2),%mm1
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%edi,8),%mm6
	xorb (%esp,%edi,1),%cl
	movb %dl,%al
	movl 532(%esp),%edx
	movd %mm7,%ebx
	movzbl %cl,%ecx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm1,%mm6
	shrl $4,%edi
	pinsrw $2,(%esi,%ecx,2),%mm0
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	xorb (%esp,%ebp,1),%bl
	movb %dl,%al
	movd %mm7,%ecx
	movzbl %bl,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%ebp
	psrlq $8,%mm6
	pxor 272(%esp,%edi,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm0,%mm6
	shrl $4,%ebp
	pinsrw $2,(%esi,%ebx,2),%mm2
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%edi,8),%mm6
	xorb (%esp,%edi,1),%cl
	movb %dl,%al
	movd %mm7,%ebx
	movzbl %cl,%ecx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm2,%mm6
	shrl $4,%edi
	pinsrw $2,(%esi,%ecx,2),%mm1
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	xorb (%esp,%ebp,1),%bl
	movb %dl,%al
	movd %mm7,%ecx
	movzbl %bl,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%ebp
	psrlq $8,%mm6
	pxor 272(%esp,%edi,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm1,%mm6
	shrl $4,%ebp
	pinsrw $2,(%esi,%ebx,2),%mm0
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%edi,8),%mm6
	xorb (%esp,%edi,1),%cl
	movb %dl,%al
	movl 528(%esp),%edx
	movd %mm7,%ebx
	movzbl %cl,%ecx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm0,%mm6
	shrl $4,%edi
	pinsrw $2,(%esi,%ecx,2),%mm2
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	xorb (%esp,%ebp,1),%bl
	movb %dl,%al
	movd %mm7,%ecx
	movzbl %bl,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%ebp
	psrlq $8,%mm6
	pxor 272(%esp,%edi,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm2,%mm6
	shrl $4,%ebp
	pinsrw $2,(%esi,%ebx,2),%mm1
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%edi,8),%mm6
	xorb (%esp,%edi,1),%cl
	movb %dl,%al
	movd %mm7,%ebx
	movzbl %cl,%ecx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm1,%mm6
	shrl $4,%edi
	pinsrw $2,(%esi,%ecx,2),%mm0
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	xorb (%esp,%ebp,1),%bl
	movb %dl,%al
	movd %mm7,%ecx
	movzbl %bl,%ebx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%ebp
	psrlq $8,%mm6
	pxor 272(%esp,%edi,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm0,%mm6
	shrl $4,%ebp
	pinsrw $2,(%esi,%ebx,2),%mm2
	pxor 16(%esp,%eax,8),%mm7
	roll $8,%edx
	pxor 144(%esp,%eax,8),%mm6
	pxor %mm3,%mm7
	pxor 400(%esp,%edi,8),%mm6
	xorb (%esp,%edi,1),%cl
	movb %dl,%al
	movl 524(%esp),%edx
	movd %mm7,%ebx
	movzbl %cl,%ecx
	psrlq $8,%mm7
	movq %mm6,%mm3
	movl %eax,%edi
	psrlq $8,%mm6
	pxor 272(%esp,%ebp,8),%mm7
	andb $15,%al
	psllq $56,%mm3
	pxor %mm2,%mm6
	shrl $4,%edi
	pinsrw $2,(%esi,%ecx,2),%mm1
	pxor 16(%esp,%eax,8),%mm7
	pxor 144(%esp,%eax,8),%mm6
	xorb (%esp,%ebp,1),%bl
	pxor %mm3,%mm7
	pxor 400(%esp,%ebp,8),%mm6
	movzbl %bl,%ebx
	pxor %mm2,%mm2
	psllq $4,%mm1
	movd %mm7,%ecx
	psrlq $4,%mm7
	movq %mm6,%mm3
	psrlq $4,%mm6
	shll $4,%ecx
	pxor 16(%esp,%edi,8),%mm7
	psllq $60,%mm3
	movzbl %cl,%ecx
	pxor %mm3,%mm7
	pxor 144(%esp,%edi,8),%mm6
	pinsrw $2,(%esi,%ebx,2),%mm0
	pxor %mm1,%mm6
	movd %mm7,%edx
	pinsrw $3,(%esi,%ecx,2),%mm2
	psllq $12,%mm0
	pxor %mm0,%mm6
	psrlq $32,%mm7
	pxor %mm2,%mm6
	movl 548(%esp),%ecx
	movd %mm7,%ebx
	movq %mm6,%mm3
	/* Byte-swap the 64-bit half (psllw/psrlw/por + pshufw). */
	psllw $8,%mm6
	psrlw $8,%mm3
	por %mm3,%mm6
	bswap %edx
	pshufw $27,%mm6,%mm6
	bswap %ebx
	cmpl 552(%esp),%ecx
	jne .L004outer
	/* Store final Xi, restore caller's esp and registers. */
	movl 544(%esp),%eax
	movl %edx,12(%eax)
	movl %ebx,8(%eax)
	movq %mm6,(%eax)
	movl 556(%esp),%esp
	emms
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
/*
 * void gcm_init_clmul(u128 Htable[16], const u64 H[2])
 * ABI:   i386 cdecl; AT&T syntax; requires PCLMULQDQ + SSSE3.
 * In:    4(%esp) = Htable (output), 8(%esp) = H (hash subkey).
 * Out:   Htable[0] = H<<1 (GHASH-adjusted), Htable[1] = H^2,
 *        Htable[2] = Karatsuba precomputation; rest untouched here.
 * Clobb: eax, ecx, edx, xmm0-xmm5, flags.
 * .byte 102,15,58,68,... sequences encode pclmulqdq; 102,15,58,15 is
 * palignr — emitted raw so old assemblers accept the file.
 */
.globl gcm_init_clmul
.hidden gcm_init_clmul
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
.L_gcm_init_clmul_begin:
	movl 4(%esp),%edx
	movl 8(%esp),%eax
	call .L005pic
.L005pic:
	popl %ecx
	leal .Lbswap-.L005pic(%ecx),%ecx
	/* H<<1 mod poly: shift left one bit, conditionally xor the
	 * 0xc2... constant (16(%ecx)) if the top bit was set. */
	movdqu (%eax),%xmm2
	pshufd $78,%xmm2,%xmm2
	pshufd $255,%xmm2,%xmm4
	movdqa %xmm2,%xmm3
	psllq $1,%xmm2
	pxor %xmm5,%xmm5
	psrlq $63,%xmm3
	pcmpgtd %xmm4,%xmm5
	pslldq $8,%xmm3
	por %xmm3,%xmm2
	pand 16(%ecx),%xmm5
	pxor %xmm5,%xmm2
	/* Compute H^2 = H*H via Karatsuba carry-less multiply. */
	movdqa %xmm2,%xmm0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pshufd $78,%xmm2,%xmm4
	pxor %xmm0,%xmm3
	pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
	xorps %xmm0,%xmm3
	xorps %xmm1,%xmm3
	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0
	/* Montgomery-style reduction modulo x^128 + x^7 + x^2 + x + 1. */
	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1
	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	/* Store H, H^2 and the combined Karatsuba helper. */
	pshufd $78,%xmm2,%xmm3
	pshufd $78,%xmm0,%xmm4
	pxor %xmm2,%xmm3
	movdqu %xmm2,(%edx)
	pxor %xmm0,%xmm4
	movdqu %xmm0,16(%edx)
.byte 102,15,58,15,227,8
	movdqu %xmm4,32(%edx)
	ret
.size gcm_init_clmul,.-.L_gcm_init_clmul_begin
/*
 * void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16])
 * ABI:   i386 cdecl; AT&T syntax; requires PCLMULQDQ + SSSE3.
 * In:    4(%esp) = Xi (GHASH state), 8(%esp) = Htable from gcm_init_clmul.
 * Out:   Xi = Xi * H in GF(2^128), in place.
 * Clobb: eax, ecx, edx, xmm0-xmm5, flags.
 * .byte 102,15,56,0,197 encodes pshufb %xmm5,%xmm0 (byte swap via
 * .Lbswap); 102,15,58,68,... encodes pclmulqdq.
 */
.globl gcm_gmult_clmul
.hidden gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
.L_gcm_gmult_clmul_begin:
	movl 4(%esp),%eax
	movl 8(%esp),%edx
	call .L006pic
.L006pic:
	popl %ecx
	leal .Lbswap-.L006pic(%ecx),%ecx
	movdqu (%eax),%xmm0
	movdqa (%ecx),%xmm5
	movups (%edx),%xmm2
.byte 102,15,56,0,197
	movups 32(%edx),%xmm4
	/* Karatsuba carry-less multiply Xi * H. */
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
	xorps %xmm0,%xmm3
	xorps %xmm1,%xmm3
	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0
	/* Reduce the 256-bit product modulo the GHASH polynomial. */
	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1
	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.byte 102,15,56,0,197
	movdqu %xmm0,(%eax)
	ret
.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
/*
 * void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
 *                      const u8 *inp, size_t len)
 * ABI:   i386 cdecl; AT&T syntax; requires PCLMULQDQ + SSSE3.
 * In:    20(%esp) = Xi, 24(%esp) = Htable, 28(%esp) = inp,
 *        32(%esp) = len (multiple of 16).
 * Out:   Xi updated in place with all input blocks folded in.
 * Clobb: eax-edx, esi, ebx, xmm0-xmm7, flags.
 * Processes two blocks per .L010mod_loop iteration using H and H^2
 * (Karatsuba), with odd/even tails for the remainder. Raw .byte runs
 * encode pclmulqdq / pshufb for old assemblers.
 */
.globl gcm_ghash_clmul
.hidden gcm_ghash_clmul
.type gcm_ghash_clmul,@function
.align 16
gcm_ghash_clmul:
.L_gcm_ghash_clmul_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%eax
	movl 24(%esp),%edx
	movl 28(%esp),%esi
	movl 32(%esp),%ebx
	call .L007pic
.L007pic:
	popl %ecx
	leal .Lbswap-.L007pic(%ecx),%ecx
	movdqu (%eax),%xmm0
	movdqa (%ecx),%xmm5
	movdqu (%edx),%xmm2
.byte 102,15,56,0,197
	subl $16,%ebx
	jz .L008odd_tail
	/* At least two blocks: prime the 2-block pipeline. */
	movdqu (%esi),%xmm3
	movdqu 16(%esi),%xmm6
.byte 102,15,56,0,221
.byte 102,15,56,0,245
	movdqu 32(%edx),%xmm5
	pxor %xmm3,%xmm0
	pshufd $78,%xmm6,%xmm3
	movdqa %xmm6,%xmm7
	pxor %xmm6,%xmm3
	leal 32(%esi),%esi
.byte 102,15,58,68,242,0
.byte 102,15,58,68,250,17
.byte 102,15,58,68,221,0
	movups 16(%edx),%xmm2
	nop
	subl $32,%ebx
	jbe .L009even_tail
	jmp .L010mod_loop
.align 32
.L010mod_loop:
	/* (Xi ^ blk0) * H^2  xor  blk1 * H, multiply and reduce
	 * interleaved with the next pair's loads for throughput. */
	pshufd $78,%xmm0,%xmm4
	movdqa %xmm0,%xmm1
	pxor %xmm0,%xmm4
	nop
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,229,16
	movups (%edx),%xmm2
	xorps %xmm6,%xmm0
	movdqa (%ecx),%xmm5
	xorps %xmm7,%xmm1
	movdqu (%esi),%xmm7
	pxor %xmm0,%xmm3
	movdqu 16(%esi),%xmm6
	pxor %xmm1,%xmm3
.byte 102,15,56,0,253
	pxor %xmm3,%xmm4
	movdqa %xmm4,%xmm3
	psrldq $8,%xmm4
	pslldq $8,%xmm3
	pxor %xmm4,%xmm1
	pxor %xmm3,%xmm0
.byte 102,15,56,0,245
	pxor %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
.byte 102,15,58,68,242,0
	movups 32(%edx),%xmm5
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1
	pshufd $78,%xmm7,%xmm3
	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm7,%xmm3
	pxor %xmm4,%xmm1
.byte 102,15,58,68,250,17
	movups 16(%edx),%xmm2
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.byte 102,15,58,68,221,0
	leal 32(%esi),%esi
	subl $32,%ebx
	ja .L010mod_loop
.L009even_tail:
	/* Fold in the final pipelined pair, then reduce. */
	pshufd $78,%xmm0,%xmm4
	movdqa %xmm0,%xmm1
	pxor %xmm0,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,229,16
	movdqa (%ecx),%xmm5
	xorps %xmm6,%xmm0
	xorps %xmm7,%xmm1
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3
	pxor %xmm3,%xmm4
	movdqa %xmm4,%xmm3
	psrldq $8,%xmm4
	pslldq $8,%xmm3
	pxor %xmm4,%xmm1
	pxor %xmm3,%xmm0
	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1
	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	testl %ebx,%ebx
	jnz .L011done
	movups (%edx),%xmm2
.L008odd_tail:
	/* One remaining block: (Xi ^ blk) * H, Karatsuba + reduce. */
	movdqu (%esi),%xmm3
.byte 102,15,56,0,221
	pxor %xmm3,%xmm0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pshufd $78,%xmm2,%xmm4
	pxor %xmm0,%xmm3
	pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
	xorps %xmm0,%xmm3
	xorps %xmm1,%xmm3
	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0
	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1
	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.L011done:
.byte 102,15,56,0,197
	movdqu %xmm0,(%eax)
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
/*
 * Constant tables (PC-relative, read-only):
 *  .Lbswap    — pshufb byte-reversal mask, followed by the GHASH
 *               reduction constant (0xc2 in the top byte).
 *  .Lrem_8bit — 256-entry 16-bit reduction table for the MMX path.
 *  .Lrem_4bit — 16-entry reduction table for the 4-bit path.
 * The trailing .byte string is the ASCII banner
 * "GHASH for x86, CRYPTOGAMS by <appro@openssl.org>".
 */
.align 64
.Lbswap:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
.align 64
.Lrem_8bit:
.value 0,450,900,582,1800,1738,1164,1358
.value 3600,4050,3476,3158,2328,2266,2716,2910
.value 7200,7650,8100,7782,6952,6890,6316,6510
.value 4656,5106,4532,4214,5432,5370,5820,6014
.value 14400,14722,15300,14854,16200,16010,15564,15630
.value 13904,14226,13780,13334,12632,12442,13020,13086
.value 9312,9634,10212,9766,9064,8874,8428,8494
.value 10864,11186,10740,10294,11640,11450,12028,12094
.value 28800,28994,29444,29382,30600,30282,29708,30158
.value 32400,32594,32020,31958,31128,30810,31260,31710
.value 27808,28002,28452,28390,27560,27242,26668,27118
.value 25264,25458,24884,24822,26040,25722,26172,26622
.value 18624,18690,19268,19078,20424,19978,19532,19854
.value 18128,18194,17748,17558,16856,16410,16988,17310
.value 21728,21794,22372,22182,21480,21034,20588,20910
.value 23280,23346,22900,22710,24056,23610,24188,24510
.value 57600,57538,57988,58182,58888,59338,58764,58446
.value 61200,61138,60564,60758,59416,59866,60316,59998
.value 64800,64738,65188,65382,64040,64490,63916,63598
.value 62256,62194,61620,61814,62520,62970,63420,63102
.value 55616,55426,56004,56070,56904,57226,56780,56334
.value 55120,54930,54484,54550,53336,53658,54236,53790
.value 50528,50338,50916,50982,49768,50090,49644,49198
.value 52080,51890,51444,51510,52344,52666,53244,52798
.value 37248,36930,37380,37830,38536,38730,38156,38094
.value 40848,40530,39956,40406,39064,39258,39708,39646
.value 36256,35938,36388,36838,35496,35690,35116,35054
.value 33712,33394,32820,33270,33976,34170,34620,34558
.value 43456,43010,43588,43910,44744,44810,44364,44174
.value 42960,42514,42068,42390,41176,41242,41820,41630
.value 46560,46114,46692,47014,45800,45866,45420,45230
.value 48112,47666,47220,47542,48376,48442,49020,48830
.align 64
.Lrem_4bit:
.long 0,0,0,471859200,0,943718400,0,610271232
.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
.byte 0
#endif