# salsa20_pm.s version 20051229
# D. J. Bernstein
# Public domain.
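# Salsa20 is D. J. Bernstein's stream cipher built on a 16-word
# (64-byte) state of 32-bit little-endian words: 4 constant words,
# 8 key words, a 2-word block counter and a 2-word IV.  This file
# provides salsa20_encrypt_bytes, salsa20_keysetup and salsa20_ivsetup
# for i586; the interleaved comments appear to be the qhasm-style
# pseudocode the assembly was generated from.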
#include <linux/linkage.h>

.text

# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
	mov %esp,%eax
	and $31,%eax
	add $256,%eax
	sub %eax,%esp
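	# The prologue above reserves at least 256 bytes of scratch
	# space and rounds %esp down to a 32-byte boundary; %eax keeps
	# the amount subtracted so the epilogue can undo it with a
	# single add.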
	# eax_stack = eax
	movl %eax,80(%esp)
	# ebx_stack = ebx
	movl %ebx,84(%esp)
	# esi_stack = esi
	movl %esi,88(%esp)
	# edi_stack = edi
	movl %edi,92(%esp)
	# ebp_stack = ebp
	movl %ebp,96(%esp)
	# x = arg1
	movl 4(%esp,%eax),%edx
	# m = arg2
	movl 8(%esp,%eax),%esi
	# out = arg3
	movl 12(%esp,%eax),%edi
	# bytes = arg4
	movl 16(%esp,%eax),%ebx
	# bytes -= 0
	sub $0,%ebx
	# goto done if unsigned<=
	jbe ._done
._start:
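	# Load the 16 state words from x and keep a private copy
	# (j0..j15) on the stack; the j words survive the rounds and
	# feed both the final feedforward addition and the next block.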
	# in0 = *(uint32 *) (x + 0)
	movl 0(%edx),%eax
	# in1 = *(uint32 *) (x + 4)
	movl 4(%edx),%ecx
	# in2 = *(uint32 *) (x + 8)
	movl 8(%edx),%ebp
	# j0 = in0
	movl %eax,164(%esp)
	# in3 = *(uint32 *) (x + 12)
	movl 12(%edx),%eax
	# j1 = in1
	movl %ecx,168(%esp)
	# in4 = *(uint32 *) (x + 16)
	movl 16(%edx),%ecx
	# j2 = in2
	movl %ebp,172(%esp)
	# in5 = *(uint32 *) (x + 20)
	movl 20(%edx),%ebp
	# j3 = in3
	movl %eax,176(%esp)
	# in6 = *(uint32 *) (x + 24)
	movl 24(%edx),%eax
	# j4 = in4
	movl %ecx,180(%esp)
	# in7 = *(uint32 *) (x + 28)
	movl 28(%edx),%ecx
	# j5 = in5
	movl %ebp,184(%esp)
	# in8 = *(uint32 *) (x + 32)
	movl 32(%edx),%ebp
	# j6 = in6
	movl %eax,188(%esp)
	# in9 = *(uint32 *) (x + 36)
	movl 36(%edx),%eax
	# j7 = in7
	movl %ecx,192(%esp)
	# in10 = *(uint32 *) (x + 40)
	movl 40(%edx),%ecx
	# j8 = in8
	movl %ebp,196(%esp)
	# in11 = *(uint32 *) (x + 44)
	movl 44(%edx),%ebp
	# j9 = in9
	movl %eax,200(%esp)
	# in12 = *(uint32 *) (x + 48)
	movl 48(%edx),%eax
	# j10 = in10
	movl %ecx,204(%esp)
	# in13 = *(uint32 *) (x + 52)
	movl 52(%edx),%ecx
	# j11 = in11
	movl %ebp,208(%esp)
	# in14 = *(uint32 *) (x + 56)
	movl 56(%edx),%ebp
	# j12 = in12
	movl %eax,212(%esp)
	# in15 = *(uint32 *) (x + 60)
	movl 60(%edx),%eax
	# j13 = in13
	movl %ecx,216(%esp)
	# j14 = in14
	movl %ebp,220(%esp)
	# j15 = in15
	movl %eax,224(%esp)
	# x_backup = x
	movl %edx,64(%esp)
._bytesatleast1:
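	# A tail shorter than 64 bytes is first copied into the scratch
	# buffer at 0(%esp), encrypted there as a full block, and copied
	# back to the real destination (ctarget) afterwards.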
	# bytes - 64
	cmp $64,%ebx
	# goto nocopy if unsigned>=
	jae ._nocopy
	# ctarget = out
	movl %edi,228(%esp)
	# out = &tmp
	leal 0(%esp),%edi
	# i = bytes
	mov %ebx,%ecx
	# while (i) { *out++ = *m++; --i }
	rep movsb
	# out = &tmp
	leal 0(%esp),%edi
	# m = &tmp
	leal 0(%esp),%esi
._nocopy:
	# out_backup = out
	movl %edi,72(%esp)
	# m_backup = m
	movl %esi,68(%esp)
	# bytes_backup = bytes
	movl %ebx,76(%esp)
	# in0 = j0
	movl 164(%esp),%eax
	# in1 = j1
	movl 168(%esp),%ecx
	# in2 = j2
	movl 172(%esp),%edx
	# in3 = j3
	movl 176(%esp),%ebx
	# x0 = in0
	movl %eax,100(%esp)
	# x1 = in1
	movl %ecx,104(%esp)
	# x2 = in2
	movl %edx,108(%esp)
	# x3 = in3
	movl %ebx,112(%esp)
	# in4 = j4
	movl 180(%esp),%eax
	# in5 = j5
	movl 184(%esp),%ecx
	# in6 = j6
	movl 188(%esp),%edx
	# in7 = j7
	movl 192(%esp),%ebx
	# x4 = in4
	movl %eax,116(%esp)
	# x5 = in5
	movl %ecx,120(%esp)
	# x6 = in6
	movl %edx,124(%esp)
	# x7 = in7
	movl %ebx,128(%esp)
	# in8 = j8
	movl 196(%esp),%eax
	# in9 = j9
	movl 200(%esp),%ecx
	# in10 = j10
	movl 204(%esp),%edx
	# in11 = j11
	movl 208(%esp),%ebx
	# x8 = in8
	movl %eax,132(%esp)
	# x9 = in9
	movl %ecx,136(%esp)
	# x10 = in10
	movl %edx,140(%esp)
	# x11 = in11
	movl %ebx,144(%esp)
	# in12 = j12
	movl 212(%esp),%eax
	# in13 = j13
	movl 216(%esp),%ecx
	# in14 = j14
	movl 220(%esp),%edx
	# in15 = j15
	movl 224(%esp),%ebx
	# x12 = in12
	movl %eax,148(%esp)
	# x13 = in13
	movl %ecx,152(%esp)
	# x14 = in14
	movl %edx,156(%esp)
	# x15 = in15
	movl %ebx,160(%esp)
	# i = 20
	mov $20,%ebp
	# p = x0
	movl 100(%esp),%eax
	# s = x5
	movl 120(%esp),%ecx
	# t = x10
	movl 140(%esp),%edx
	# w = x15
	movl 160(%esp),%ebx
._mainloop:
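	# Each pass through this loop performs 4 of the 20 Salsa20
	# rounds; p, s, t and w cache the diagonal words x0, x5, x10
	# and x15 in registers across the rounds.  Reference pseudocode
	# for one quarterround (from the Salsa20 spec):
	#   y1 ^= (y0 + y3) <<< 7
	#   y2 ^= (y1 + y0) <<< 9
	#   y3 ^= (y2 + y1) <<< 13
	#   y0 ^= (y3 + y2) <<< 18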
	# x0 = p
	movl %eax,100(%esp)
	# x10 = t
	movl %edx,140(%esp)
	# p += x12
	addl 148(%esp),%eax
	# x5 = s
	movl %ecx,120(%esp)
	# t += x6
	addl 124(%esp),%edx
	# x15 = w
	movl %ebx,160(%esp)
	# r = x1
	movl 104(%esp),%esi
	# r += s
	add %ecx,%esi
	# v = x11
	movl 144(%esp),%edi
	# v += w
	add %ebx,%edi
	# p <<<= 7
	rol $7,%eax
	# p ^= x4
	xorl 116(%esp),%eax
	# t <<<= 7
	rol $7,%edx
	# t ^= x14
	xorl 156(%esp),%edx
	# r <<<= 7
	rol $7,%esi
	# r ^= x9
	xorl 136(%esp),%esi
	# v <<<= 7
	rol $7,%edi
	# v ^= x3
	xorl 112(%esp),%edi
	# x4 = p
	movl %eax,116(%esp)
	# x14 = t
	movl %edx,156(%esp)
	# p += x0
	addl 100(%esp),%eax
	# x9 = r
	movl %esi,136(%esp)
	# t += x10
	addl 140(%esp),%edx
	# x3 = v
	movl %edi,112(%esp)
	# p <<<= 9
	rol $9,%eax
	# p ^= x8
	xorl 132(%esp),%eax
	# t <<<= 9
	rol $9,%edx
	# t ^= x2
	xorl 108(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 9
	rol $9,%ecx
	# s ^= x13
	xorl 152(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 9
	rol $9,%ebx
	# w ^= x7
	xorl 128(%esp),%ebx
	# x8 = p
	movl %eax,132(%esp)
	# x2 = t
	movl %edx,108(%esp)
	# p += x4
	addl 116(%esp),%eax
	# x13 = s
	movl %ecx,152(%esp)
	# t += x14
	addl 156(%esp),%edx
	# x7 = w
	movl %ebx,128(%esp)
	# p <<<= 13
	rol $13,%eax
	# p ^= x12
	xorl 148(%esp),%eax
	# t <<<= 13
	rol $13,%edx
	# t ^= x6
	xorl 124(%esp),%edx
	# r += s
	add %ecx,%esi
	# r <<<= 13
	rol $13,%esi
	# r ^= x1
	xorl 104(%esp),%esi
	# v += w
	add %ebx,%edi
	# v <<<= 13
	rol $13,%edi
	# v ^= x11
	xorl 144(%esp),%edi
	# x12 = p
	movl %eax,148(%esp)
	# x6 = t
	movl %edx,124(%esp)
	# p += x8
	addl 132(%esp),%eax
	# x1 = r
	movl %esi,104(%esp)
	# t += x2
	addl 108(%esp),%edx
	# x11 = v
	movl %edi,144(%esp)
	# p <<<= 18
	rol $18,%eax
	# p ^= x0
	xorl 100(%esp),%eax
	# t <<<= 18
	rol $18,%edx
	# t ^= x10
	xorl 140(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 18
	rol $18,%ecx
	# s ^= x5
	xorl 120(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 18
	rol $18,%ebx
	# w ^= x15
	xorl 160(%esp),%ebx
	# x0 = p
	movl %eax,100(%esp)
	# x10 = t
	movl %edx,140(%esp)
	# p += x3
	addl 112(%esp),%eax
	# p <<<= 7
	rol $7,%eax
	# x5 = s
	movl %ecx,120(%esp)
	# t += x9
	addl 136(%esp),%edx
	# x15 = w
	movl %ebx,160(%esp)
	# r = x4
	movl 116(%esp),%esi
	# r += s
	add %ecx,%esi
	# v = x14
	movl 156(%esp),%edi
	# v += w
	add %ebx,%edi
	# p ^= x1
	xorl 104(%esp),%eax
	# t <<<= 7
	rol $7,%edx
	# t ^= x11
	xorl 144(%esp),%edx
	# r <<<= 7
	rol $7,%esi
	# r ^= x6
	xorl 124(%esp),%esi
	# v <<<= 7
	rol $7,%edi
	# v ^= x12
	xorl 148(%esp),%edi
	# x1 = p
	movl %eax,104(%esp)
	# x11 = t
	movl %edx,144(%esp)
	# p += x0
	addl 100(%esp),%eax
	# x6 = r
	movl %esi,124(%esp)
	# t += x10
	addl 140(%esp),%edx
	# x12 = v
	movl %edi,148(%esp)
	# p <<<= 9
	rol $9,%eax
	# p ^= x2
	xorl 108(%esp),%eax
	# t <<<= 9
	rol $9,%edx
	# t ^= x8
	xorl 132(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 9
	rol $9,%ecx
	# s ^= x7
	xorl 128(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 9
	rol $9,%ebx
	# w ^= x13
	xorl 152(%esp),%ebx
	# x2 = p
	movl %eax,108(%esp)
	# x8 = t
	movl %edx,132(%esp)
	# p += x1
	addl 104(%esp),%eax
	# x7 = s
	movl %ecx,128(%esp)
	# t += x11
	addl 144(%esp),%edx
	# x13 = w
	movl %ebx,152(%esp)
	# p <<<= 13
	rol $13,%eax
	# p ^= x3
	xorl 112(%esp),%eax
	# t <<<= 13
	rol $13,%edx
	# t ^= x9
	xorl 136(%esp),%edx
	# r += s
	add %ecx,%esi
	# r <<<= 13
	rol $13,%esi
	# r ^= x4
	xorl 116(%esp),%esi
	# v += w
	add %ebx,%edi
	# v <<<= 13
	rol $13,%edi
	# v ^= x14
	xorl 156(%esp),%edi
	# x3 = p
	movl %eax,112(%esp)
	# x9 = t
	movl %edx,136(%esp)
	# p += x2
	addl 108(%esp),%eax
	# x4 = r
	movl %esi,116(%esp)
	# t += x8
	addl 132(%esp),%edx
	# x14 = v
	movl %edi,156(%esp)
	# p <<<= 18
	rol $18,%eax
	# p ^= x0
	xorl 100(%esp),%eax
	# t <<<= 18
	rol $18,%edx
	# t ^= x10
	xorl 140(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 18
	rol $18,%ecx
	# s ^= x5
	xorl 120(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 18
	rol $18,%ebx
	# w ^= x15
	xorl 160(%esp),%ebx
	# x0 = p
	movl %eax,100(%esp)
	# x10 = t
	movl %edx,140(%esp)
	# p += x12
	addl 148(%esp),%eax
	# x5 = s
	movl %ecx,120(%esp)
	# t += x6
	addl 124(%esp),%edx
	# x15 = w
	movl %ebx,160(%esp)
	# r = x1
	movl 104(%esp),%esi
	# r += s
	add %ecx,%esi
	# v = x11
	movl 144(%esp),%edi
	# v += w
	add %ebx,%edi
	# p <<<= 7
	rol $7,%eax
	# p ^= x4
	xorl 116(%esp),%eax
	# t <<<= 7
	rol $7,%edx
	# t ^= x14
	xorl 156(%esp),%edx
	# r <<<= 7
	rol $7,%esi
	# r ^= x9
	xorl 136(%esp),%esi
	# v <<<= 7
	rol $7,%edi
	# v ^= x3
	xorl 112(%esp),%edi
	# x4 = p
	movl %eax,116(%esp)
	# x14 = t
	movl %edx,156(%esp)
	# p += x0
	addl 100(%esp),%eax
	# x9 = r
	movl %esi,136(%esp)
	# t += x10
	addl 140(%esp),%edx
	# x3 = v
	movl %edi,112(%esp)
	# p <<<= 9
	rol $9,%eax
	# p ^= x8
	xorl 132(%esp),%eax
	# t <<<= 9
	rol $9,%edx
	# t ^= x2
	xorl 108(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 9
	rol $9,%ecx
	# s ^= x13
	xorl 152(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 9
	rol $9,%ebx
	# w ^= x7
	xorl 128(%esp),%ebx
	# x8 = p
	movl %eax,132(%esp)
	# x2 = t
	movl %edx,108(%esp)
	# p += x4
	addl 116(%esp),%eax
	# x13 = s
	movl %ecx,152(%esp)
	# t += x14
	addl 156(%esp),%edx
	# x7 = w
	movl %ebx,128(%esp)
	# p <<<= 13
	rol $13,%eax
	# p ^= x12
	xorl 148(%esp),%eax
	# t <<<= 13
	rol $13,%edx
	# t ^= x6
	xorl 124(%esp),%edx
	# r += s
	add %ecx,%esi
	# r <<<= 13
	rol $13,%esi
	# r ^= x1
	xorl 104(%esp),%esi
	# v += w
	add %ebx,%edi
	# v <<<= 13
	rol $13,%edi
	# v ^= x11
	xorl 144(%esp),%edi
	# x12 = p
	movl %eax,148(%esp)
	# x6 = t
	movl %edx,124(%esp)
	# p += x8
	addl 132(%esp),%eax
	# x1 = r
	movl %esi,104(%esp)
	# t += x2
	addl 108(%esp),%edx
	# x11 = v
	movl %edi,144(%esp)
	# p <<<= 18
	rol $18,%eax
	# p ^= x0
	xorl 100(%esp),%eax
	# t <<<= 18
	rol $18,%edx
	# t ^= x10
	xorl 140(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 18
	rol $18,%ecx
	# s ^= x5
	xorl 120(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 18
	rol $18,%ebx
	# w ^= x15
	xorl 160(%esp),%ebx
	# x0 = p
	movl %eax,100(%esp)
	# x10 = t
	movl %edx,140(%esp)
	# p += x3
	addl 112(%esp),%eax
	# p <<<= 7
	rol $7,%eax
	# x5 = s
	movl %ecx,120(%esp)
	# t += x9
	addl 136(%esp),%edx
	# x15 = w
	movl %ebx,160(%esp)
	# r = x4
	movl 116(%esp),%esi
	# r += s
	add %ecx,%esi
	# v = x14
	movl 156(%esp),%edi
	# v += w
	add %ebx,%edi
	# p ^= x1
	xorl 104(%esp),%eax
	# t <<<= 7
	rol $7,%edx
	# t ^= x11
	xorl 144(%esp),%edx
	# r <<<= 7
	rol $7,%esi
	# r ^= x6
	xorl 124(%esp),%esi
	# v <<<= 7
	rol $7,%edi
	# v ^= x12
	xorl 148(%esp),%edi
	# x1 = p
	movl %eax,104(%esp)
	# x11 = t
	movl %edx,144(%esp)
	# p += x0
	addl 100(%esp),%eax
	# x6 = r
	movl %esi,124(%esp)
	# t += x10
	addl 140(%esp),%edx
	# x12 = v
	movl %edi,148(%esp)
	# p <<<= 9
	rol $9,%eax
	# p ^= x2
	xorl 108(%esp),%eax
	# t <<<= 9
	rol $9,%edx
	# t ^= x8
	xorl 132(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 9
	rol $9,%ecx
	# s ^= x7
	xorl 128(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 9
	rol $9,%ebx
	# w ^= x13
	xorl 152(%esp),%ebx
	# x2 = p
	movl %eax,108(%esp)
	# x8 = t
	movl %edx,132(%esp)
	# p += x1
	addl 104(%esp),%eax
	# x7 = s
	movl %ecx,128(%esp)
	# t += x11
	addl 144(%esp),%edx
	# x13 = w
	movl %ebx,152(%esp)
	# p <<<= 13
	rol $13,%eax
	# p ^= x3
	xorl 112(%esp),%eax
	# t <<<= 13
	rol $13,%edx
	# t ^= x9
	xorl 136(%esp),%edx
	# r += s
	add %ecx,%esi
	# r <<<= 13
	rol $13,%esi
	# r ^= x4
	xorl 116(%esp),%esi
	# v += w
	add %ebx,%edi
	# v <<<= 13
	rol $13,%edi
	# v ^= x14
	xorl 156(%esp),%edi
	# x3 = p
	movl %eax,112(%esp)
	# x9 = t
	movl %edx,136(%esp)
	# p += x2
	addl 108(%esp),%eax
	# x4 = r
	movl %esi,116(%esp)
	# t += x8
	addl 132(%esp),%edx
	# x14 = v
	movl %edi,156(%esp)
	# p <<<= 18
	rol $18,%eax
	# p ^= x0
	xorl 100(%esp),%eax
	# t <<<= 18
	rol $18,%edx
	# t ^= x10
	xorl 140(%esp),%edx
	# s += r
	add %esi,%ecx
	# s <<<= 18
	rol $18,%ecx
	# s ^= x5
	xorl 120(%esp),%ecx
	# w += v
	add %edi,%ebx
	# w <<<= 18
	rol $18,%ebx
	# w ^= x15
	xorl 160(%esp),%ebx
	# i -= 4
	sub $4,%ebp
	# goto mainloop if unsigned >
	ja ._mainloop
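	# End of the 20 rounds: add the saved input words (j0..j15)
	# back into the state (the Salsa20 feedforward), then XOR the
	# 64-byte keystream block with the message to produce the
	# output.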
	# x0 = p
	movl %eax,100(%esp)
	# x5 = s
	movl %ecx,120(%esp)
	# x10 = t
	movl %edx,140(%esp)
	# x15 = w
	movl %ebx,160(%esp)
	# out = out_backup
	movl 72(%esp),%edi
	# m = m_backup
	movl 68(%esp),%esi
	# in0 = x0
	movl 100(%esp),%eax
	# in1 = x1
	movl 104(%esp),%ecx
	# in0 += j0
	addl 164(%esp),%eax
	# in1 += j1
	addl 168(%esp),%ecx
	# in0 ^= *(uint32 *) (m + 0)
	xorl 0(%esi),%eax
	# in1 ^= *(uint32 *) (m + 4)
	xorl 4(%esi),%ecx
	# *(uint32 *) (out + 0) = in0
	movl %eax,0(%edi)
	# *(uint32 *) (out + 4) = in1
	movl %ecx,4(%edi)
	# in2 = x2
	movl 108(%esp),%eax
	# in3 = x3
	movl 112(%esp),%ecx
	# in2 += j2
	addl 172(%esp),%eax
	# in3 += j3
	addl 176(%esp),%ecx
	# in2 ^= *(uint32 *) (m + 8)
	xorl 8(%esi),%eax
	# in3 ^= *(uint32 *) (m + 12)
	xorl 12(%esi),%ecx
	# *(uint32 *) (out + 8) = in2
	movl %eax,8(%edi)
	# *(uint32 *) (out + 12) = in3
	movl %ecx,12(%edi)
	# in4 = x4
	movl 116(%esp),%eax
	# in5 = x5
	movl 120(%esp),%ecx
	# in4 += j4
	addl 180(%esp),%eax
	# in5 += j5
	addl 184(%esp),%ecx
	# in4 ^= *(uint32 *) (m + 16)
	xorl 16(%esi),%eax
	# in5 ^= *(uint32 *) (m + 20)
	xorl 20(%esi),%ecx
	# *(uint32 *) (out + 16) = in4
	movl %eax,16(%edi)
	# *(uint32 *) (out + 20) = in5
	movl %ecx,20(%edi)
	# in6 = x6
	movl 124(%esp),%eax
	# in7 = x7
	movl 128(%esp),%ecx
	# in6 += j6
	addl 188(%esp),%eax
	# in7 += j7
	addl 192(%esp),%ecx
	# in6 ^= *(uint32 *) (m + 24)
	xorl 24(%esi),%eax
	# in7 ^= *(uint32 *) (m + 28)
	xorl 28(%esi),%ecx
	# *(uint32 *) (out + 24) = in6
	movl %eax,24(%edi)
	# *(uint32 *) (out + 28) = in7
	movl %ecx,28(%edi)
	# in8 = x8
	movl 132(%esp),%eax
	# in9 = x9
	movl 136(%esp),%ecx
	# in8 += j8
	addl 196(%esp),%eax
	# in9 += j9
	addl 200(%esp),%ecx
	# in8 ^= *(uint32 *) (m + 32)
	xorl 32(%esi),%eax
	# in9 ^= *(uint32 *) (m + 36)
	xorl 36(%esi),%ecx
	# *(uint32 *) (out + 32) = in8
	movl %eax,32(%edi)
	# *(uint32 *) (out + 36) = in9
	movl %ecx,36(%edi)
	# in10 = x10
	movl 140(%esp),%eax
	# in11 = x11
	movl 144(%esp),%ecx
	# in10 += j10
	addl 204(%esp),%eax
	# in11 += j11
	addl 208(%esp),%ecx
	# in10 ^= *(uint32 *) (m + 40)
	xorl 40(%esi),%eax
	# in11 ^= *(uint32 *) (m + 44)
	xorl 44(%esi),%ecx
	# *(uint32 *) (out + 40) = in10
	movl %eax,40(%edi)
	# *(uint32 *) (out + 44) = in11
	movl %ecx,44(%edi)
	# in12 = x12
	movl 148(%esp),%eax
	# in13 = x13
	movl 152(%esp),%ecx
	# in12 += j12
	addl 212(%esp),%eax
	# in13 += j13
	addl 216(%esp),%ecx
	# in12 ^= *(uint32 *) (m + 48)
	xorl 48(%esi),%eax
	# in13 ^= *(uint32 *) (m + 52)
	xorl 52(%esi),%ecx
	# *(uint32 *) (out + 48) = in12
	movl %eax,48(%edi)
	# *(uint32 *) (out + 52) = in13
	movl %ecx,52(%edi)
	# in14 = x14
	movl 156(%esp),%eax
	# in15 = x15
	movl 160(%esp),%ecx
	# in14 += j14
	addl 220(%esp),%eax
	# in15 += j15
	addl 224(%esp),%ecx
	# in14 ^= *(uint32 *) (m + 56)
	xorl 56(%esi),%eax
	# in15 ^= *(uint32 *) (m + 60)
	xorl 60(%esi),%ecx
	# *(uint32 *) (out + 56) = in14
	movl %eax,56(%edi)
	# *(uint32 *) (out + 60) = in15
	movl %ecx,60(%edi)
	# bytes = bytes_backup
	movl 76(%esp),%ebx
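	# Advance the 64-bit block counter kept in state words 8 and 9
	# (add/adc propagates the carry into the high word).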
	# in8 = j8
	movl 196(%esp),%eax
	# in9 = j9
	movl 200(%esp),%ecx
	# in8 += 1
	add $1,%eax
	# in9 += 0 + carry
	adc $0,%ecx
	# j8 = in8
	movl %eax,196(%esp)
	# j9 = in9
	movl %ecx,200(%esp)
	# bytes - 64
	cmp $64,%ebx
	# goto bytesatleast65 if unsigned>
	ja ._bytesatleast65
	# goto bytesatleast64 if unsigned>=
	jae ._bytesatleast64
	# m = out
	mov %edi,%esi
	# out = ctarget
	movl 228(%esp),%edi
	# i = bytes
	mov %ebx,%ecx
	# while (i) { *out++ = *m++; --i }
	rep movsb
._bytesatleast64:
	# x = x_backup
	movl 64(%esp),%eax
	# in8 = j8
	movl 196(%esp),%ecx
	# in9 = j9
	movl 200(%esp),%edx
	# *(uint32 *) (x + 32) = in8
	movl %ecx,32(%eax)
	# *(uint32 *) (x + 36) = in9
	movl %edx,36(%eax)
._done:
	# eax = eax_stack
	movl 80(%esp),%eax
	# ebx = ebx_stack
	movl 84(%esp),%ebx
	# esi = esi_stack
	movl 88(%esp),%esi
	# edi = edi_stack
	movl 92(%esp),%edi
	# ebp = ebp_stack
	movl 96(%esp),%ebp
	# leave
	add %eax,%esp
	ret
._bytesatleast65:
	# bytes -= 64
	sub $64,%ebx
	# out += 64
	add $64,%edi
	# m += 64
	add $64,%esi
	# goto bytesatleast1
	jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)

# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
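	# Fills in the key words and the four diagonal constants.
	# arg1 = state x, arg2 = key k, arg3 = key size in bits
	# (256 selects the 256-bit layout; anything below 256 selects
	# the 128-bit layout).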
	mov %esp,%eax
	and $31,%eax
	add $256,%eax
	sub %eax,%esp
	# eax_stack = eax
	movl %eax,64(%esp)
	# ebx_stack = ebx
	movl %ebx,68(%esp)
	# esi_stack = esi
	movl %esi,72(%esp)
	# edi_stack = edi
	movl %edi,76(%esp)
	# ebp_stack = ebp
	movl %ebp,80(%esp)
	# k = arg2
	movl 8(%esp,%eax),%ecx
	# kbits = arg3
	movl 12(%esp,%eax),%edx
	# x = arg1
	movl 4(%esp,%eax),%eax
	# in1 = *(uint32 *) (k + 0)
	movl 0(%ecx),%ebx
	# in2 = *(uint32 *) (k + 4)
	movl 4(%ecx),%esi
	# in3 = *(uint32 *) (k + 8)
	movl 8(%ecx),%edi
	# in4 = *(uint32 *) (k + 12)
	movl 12(%ecx),%ebp
	# *(uint32 *) (x + 4) = in1
	movl %ebx,4(%eax)
	# *(uint32 *) (x + 8) = in2
	movl %esi,8(%eax)
	# *(uint32 *) (x + 12) = in3
	movl %edi,12(%eax)
	# *(uint32 *) (x + 16) = in4
	movl %ebp,16(%eax)
	# kbits - 256
	cmp $256,%edx
	# goto kbits128 if unsigned<
	jb ._kbits128
._kbits256:
	# in11 = *(uint32 *) (k + 16)
	movl 16(%ecx),%edx
	# in12 = *(uint32 *) (k + 20)
	movl 20(%ecx),%ebx
	# in13 = *(uint32 *) (k + 24)
	movl 24(%ecx),%esi
	# in14 = *(uint32 *) (k + 28)
	movl 28(%ecx),%ecx
	# *(uint32 *) (x + 44) = in11
	movl %edx,44(%eax)
	# *(uint32 *) (x + 48) = in12
	movl %ebx,48(%eax)
	# *(uint32 *) (x + 52) = in13
	movl %esi,52(%eax)
	# *(uint32 *) (x + 56) = in14
	movl %ecx,56(%eax)
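	# The four constants below spell "expand 32-byte k" as
	# little-endian words: 1634760805 = 0x61707865 "expa",
	# 857760878 = 0x3320646e "nd 3", 2036477234 = 0x79622d32 "2-by",
	# 1797285236 = 0x6b206574 "te k" (the sigma constants from the
	# Salsa20 spec).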
	# in0 = 1634760805
	mov $1634760805,%ecx
	# in5 = 857760878
	mov $857760878,%edx
	# in10 = 2036477234
	mov $2036477234,%ebx
	# in15 = 1797285236
	mov $1797285236,%esi
	# *(uint32 *) (x + 0) = in0
	movl %ecx,0(%eax)
	# *(uint32 *) (x + 20) = in5
	movl %edx,20(%eax)
	# *(uint32 *) (x + 40) = in10
	movl %ebx,40(%eax)
	# *(uint32 *) (x + 60) = in15
	movl %esi,60(%eax)
	# goto keysetupdone
	jmp ._keysetupdone
._kbits128:
	# in11 = *(uint32 *) (k + 0)
	movl 0(%ecx),%edx
	# in12 = *(uint32 *) (k + 4)
	movl 4(%ecx),%ebx
	# in13 = *(uint32 *) (k + 8)
	movl 8(%ecx),%esi
	# in14 = *(uint32 *) (k + 12)
	movl 12(%ecx),%ecx
	# *(uint32 *) (x + 44) = in11
	movl %edx,44(%eax)
	# *(uint32 *) (x + 48) = in12
	movl %ebx,48(%eax)
	# *(uint32 *) (x + 52) = in13
	movl %esi,52(%eax)
	# *(uint32 *) (x + 56) = in14
	movl %ecx,56(%eax)
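	# 128-bit keys reuse the same 16 key bytes in both halves of
	# the state and use the tau constants, "expand 16-byte k":
	# 824206446 = 0x3120646e "nd 1" and 2036477238 = 0x79622d36
	# "6-by" replace the middle two sigma words.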
	# in0 = 1634760805
	mov $1634760805,%ecx
	# in5 = 824206446
	mov $824206446,%edx
	# in10 = 2036477238
	mov $2036477238,%ebx
	# in15 = 1797285236
	mov $1797285236,%esi
	# *(uint32 *) (x + 0) = in0
	movl %ecx,0(%eax)
	# *(uint32 *) (x + 20) = in5
	movl %edx,20(%eax)
	# *(uint32 *) (x + 40) = in10
	movl %ebx,40(%eax)
	# *(uint32 *) (x + 60) = in15
	movl %esi,60(%eax)
._keysetupdone:
	# eax = eax_stack
	movl 64(%esp),%eax
	# ebx = ebx_stack
	movl 68(%esp),%ebx
	# esi = esi_stack
	movl 72(%esp),%esi
	# edi = edi_stack
	movl 76(%esp),%edi
	# ebp = ebp_stack
	movl 80(%esp),%ebp
	# leave
	add %eax,%esp
	ret
ENDPROC(salsa20_keysetup)

# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
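	# Stores the 8-byte IV into state words 6 and 7 and resets the
	# block counter (words 8 and 9) to zero.
	# arg1 = state x, arg2 = iv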
	mov %esp,%eax
	and $31,%eax
	add $256,%eax
	sub %eax,%esp
	# eax_stack = eax
	movl %eax,64(%esp)
	# ebx_stack = ebx
	movl %ebx,68(%esp)
	# esi_stack = esi
	movl %esi,72(%esp)
	# edi_stack = edi
	movl %edi,76(%esp)
	# ebp_stack = ebp
	movl %ebp,80(%esp)
	# iv = arg2
	movl 8(%esp,%eax),%ecx
	# x = arg1
	movl 4(%esp,%eax),%eax
	# in6 = *(uint32 *) (iv + 0)
	movl 0(%ecx),%edx
	# in7 = *(uint32 *) (iv + 4)
	movl 4(%ecx),%ecx
	# in8 = 0
	mov $0,%ebx
	# in9 = 0
	mov $0,%esi
	# *(uint32 *) (x + 24) = in6
	movl %edx,24(%eax)
	# *(uint32 *) (x + 28) = in7
	movl %ecx,28(%eax)
	# *(uint32 *) (x + 32) = in8
	movl %ebx,32(%eax)
	# *(uint32 *) (x + 36) = in9
	movl %esi,36(%eax)
	# eax = eax_stack
	movl 64(%esp),%eax
	# ebx = ebx_stack
	movl 68(%esp),%ebx
	# esi = esi_stack
	movl 72(%esp),%esi
	# edi = edi_stack
	movl 76(%esp),%edi
	# ebp = ebp_stack
	movl 80(%esp),%ebp
	# leave
	add %eax,%esp
	ret
ENDPROC(salsa20_ivsetup)