/*
 * AES-NI + SSE2 implementation of AEGIS-128L
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
/* The eight 128-bit state words, one XMM register each. */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
#define STATE5	%xmm5
#define STATE6	%xmm6
#define STATE7	%xmm7

/* The two 16-byte halves of the 32-byte block currently being absorbed. */
#define MSG0	%xmm8
#define MSG1	%xmm9

/* Scratch vector registers. */
#define T0	%xmm10
#define T1	%xmm11
#define T2	%xmm12
#define T3	%xmm13

/* Integer argument registers, named by the role they play in this file. */
#define STATEP	%rdi
#define LEN	%rsi
#define SRC	%rdx
#define DST	%rcx
34
/*
 * Initialisation constants.  const_0 followed by const_1 is the Fibonacci
 * sequence reduced mod 256 (0, 1, 1, 2, 3, 5, 8, 13, ...).
 */
.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32
.align 16
.Laegis128l_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128l_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..15 and 16..31.  The decryption tail compares the broadcast
 * byte count against these to build a per-byte "index < length" mask.
 */
.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16
.align 16
.Laegis128l_counter0:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.Laegis128l_counter1:
	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
52
53.text
54
/*
 * __load_partial: internal ABI
 *
 * Load the first (LEN & 0x1f) bytes at SRC into MSG0:MSG1, zero-padded.
 * Only bits 0..4 of LEN are examined.  The pieces are gathered from the
 * highest offset downwards (1, 2, 4, 8, then 16 bytes); each step shifts the
 * bytes collected so far upwards to make room for the piece that precedes
 * them in memory, so no byte past SRC + (LEN & 0x1f) is ever read.
 *
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG0 - first message block
 *   MSG1 - second message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   flags
 */
__load_partial:
	xor %r9d, %r9d		/* 32-bit form; also clears bits 63:32 */
	pxor MSG0, MSG0
	pxor MSG1, MSG1

	/* bit 0: the single odd byte sits at offset (LEN & 0x1e) */
	test $0x1, LEN
	jz .Lld_partial_1

	mov LEN, %r8
	and $0x1E, %r8
	add SRC, %r8
	mov (%r8), %r9b

.Lld_partial_1:
	/* bit 1: a 2-byte piece at offset (LEN & 0x1c) */
	test $0x2, LEN
	jz .Lld_partial_2

	mov LEN, %r8
	and $0x1C, %r8
	add SRC, %r8
	shl $0x10, %r9
	mov (%r8), %r9w

.Lld_partial_2:
	/* bit 2: a 4-byte piece at offset (LEN & 0x18) */
	test $0x4, LEN
	jz .Lld_partial_4

	mov LEN, %r8
	and $0x18, %r8
	add SRC, %r8
	shl $32, %r9
	mov (%r8), %r8d		/* 32-bit load zero-extends into %r8 */
	xor %r8, %r9

.Lld_partial_4:
	/* up to 7 bytes collected so far; move them to the vector side */
	movq %r9, MSG0

	/* bit 3: an 8-byte piece at offset (LEN & 0x10) */
	test $0x8, LEN
	jz .Lld_partial_8

	mov LEN, %r8
	and $0x10, %r8
	add SRC, %r8
	pslldq $8, MSG0
	movq (%r8), T0
	pxor T0, MSG0

.Lld_partial_8:
	/* bit 4: a full first block; what was gathered becomes block two */
	test $0x10, LEN
	jz .Lld_partial_16

	movdqa MSG0, MSG1
	movdqu (SRC), MSG0

.Lld_partial_16:
	ret
ENDPROC(__load_partial)
130
/*
 * __store_partial: internal ABI
 *
 * Write the first LEN bytes of T0:T1 to DST using successively smaller
 * stores (16, 8, 4, 2, 1 bytes), so nothing past DST + LEN is written.
 *
 * input:
 *   LEN - bytes (only the low 32 bits are used; the C callers pass an
 *         unsigned int, so bits 63:32 of the register are unspecified)
 *   DST - dst
 *   T0 - first message block
 *   T1 - second message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   %r10
 *   flags
 */
__store_partial:
	mov %esi, %r8d		/* r8 = low 32 bits of LEN, zero-extended */
	mov DST, %r9

	cmp $16, %r8
	jb .Lst_partial_16

	/* one whole block; the remainder now comes from the second block */
	movdqu T0, (%r9)
	movdqa T1, T0

	sub $16, %r8
	add $16, %r9

.Lst_partial_16:
	movq T0, %r10

	cmp $8, %r8
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0		/* bring the upper qword down */
	movq T0, %r10

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
199
/*
 * update: the message-independent part of one state update.
 *
 * aesenc src, dst computes dst = AESRound(dst) ^ src, so afterwards
 *   STATEn = AESRound(old STATEn) ^ old STATE((n + 1) mod 8)
 * for every n.  STATE7 is rewritten first, hence the copy kept in T0 for the
 * final step.  Clobbers T0.
 */
.macro update
	movdqa	STATE7, T0		/* old STATE7, needed by STATE6 below */
	aesenc	STATE0, STATE7
	aesenc	STATE1, STATE0
	aesenc	STATE2, STATE1
	aesenc	STATE3, STATE2
	aesenc	STATE4, STATE3
	aesenc	STATE5, STATE4
	aesenc	STATE6, STATE5
	aesenc	T0, STATE6
.endm
211
/*
 * update_msg: run update, then absorb the message block by XORing MSG0 into
 * register d0 and MSG1 into register d1.
 */
.macro update_msg d0 d1
	update
	pxor MSG0, \d0
	pxor MSG1, \d1
.endm

/*
 * update<n>, n = 0..7: the update step for position n of an 8x unrolled
 * sequence.  MSG0 goes into STATE((7 - n) mod 8) and MSG1 into
 * STATE((3 - n) mod 8); the matching crypt<n> / state_store<n> variants use
 * the same per-step register rotation.
 */
.macro update0
	update_msg STATE7, STATE3
.endm

.macro update1
	update_msg STATE6, STATE2
.endm

.macro update2
	update_msg STATE5, STATE1
.endm

.macro update3
	update_msg STATE4, STATE0
.endm

.macro update4
	update_msg STATE3, STATE7
.endm

.macro update5
	update_msg STATE2, STATE6
.endm

.macro update6
	update_msg STATE1, STATE5
.endm

.macro update7
	update_msg STATE0, STATE4
.endm
259
/*
 * state_load: read the eight 16-byte state words at STATEP into
 * STATE0..STATE7 (slot n -> STATEn).  Unaligned loads are used, so no
 * alignment of the state buffer is required.
 */
.macro state_load
	movdqu	0x00(STATEP), STATE0
	movdqu	0x10(STATEP), STATE1
	movdqu	0x20(STATEP), STATE2
	movdqu	0x30(STATEP), STATE3
	movdqu	0x40(STATEP), STATE4
	movdqu	0x50(STATEP), STATE5
	movdqu	0x60(STATEP), STATE6
	movdqu	0x70(STATEP), STATE7
.endm
270
/*
 * state_store: write eight registers back to the state buffer at STATEP.
 * The register passed as s7 lands in slot 0 and s0..s6 in slots 1..7.
 */
.macro state_store s0 s1 s2 s3 s4 s5 s6 s7
	movdqu	\s7, 0x00(STATEP)
	movdqu	\s0, 0x10(STATEP)
	movdqu	\s1, 0x20(STATEP)
	movdqu	\s2, 0x30(STATEP)
	movdqu	\s3, 0x40(STATEP)
	movdqu	\s4, 0x50(STATEP)
	movdqu	\s5, 0x60(STATEP)
	movdqu	\s6, 0x70(STATEP)
.endm

/*
 * state_store<n>, n = 0..7: the store to use when update<n> was the last
 * update executed.  STATE((7 - n) mod 8) is written to slot 0 and the
 * remaining registers follow in ascending (wrapping) order.
 */
.macro state_store0
	state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
.endm

.macro state_store1
	state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
.endm

.macro state_store2
	state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
.endm

.macro state_store3
	state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
.endm

.macro state_store4
	state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
.endm

.macro state_store5
	state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
.endm

.macro state_store6
	state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
.endm

.macro state_store7
	state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
.endm
313
/*
 * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv);
 *
 * Build the initial state from the 16-byte key (%rsi) and 16-byte IV (%rdx),
 * run ten update rounds and write the result to state (%rdi).
 *
 * The key is read with movdqa and must therefore be 16-byte aligned; the IV
 * and the state buffer are accessed with unaligned moves.
 */
ENTRY(crypto_aegis128l_aesni_init)
	FRAME_BEGIN

	/* load key: */
	movdqa (%rsi), MSG1
	movdqa MSG1, STATE0
	movdqa MSG1, STATE4
	movdqa MSG1, STATE5
	movdqa MSG1, STATE6
	movdqa MSG1, STATE7

	/* load IV: */
	movdqu (%rdx), MSG0
	pxor MSG0, STATE0
	pxor MSG0, STATE4

	/*
	 * load the constants (RIP-relative, so the code does not depend on
	 * absolute addresses):
	 */
	movdqa .Laegis128l_const_0(%rip), STATE2
	movdqa .Laegis128l_const_1(%rip), STATE1
	movdqa STATE1, STATE3
	pxor STATE2, STATE5
	pxor STATE1, STATE6
	pxor STATE2, STATE7

	/* update 10 times with IV (MSG0) and KEY (MSG1): */
	update0
	update1
	update2
	update3
	update4
	update5
	update6
	update7
	update0
	update1

	/* the last step was update1, so store with the matching rotation */
	state_store1

	FRAME_END
	ret
ENDPROC(crypto_aegis128l_aesni_init)
358
/*
 * ad_block: absorb one 32-byte block of associated data.
 *   a - 'a' or 'u': selects movdqa or movdqu for the loads from SRC
 *   i - position in the 8x unrolled loop (0..7); picks the source offset
 *       and the update<i> variant
 * Leaves through .Lad_out_<i> once fewer than 0x20 bytes remain.  LEN is a
 * byte count, so the comparison is unsigned.
 */
.macro ad_block a i
	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
	update\i
	sub $0x20, LEN
	cmp $0x20, LEN
	jb .Lad_out_\i
.endm
367
/*
 * void crypto_aegis128l_aesni_ad(void *state, unsigned int length,
 *                                const void *data);
 *
 * Absorb whole 32-byte blocks of associated data into the state.  Nothing is
 * done (and the state is left untouched) when length < 32; a trailing
 * remainder of fewer than 32 bytes is not consumed here.
 *
 * The loop is unrolled eight times; it is only ever left through the
 * .Lad_out_<i> branch inside ad_block, which stores the state with the
 * rotation that matches the last update performed.
 */
ENTRY(crypto_aegis128l_aesni_ad)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of LEN (%rsi) are
	 * unspecified on entry; zero-extend before using the full register.
	 */
	mov %esi, %esi

	cmp $0x20, LEN
	jb .Lad_out

	state_load

	/* pick the aligned-load loop only if SRC is 16-byte aligned */
	mov SRC, %r8
	and $0xf, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	ad_block a 0
	ad_block a 1
	ad_block a 2
	ad_block a 3
	ad_block a 4
	ad_block a 5
	ad_block a 6
	ad_block a 7

	add $0x100, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	ad_block u 0
	ad_block u 1
	ad_block u 2
	ad_block u 3
	ad_block u 4
	ad_block u 5
	ad_block u 6
	ad_block u 7

	add $0x100, SRC
	jmp .Lad_u_loop

.Lad_out_0:
	state_store0
	FRAME_END
	ret

.Lad_out_1:
	state_store1
	FRAME_END
	ret

.Lad_out_2:
	state_store2
	FRAME_END
	ret

.Lad_out_3:
	state_store3
	FRAME_END
	ret

.Lad_out_4:
	state_store4
	FRAME_END
	ret

.Lad_out_5:
	state_store5
	FRAME_END
	ret

.Lad_out_6:
	state_store6
	FRAME_END
	ret

.Lad_out_7:
	state_store7
	FRAME_END
	ret

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128l_aesni_ad)
456
/*
 * crypt: XOR the keystream derived from state words s0..s7 into a 32-byte
 * block held in m0 (first half) and m1 (second half):
 *   m0 ^= s1 ^ s6 ^ (s2 & s3)
 *   m1 ^= s2 ^ s5 ^ (s6 & s7)
 * s0 and s4 are accepted only to keep the argument list uniform.
 * Clobbers T3; m0/m1 must not be T3 or any of the state registers.
 */
.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7
	/* first half */
	movdqa	\s2, T3
	pand	\s3, T3
	pxor	\s1, \m0
	pxor	\s6, \m0
	pxor	T3, \m0

	/* second half */
	movdqa	\s6, T3
	pand	\s7, T3
	pxor	\s2, \m1
	pxor	\s5, \m1
	pxor	T3, \m1
.endm

/*
 * crypt<n>, n = 0..7: crypt for position n of the 8x unrolled loops, where
 * state word k lives in STATE((k - n) mod 8).
 */
.macro crypt0 m0 m1
	crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
.endm

.macro crypt1 m0 m1
	crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
.endm

.macro crypt2 m0 m1
	crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
.endm

.macro crypt3 m0 m1
	crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
.endm

.macro crypt4 m0 m1
	crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
.endm

.macro crypt5 m0 m1
	crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
.endm

.macro crypt6 m0 m1
	crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
.endm

.macro crypt7 m0 m1
	crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
.endm
502
/*
 * encrypt_block: encrypt one 32-byte block from SRC to DST.
 *   a - 'a' or 'u': selects movdqa or movdqu for the SRC/DST accesses
 *   i - position in the 8x unrolled loop (0..7); picks the buffer offset
 *       and the crypt<i> / update<i> variants
 * The ciphertext is produced in T0/T1 so that MSG0/MSG1 still hold the
 * plaintext when the state is updated.  Leaves through .Lenc_out_<i> once
 * fewer than 0x20 bytes remain.  LEN is a byte count, so the comparison is
 * unsigned.
 */
.macro encrypt_block a i
	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
	movdqa MSG0, T0
	movdqa MSG1, T1
	crypt\i T0, T1
	movdq\a T0, (\i * 0x20 + 0x00)(DST)
	movdq\a T1, (\i * 0x20 + 0x10)(DST)

	update\i

	sub $0x20, LEN
	cmp $0x20, LEN
	jb .Lenc_out_\i
.endm
518
/*
 * decrypt_block: decrypt one 32-byte block from SRC to DST.
 *   a - 'a' or 'u': selects movdqa or movdqu for the SRC/DST accesses
 *   i - position in the 8x unrolled loop (0..7); picks the buffer offset
 *       and the crypt<i> / update<i> variants
 * The block is decrypted in place in MSG0/MSG1, so the state is updated
 * with the recovered plaintext.  Leaves through .Ldec_out_<i> once fewer
 * than 0x20 bytes remain.  LEN is a byte count, so the comparison is
 * unsigned.
 */
.macro decrypt_block a i
	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
	crypt\i MSG0, MSG1
	movdq\a MSG0, (\i * 0x20 + 0x00)(DST)
	movdq\a MSG1, (\i * 0x20 + 0x10)(DST)

	update\i

	sub $0x20, LEN
	cmp $0x20, LEN
	jb .Ldec_out_\i
.endm
532
/*
 * void crypto_aegis128l_aesni_enc(void *state, unsigned int length,
 *                                 const void *src, void *dst);
 *
 * Encrypt whole 32-byte blocks from src to dst and update the state.
 * Nothing is done (and the state is left untouched) when length < 32; a
 * trailing remainder of fewer than 32 bytes is not processed here.
 *
 * The loop is unrolled eight times; it is only ever left through the
 * .Lenc_out_<i> branch inside encrypt_block, which stores the state with
 * the rotation that matches the last update performed.
 */
ENTRY(crypto_aegis128l_aesni_enc)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of LEN (%rsi) are
	 * unspecified on entry; zero-extend before using the full register.
	 */
	mov %esi, %esi

	cmp $0x20, LEN
	jb .Lenc_out

	state_load

	/* aligned moves only if both SRC and DST are 16-byte aligned */
	mov SRC, %r8
	or DST, %r8
	and $0xf, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a 0
	encrypt_block a 1
	encrypt_block a 2
	encrypt_block a 3
	encrypt_block a 4
	encrypt_block a 5
	encrypt_block a 6
	encrypt_block a 7

	add $0x100, SRC
	add $0x100, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u 0
	encrypt_block u 1
	encrypt_block u 2
	encrypt_block u 3
	encrypt_block u 4
	encrypt_block u 5
	encrypt_block u 6
	encrypt_block u 7

	add $0x100, SRC
	add $0x100, DST
	jmp .Lenc_u_loop

.Lenc_out_0:
	state_store0
	FRAME_END
	ret

.Lenc_out_1:
	state_store1
	FRAME_END
	ret

.Lenc_out_2:
	state_store2
	FRAME_END
	ret

.Lenc_out_3:
	state_store3
	FRAME_END
	ret

.Lenc_out_4:
	state_store4
	FRAME_END
	ret

.Lenc_out_5:
	state_store5
	FRAME_END
	ret

.Lenc_out_6:
	state_store6
	FRAME_END
	ret

.Lenc_out_7:
	state_store7
	FRAME_END
	ret

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128l_aesni_enc)
624
/*
 * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length,
 *                                      const void *src, void *dst);
 *
 * Encrypt a final partial block.  __load_partial fetches the plaintext
 * zero-padded into MSG0/MSG1, the ciphertext is formed in T0/T1 and only
 * the requested number of bytes is written by __store_partial.  The state
 * is then updated with the zero-padded plaintext and stored back.
 */
ENTRY(crypto_aegis128l_aesni_enc_tail)
	FRAME_BEGIN

	state_load

	/* encrypt message: */
	call	__load_partial

	movdqa	MSG0, T0
	movdqa	MSG1, T1
	crypt0	T0, T1

	call	__store_partial

	/* absorb the (padded) plaintext still held in MSG0/MSG1 */
	update0

	state_store0

	FRAME_END
	ret
ENDPROC(crypto_aegis128l_aesni_enc_tail)
650
/*
 * void crypto_aegis128l_aesni_dec(void *state, unsigned int length,
 *                                 const void *src, void *dst);
 *
 * Decrypt whole 32-byte blocks from src to dst and update the state.
 * Nothing is done (and the state is left untouched) when length < 32; a
 * trailing remainder of fewer than 32 bytes is not processed here.
 *
 * The loop is unrolled eight times; it is only ever left through the
 * .Ldec_out_<i> branch inside decrypt_block, which stores the state with
 * the rotation that matches the last update performed.
 */
ENTRY(crypto_aegis128l_aesni_dec)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of LEN (%rsi) are
	 * unspecified on entry; zero-extend before using the full register.
	 */
	mov %esi, %esi

	cmp $0x20, LEN
	jb .Ldec_out

	state_load

	/* aligned moves only if both SRC and DST are 16-byte aligned */
	mov SRC, %r8
	or DST, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a 0
	decrypt_block a 1
	decrypt_block a 2
	decrypt_block a 3
	decrypt_block a 4
	decrypt_block a 5
	decrypt_block a 6
	decrypt_block a 7

	add $0x100, SRC
	add $0x100, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u 0
	decrypt_block u 1
	decrypt_block u 2
	decrypt_block u 3
	decrypt_block u 4
	decrypt_block u 5
	decrypt_block u 6
	decrypt_block u 7

	add $0x100, SRC
	add $0x100, DST
	jmp .Ldec_u_loop

.Ldec_out_0:
	state_store0
	FRAME_END
	ret

.Ldec_out_1:
	state_store1
	FRAME_END
	ret

.Ldec_out_2:
	state_store2
	FRAME_END
	ret

.Ldec_out_3:
	state_store3
	FRAME_END
	ret

.Ldec_out_4:
	state_store4
	FRAME_END
	ret

.Ldec_out_5:
	state_store5
	FRAME_END
	ret

.Ldec_out_6:
	state_store6
	FRAME_END
	ret

.Ldec_out_7:
	state_store7
	FRAME_END
	ret

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128l_aesni_dec)
742
/*
 * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length,
 *                                      const void *src, void *dst);
 *
 * Decrypt a final partial block.  __load_partial fetches the ciphertext
 * zero-padded into MSG0/MSG1, crypt0 turns it into plaintext in place and
 * __store_partial writes only the requested number of bytes.
 *
 * After decryption the padding bytes of MSG0/MSG1 hold keystream rather
 * than zeros, so they are masked off before the state update.
 */
ENTRY(crypto_aegis128l_aesni_dec_tail)
	FRAME_BEGIN

	state_load

	/* decrypt message: */
	call __load_partial

	crypt0 MSG0, MSG1

	movdqa MSG0, T0
	movdqa MSG1, T1
	call __store_partial

	/*
	 * mask with byte count: four punpcklbw steps broadcast the low byte
	 * of LEN to all 16 bytes of T0; comparing that against the byte
	 * indices 0..31 gives 0xff exactly where index < length.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	movdqa T0, T1
	movdqa .Laegis128l_counter0(%rip), T2
	movdqa .Laegis128l_counter1(%rip), T3
	pcmpgtb T2, T0
	pcmpgtb T3, T1
	pand T0, MSG0
	pand T1, MSG1

	update0

	state_store0

	FRAME_END
	ret
ENDPROC(crypto_aegis128l_aesni_dec_tail)
782
/*
 * void crypto_aegis128l_aesni_final(void *state, void *tag_xor,
 *                                   u64 assoclen, u64 cryptlen);
 *
 * Finalise: absorb the length block seven times, then XOR seven of the
 * eight state registers into the 16 bytes at tag_xor (%rsi), which is read
 * and written with unaligned moves.  The state buffer itself is only read;
 * it is not written back.
 */
ENTRY(crypto_aegis128l_aesni_final)
	FRAME_BEGIN

	state_load

	/* prepare length block: low qword = assoclen, high qword = cryptlen */
	movq	%rdx, MSG0
	movq	%rcx, T0
	pslldq	$8, T0
	pxor	T0, MSG0
	psllq	$3, MSG0	/* multiply by 8 (to get bit count) */

	/* mix in state word 2; both message halves carry the same value */
	pxor	STATE2, MSG0
	movdqa	MSG0, MSG1

	/* update state: */
	update0
	update1
	update2
	update3
	update4
	update5
	update6

	/* xor tag: every register except STATE0 is folded in */
	movdqu	(%rsi), T0

	pxor	STATE1, T0
	pxor	STATE2, T0
	pxor	STATE3, T0
	pxor	STATE4, T0
	pxor	STATE5, T0
	pxor	STATE6, T0
	pxor	STATE7, T0

	movdqu	T0, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_aegis128l_aesni_final)