#!/usr/bin/env perl
2
3# Copyright (c) 2017, Shay Gueron.
4# Copyright (c) 2017, Google Inc.
5#
6# Permission to use, copy, modify, and/or distribute this software for any
7# purpose with or without fee is hereby granted, provided that the above
8# copyright notice and this permission notice appear in all copies.
9#
10# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
13# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
15# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
18use warnings FATAL => 'all';
19
20$flavour = shift;
21$output = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
32*STDOUT=*OUT;
33
34$code.=<<___;
35.data
36
37.align 16
38one:
39.quad 1,0
40two:
41.quad 2,0
42three:
43.quad 3,0
44four:
45.quad 4,0
46five:
47.quad 5,0
48six:
49.quad 6,0
50seven:
51.quad 7,0
52eight:
53.quad 8,0
54
55OR_MASK:
56.long 0x00000000,0x00000000,0x00000000,0x80000000
57poly:
58.quad 0x1, 0xc200000000000000
59mask:
60.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
61con1:
62.long 1,1,1,1
63con2:
64.long 0x1b,0x1b,0x1b,0x1b
65con3:
66.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
67and_mask:
68.long 0,0xffffffff, 0xffffffff, 0xffffffff
69___
70
71$code.=<<___;
72.text
73___
74
75sub gfmul {
76 #########################
77 # a = T
78 # b = TMP0 - remains unchanged
79 # res = T
80 # uses also TMP1,TMP2,TMP3,TMP4
81 # __m128i GFMUL(__m128i A, __m128i B);
82
83 my $T = "%xmm0";
84 my $TMP0 = "%xmm1";
85 my $TMP1 = "%xmm2";
86 my $TMP2 = "%xmm3";
87 my $TMP3 = "%xmm4";
88 my $TMP4 = "%xmm5";
89
90 $code.=<<___;
91.type GFMUL,\@abi-omnipotent
92.align 16
93GFMUL:
94.cfi_startproc
95 vpclmulqdq \$0x00, $TMP0, $T, $TMP1
96 vpclmulqdq \$0x11, $TMP0, $T, $TMP4
97 vpclmulqdq \$0x10, $TMP0, $T, $TMP2
98 vpclmulqdq \$0x01, $TMP0, $T, $TMP3
99 vpxor $TMP3, $TMP2, $TMP2
100 vpslldq \$8, $TMP2, $TMP3
101 vpsrldq \$8, $TMP2, $TMP2
102 vpxor $TMP3, $TMP1, $TMP1
103 vpxor $TMP2, $TMP4, $TMP4
104
105 vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
106 vpshufd \$78, $TMP1, $TMP3
107 vpxor $TMP3, $TMP2, $TMP1
108
109 vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
110 vpshufd \$78, $TMP1, $TMP3
111 vpxor $TMP3, $TMP2, $TMP1
112
113 vpxor $TMP4, $TMP1, $T
114 ret
115.cfi_endproc
116.size GFMUL, .-GFMUL
117___
118}
119gfmul();
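
# The commented-out C below is an illustrative reference for what GFMUL
# computes: one POLYVAL field multiplication via PCLMULQDQ, mirroring the
# instruction sequence above. It is not part of the generated assembly; the
# name gfmul_ref is ours and is reused by later reference sketches in these
# comments.
#
#   #include <emmintrin.h>
#   #include <wmmintrin.h>   // _mm_clmulepi64_si128; compile with -mpclmul
#
#   static __m128i gfmul_ref(__m128i a, __m128i b) {
#     const __m128i poly = _mm_set_epi64x((long long)0xc200000000000000ULL, 1);
#     __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);            // a.lo * b.lo
#     __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);            // a.hi * b.hi
#     __m128i mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x10),
#                                 _mm_clmulepi64_si128(a, b, 0x01));
#     lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));            // low 128 bits
#     hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));            // high 128 bits
#     for (int i = 0; i < 2; i++) {                              // two folding steps
#       __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
#       lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), t);
#     }
#     return _mm_xor_si128(lo, hi);
#   }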
120
121sub aesgcmsiv_htable_init {
122 # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
123 # |out_htable|.
124 # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
125
126 my $Htbl = "%rdi";
127 my $H = "%rsi";
128 my $T = "%xmm0";
129 my $TMP0 = "%xmm1";
130
131$code.=<<___;
132.globl aesgcmsiv_htable_init
133.type aesgcmsiv_htable_init,\@function,2
134.align 16
135aesgcmsiv_htable_init:
136.cfi_startproc
137 vmovdqa ($H), $T
138 vmovdqa $T, $TMP0
139 vmovdqa $T, ($Htbl) # H
140 call GFMUL
141 vmovdqa $T, 16($Htbl) # H^2
142 call GFMUL
143 vmovdqa $T, 32($Htbl) # H^3
144 call GFMUL
145 vmovdqa $T, 48($Htbl) # H^4
146 call GFMUL
147 vmovdqa $T, 64($Htbl) # H^5
148 call GFMUL
149 vmovdqa $T, 80($Htbl) # H^6
150 call GFMUL
151 vmovdqa $T, 96($Htbl) # H^7
152 call GFMUL
153 vmovdqa $T, 112($Htbl) # H^8
154 ret
155.cfi_endproc
156.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
157___
158}
159aesgcmsiv_htable_init();
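
# Reference sketch (comments only) of the table written above: entry i holds
# H^(i+1) in POLYVAL's field. gfmul_ref is the hypothetical helper sketched
# after GFMUL.
#
#   static void htable_init_ref(__m128i htbl[8], __m128i h) {
#     htbl[0] = h;
#     for (int i = 1; i < 8; i++)
#       htbl[i] = gfmul_ref(htbl[i - 1], h);   // H^2, H^3, ..., H^8
#   }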
160
161sub aesgcmsiv_htable6_init {
162 # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
163 # |out_htable|.
164 # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
165 #
166 my $Htbl = "%rdi";
167 my $H = "%rsi";
168 my $T = "%xmm0";
169 my $TMP0 = "%xmm1";
170
171 $code.=<<___;
172.globl aesgcmsiv_htable6_init
173.type aesgcmsiv_htable6_init,\@function,2
174.align 16
175aesgcmsiv_htable6_init:
176.cfi_startproc
177 vmovdqa ($H), $T
178 vmovdqa $T, $TMP0
179 vmovdqa $T, ($Htbl) # H
180 call GFMUL
181 vmovdqa $T, 16($Htbl) # H^2
182 call GFMUL
183 vmovdqa $T, 32($Htbl) # H^3
184 call GFMUL
185 vmovdqa $T, 48($Htbl) # H^4
186 call GFMUL
187 vmovdqa $T, 64($Htbl) # H^5
188 call GFMUL
189 vmovdqa $T, 80($Htbl) # H^6
190 ret
191.cfi_endproc
192.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
193___
194}
195aesgcmsiv_htable6_init();
196
197sub aesgcmsiv_htable_polyval {
198 # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
199 # parameter 1: %rdi Htable - pointer to Htable
200 # parameter 2: %rsi INp - pointer to input
201 # parameter 3: %rdx LEN - length of BUFFER in bytes
202 # parameter 4: %rcx T - pointer to POLYVAL output
203
204 my $DATA = "%xmm0";
205 my $hlp0 = "%r11";
206 my $Htbl = "%rdi";
207 my $inp = "%rsi";
208 my $len = "%rdx";
209 my $TMP0 = "%xmm3";
210 my $TMP1 = "%xmm4";
211 my $TMP2 = "%xmm5";
212 my $TMP3 = "%xmm6";
213 my $TMP4 = "%xmm7";
214 my $Tp = "%rcx";
215 my $T = "%xmm1";
216 my $Xhi = "%xmm9";
217
218 my $SCHOOLBOOK_AAD = sub {
219 my ($i)=@_;
220 return <<___;
221 vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
222 vpxor $TMP3, $TMP2, $TMP2
223 vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
224 vpxor $TMP3, $TMP0, $TMP0
225 vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
226 vpxor $TMP3, $TMP1, $TMP1
227 vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
228 vpxor $TMP3, $TMP2, $TMP2
229___
230 };
231
232 $code.=<<___;
233.globl aesgcmsiv_htable_polyval
234.type aesgcmsiv_htable_polyval,\@function,4
235.align 16
236aesgcmsiv_htable_polyval:
237.cfi_startproc
238 test $len, $len
239 jnz .Lhtable_polyval_start
240 ret
241
242.Lhtable_polyval_start:
243 vzeroall
244
245 # We hash 8 blocks each iteration. If the total number of blocks is not a
246 # multiple of 8, we first hash the leading n%8 blocks.
247 movq $len, $hlp0
248 andq \$127, $hlp0
249
250 jz .Lhtable_polyval_no_prefix
251
252 vpxor $Xhi, $Xhi, $Xhi
253 vmovdqa ($Tp), $T
254 sub $hlp0, $len
255
256 sub \$16, $hlp0
257
258 # hash first prefix block
259 vmovdqu ($inp), $DATA
260 vpxor $T, $DATA, $DATA
261
262 vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
263 vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
264 vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
265 vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
266 vpxor $TMP3, $TMP2, $TMP2
267
268 lea 16($inp), $inp
269 test $hlp0, $hlp0
270 jnz .Lhtable_polyval_prefix_loop
271 jmp .Lhtable_polyval_prefix_complete
272
  # hash remaining prefix blocks (up to 7 total prefix blocks)
274.align 64
275.Lhtable_polyval_prefix_loop:
276 sub \$16, $hlp0
277
278 vmovdqu ($inp), $DATA # next data block
279
280 vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
281 vpxor $TMP3, $TMP0, $TMP0
282 vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
283 vpxor $TMP3, $TMP1, $TMP1
284 vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
285 vpxor $TMP3, $TMP2, $TMP2
286 vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
287 vpxor $TMP3, $TMP2, $TMP2
288
289 test $hlp0, $hlp0
290
291 lea 16($inp), $inp
292
293 jnz .Lhtable_polyval_prefix_loop
294
295.Lhtable_polyval_prefix_complete:
296 vpsrldq \$8, $TMP2, $TMP3
297 vpslldq \$8, $TMP2, $TMP2
298
299 vpxor $TMP3, $TMP1, $Xhi
300 vpxor $TMP2, $TMP0, $T
301
302 jmp .Lhtable_polyval_main_loop
303
304.Lhtable_polyval_no_prefix:
305 # At this point we know the number of blocks is a multiple of 8. However,
306 # the reduction in the main loop includes a multiplication by x^(-128). In
  # order to counter this, the existing tag needs to be multiplied by x^128.
308 # In practice, this just means that it is loaded into $Xhi, not $T.
309 vpxor $T, $T, $T
310 vmovdqa ($Tp), $Xhi
311
312.align 64
313.Lhtable_polyval_main_loop:
314 sub \$0x80, $len
315 jb .Lhtable_polyval_out
316
317 vmovdqu 16*7($inp), $DATA # Ii
318
319 vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
320 vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
321 vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
322 vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
323 vpxor $TMP3, $TMP2, $TMP2
324
325 #########################################################
326 vmovdqu 16*6($inp), $DATA
327 ${\$SCHOOLBOOK_AAD->(1)}
328
329 #########################################################
330 vmovdqu 16*5($inp), $DATA
331
332 vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a
333 vpalignr \$8, $T, $T, $T
334
335 ${\$SCHOOLBOOK_AAD->(2)}
336
337 vpxor $TMP4, $T, $T # reduction stage 1b
338 #########################################################
339 vmovdqu 16*4($inp), $DATA
340
341 ${\$SCHOOLBOOK_AAD->(3)}
342 #########################################################
343 vmovdqu 16*3($inp), $DATA
344
345 vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a
346 vpalignr \$8, $T, $T, $T
347
348 ${\$SCHOOLBOOK_AAD->(4)}
349
350 vpxor $TMP4, $T, $T # reduction stage 2b
351 #########################################################
352 vmovdqu 16*2($inp), $DATA
353
354 ${\$SCHOOLBOOK_AAD->(5)}
355
356 vpxor $Xhi, $T, $T # reduction finalize
357 #########################################################
358 vmovdqu 16*1($inp), $DATA
359
360 ${\$SCHOOLBOOK_AAD->(6)}
361 #########################################################
362 vmovdqu 16*0($inp), $DATA
363 vpxor $T, $DATA, $DATA
364
365 ${\$SCHOOLBOOK_AAD->(7)}
366 #########################################################
367 vpsrldq \$8, $TMP2, $TMP3
368 vpslldq \$8, $TMP2, $TMP2
369
370 vpxor $TMP3, $TMP1, $Xhi
371 vpxor $TMP2, $TMP0, $T
372
373 lea 16*8($inp), $inp
374 jmp .Lhtable_polyval_main_loop
375
376 #########################################################
377
378.Lhtable_polyval_out:
379 vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
380 vpalignr \$8, $T, $T, $T
381 vpxor $TMP3, $T, $T
382
383 vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
384 vpalignr \$8, $T, $T, $T
385 vpxor $TMP3, $T, $T
386 vpxor $Xhi, $T, $T
387
388 vmovdqu $T, ($Tp)
389 vzeroupper
390 ret
391.cfi_endproc
392.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
393___
394}
395aesgcmsiv_htable_polyval();
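
# Illustrative note (not emitted): one pass of the main loop above folds eight
# message blocks B0..B7 into the POLYVAL state T at once,
#
#   T' = (T ^ B0)*H^8 ^ B1*H^7 ^ B2*H^6 ^ ... ^ B6*H^2 ^ B7*H,
#
# which equals eight Horner steps T = (T ^ Bi)*H applied in order. A hedged C
# sketch of that identity, reusing the hypothetical gfmul_ref from above:
#
#   static __m128i polyval_step8_ref(__m128i t, const __m128i b[8],
#                                    const __m128i hpow[8]) {  // hpow[i] = H^(i+1)
#     __m128i acc = gfmul_ref(_mm_xor_si128(t, b[0]), hpow[7]);  // (T ^ B0) * H^8
#     for (int i = 1; i < 8; i++)
#       acc = _mm_xor_si128(acc, gfmul_ref(b[i], hpow[7 - i]));  // Bi * H^(8-i)
#     return acc;
#   }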
396
397sub aesgcmsiv_polyval_horner {
398 #void aesgcmsiv_polyval_horner(unsigned char T[16], // output
399 # const unsigned char* H, // H
400 # unsigned char* BUF, // Buffer
401 # unsigned int blocks); // Len2
402 #
403 # parameter 1: %rdi T - pointers to POLYVAL output
404 # parameter 2: %rsi Hp - pointer to H (user key)
405 # parameter 3: %rdx INp - pointer to input
406 # parameter 4: %rcx L - total number of blocks in input BUFFER
407 #
408 my $T = "%rdi";
409 my $Hp = "%rsi";
410 my $INp = "%rdx";
411 my $L = "%rcx";
412 my $LOC = "%r10";
413 my $LEN = "%eax";
414 my $H = "%xmm1";
415 my $RES = "%xmm0";
416
417 $code.=<<___;
418.globl aesgcmsiv_polyval_horner
419.type aesgcmsiv_polyval_horner,\@function,4
420.align 16
421aesgcmsiv_polyval_horner:
422.cfi_startproc
423 test $L, $L
424 jnz .Lpolyval_horner_start
425 ret
426
427.Lpolyval_horner_start:
  # Compute POLYVAL over the input with L sequential field multiplications
  # (Horner's rule): for each block Xi, RES = GFMUL(RES ^ Xi, H).
430
431 xorq $LOC, $LOC
432 shlq \$4, $L # L contains number of bytes to process
433
434 vmovdqa ($Hp), $H
435 vmovdqa ($T), $RES
436
437.Lpolyval_horner_loop:
438 vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi
439 call GFMUL # RES = RES * H
440
441 add \$16, $LOC
442 cmp $LOC, $L
443 jne .Lpolyval_horner_loop
444
445 # calculation of T is complete. RES=T
446 vmovdqa $RES, ($T)
447 ret
448.cfi_endproc
449.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
450___
451}
452aesgcmsiv_polyval_horner();
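
# Reference model (comments only) of the Horner loop above, again assuming the
# hypothetical gfmul_ref sketched after GFMUL:
#
#   static void polyval_horner_ref(__m128i *t, __m128i h,
#                                  const __m128i *blocks, size_t n) {
#     for (size_t i = 0; i < n; i++)
#       *t = gfmul_ref(_mm_xor_si128(*t, blocks[i]), h);   // T = (T ^ Xi) * H
#   }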
453
454# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
455# parameter 1: %rdi
456# parameter 2: %rsi
457$code.=<<___;
458.globl aes128gcmsiv_aes_ks
459.type aes128gcmsiv_aes_ks,\@function,2
460.align 16
461aes128gcmsiv_aes_ks:
462.cfi_startproc
463 vmovdqu (%rdi), %xmm1 # xmm1 = user key
464 vmovdqa %xmm1, (%rsi) # rsi points to output
465
466 vmovdqa con1(%rip), %xmm0
467 vmovdqa mask(%rip), %xmm15
468
469 movq \$8, %rax
470
471.Lks128_loop:
  addq \$16, %rsi                     # rsi points to the next round key
473 subq \$1, %rax
474 vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key
475 vaesenclast %xmm0, %xmm2, %xmm2
476 vpslld \$1, %xmm0, %xmm0
477 vpslldq \$4, %xmm1, %xmm3
478 vpxor %xmm3, %xmm1, %xmm1
479 vpslldq \$4, %xmm3, %xmm3
480 vpxor %xmm3, %xmm1, %xmm1
481 vpslldq \$4, %xmm3, %xmm3
482 vpxor %xmm3, %xmm1, %xmm1
483 vpxor %xmm2, %xmm1, %xmm1
484 vmovdqa %xmm1, (%rsi)
485 jne .Lks128_loop
486
487 vmovdqa con2(%rip), %xmm0
488 vpshufb %xmm15, %xmm1, %xmm2
489 vaesenclast %xmm0, %xmm2, %xmm2
490 vpslld \$1, %xmm0, %xmm0
491 vpslldq \$4, %xmm1, %xmm3
492 vpxor %xmm3, %xmm1, %xmm1
493 vpslldq \$4, %xmm3, %xmm3
494 vpxor %xmm3, %xmm1, %xmm1
495 vpslldq \$4, %xmm3, %xmm3
496 vpxor %xmm3, %xmm1, %xmm1
497 vpxor %xmm2, %xmm1, %xmm1
498 vmovdqa %xmm1, 16(%rsi)
499
500 vpshufb %xmm15, %xmm1, %xmm2
501 vaesenclast %xmm0, %xmm2, %xmm2
502 vpslldq \$4, %xmm1, %xmm3
503 vpxor %xmm3, %xmm1, %xmm1
504 vpslldq \$4, %xmm3, %xmm3
505 vpxor %xmm3, %xmm1, %xmm1
506 vpslldq \$4, %xmm3, %xmm3
507 vpxor %xmm3, %xmm1, %xmm1
508 vpxor %xmm2, %xmm1, %xmm1
509 vmovdqa %xmm1, 32(%rsi)
510 ret
511.cfi_endproc
512.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
513___
514
515# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
516# parameter 1: %rdi
517# parameter 2: %rsi
518$code.=<<___;
519.globl aes256gcmsiv_aes_ks
520.type aes256gcmsiv_aes_ks,\@function,2
521.align 16
522aes256gcmsiv_aes_ks:
523.cfi_startproc
524 vmovdqu (%rdi), %xmm1
525 vmovdqu 16(%rdi), %xmm3
526 vmovdqa %xmm1, (%rsi)
527 vmovdqa %xmm3, 16(%rsi)
528 vmovdqa con1(%rip), %xmm0
529 vmovdqa mask(%rip), %xmm15
530 vpxor %xmm14, %xmm14, %xmm14
531 mov \$6, %rax
532
533.Lks256_loop:
534 add \$32, %rsi
535 subq \$1, %rax
536 vpshufb %xmm15, %xmm3, %xmm2
537 vaesenclast %xmm0, %xmm2, %xmm2
538 vpslld \$1, %xmm0, %xmm0
539 vpsllq \$32, %xmm1, %xmm4
540 vpxor %xmm4, %xmm1, %xmm1
541 vpshufb con3(%rip), %xmm1, %xmm4
542 vpxor %xmm4, %xmm1, %xmm1
543 vpxor %xmm2, %xmm1, %xmm1
544 vmovdqa %xmm1, (%rsi)
545 vpshufd \$0xff, %xmm1, %xmm2
546 vaesenclast %xmm14, %xmm2, %xmm2
547 vpsllq \$32, %xmm3, %xmm4
548 vpxor %xmm4, %xmm3, %xmm3
549 vpshufb con3(%rip), %xmm3, %xmm4
550 vpxor %xmm4, %xmm3, %xmm3
551 vpxor %xmm2, %xmm3, %xmm3
552 vmovdqa %xmm3, 16(%rsi)
553 jne .Lks256_loop
554
555 vpshufb %xmm15, %xmm3, %xmm2
556 vaesenclast %xmm0, %xmm2, %xmm2
557 vpsllq \$32, %xmm1, %xmm4
558 vpxor %xmm4, %xmm1, %xmm1
559 vpshufb con3(%rip), %xmm1, %xmm4
560 vpxor %xmm4, %xmm1, %xmm1
561 vpxor %xmm2, %xmm1, %xmm1
562 vmovdqa %xmm1, 32(%rsi)
563 ret
.cfi_endproc
.size aes256gcmsiv_aes_ks,.-aes256gcmsiv_aes_ks
___
566
567sub aes128gcmsiv_aes_ks_enc_x1 {
568 my $KS1_REGA = "%xmm1";
569 my $KS1_REGB = "%xmm2";
570 my $BLOCK1 = "%xmm4";
571 my $AUXREG = "%xmm3";
572
573 my $KS_BLOCK = sub {
574 my ($reg, $reg2, $auxReg) = @_;
575 return <<___;
576 vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3
577 vpxor $auxReg, $reg, $reg
578 vpshufb con3(%rip), $reg, $auxReg
579 vpxor $auxReg, $reg, $reg
580 vpxor $reg2, $reg, $reg
581___
582 };
583
584 my $round = sub {
585 my ($i, $j) = @_;
586 return <<___;
587 vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
588 vaesenclast %xmm0, %xmm2, %xmm2
589 vpslld \$1, %xmm0, %xmm0
590 ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
591 vaesenc %xmm1, $BLOCK1, $BLOCK1
592 vmovdqa %xmm1, ${\eval(16*$i)}($j)
593___
594 };
595
596 my $roundlast = sub {
597 my ($i, $j) = @_;
598 return <<___;
599 vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
600 vaesenclast %xmm0, %xmm2, %xmm2
601 ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
602 vaesenclast %xmm1, $BLOCK1, $BLOCK1
603 vmovdqa %xmm1, ${\eval(16*$i)}($j)
604___
605 };
606
607# parameter 1: %rdi Pointer to PT
608# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
# parameter 4: %rcx Pointer to initial key
611 $code.=<<___;
612.globl aes128gcmsiv_aes_ks_enc_x1
613.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
614.align 16
615aes128gcmsiv_aes_ks_enc_x1:
616.cfi_startproc
617 vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key
618 vmovdqa 0*16(%rdi), $BLOCK1
619
620 vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key
621 vpxor %xmm1, $BLOCK1, $BLOCK1
622
623 vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1
624 vmovdqa mask(%rip), %xmm15 # xmm15 = mask
625
626 ${\$round->(1, "%rdx")}
627 ${\$round->(2, "%rdx")}
628 ${\$round->(3, "%rdx")}
629 ${\$round->(4, "%rdx")}
630 ${\$round->(5, "%rdx")}
631 ${\$round->(6, "%rdx")}
632 ${\$round->(7, "%rdx")}
633 ${\$round->(8, "%rdx")}
634
635 vmovdqa con2(%rip), %xmm0
636
637 ${\$round->(9, "%rdx")}
638 ${\$roundlast->(10, "%rdx")}
639
640 vmovdqa $BLOCK1, 0*16(%rsi)
641 ret
642.cfi_endproc
643.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
644___
645}
646aes128gcmsiv_aes_ks_enc_x1();
647
648sub aes128gcmsiv_kdf {
649 my $BLOCK1 = "%xmm9";
650 my $BLOCK2 = "%xmm10";
651 my $BLOCK3 = "%xmm11";
652 my $BLOCK4 = "%xmm12";
653 my $BLOCK5 = "%xmm13";
654 my $BLOCK6 = "%xmm14";
655 my $ONE = "%xmm13";
656 my $KSp = "%rdx";
657 my $STATE_1 = "%xmm1";
658
659 my $enc_roundx4 = sub {
660 my ($i, $j) = @_;
661 return <<___;
662 vmovdqa ${\eval($i*16)}(%rdx), $j
663 vaesenc $j, $BLOCK1, $BLOCK1
664 vaesenc $j, $BLOCK2, $BLOCK2
665 vaesenc $j, $BLOCK3, $BLOCK3
666 vaesenc $j, $BLOCK4, $BLOCK4
667___
668 };
669
670 my $enc_roundlastx4 = sub {
671 my ($i, $j) = @_;
672 return <<___;
673 vmovdqa ${\eval($i*16)}(%rdx), $j
674 vaesenclast $j, $BLOCK1, $BLOCK1
675 vaesenclast $j, $BLOCK2, $BLOCK2
676 vaesenclast $j, $BLOCK3, $BLOCK3
677 vaesenclast $j, $BLOCK4, $BLOCK4
678___
679 };
680
681# void aes128gcmsiv_kdf(const uint8_t nonce[16],
682# uint8_t *out_key_material,
683# const uint8_t *key_schedule);
684 $code.=<<___;
685.globl aes128gcmsiv_kdf
686.type aes128gcmsiv_kdf,\@function,3
687.align 16
688aes128gcmsiv_kdf:
689.cfi_startproc
690# parameter 1: %rdi Pointer to NONCE
691# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
693
694 vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
695 vmovdqa 0*16(%rdi), $BLOCK1
696 vmovdqa and_mask(%rip), $BLOCK4
697 vmovdqa one(%rip), $ONE
698 vpshufd \$0x90, $BLOCK1, $BLOCK1
699 vpand $BLOCK4, $BLOCK1, $BLOCK1
700 vpaddd $ONE, $BLOCK1, $BLOCK2
701 vpaddd $ONE, $BLOCK2, $BLOCK3
702 vpaddd $ONE, $BLOCK3, $BLOCK4
703
704 vpxor %xmm1, $BLOCK1, $BLOCK1
705 vpxor %xmm1, $BLOCK2, $BLOCK2
706 vpxor %xmm1, $BLOCK3, $BLOCK3
707 vpxor %xmm1, $BLOCK4, $BLOCK4
708
709 ${\$enc_roundx4->(1, "%xmm1")}
710 ${\$enc_roundx4->(2, "%xmm2")}
711 ${\$enc_roundx4->(3, "%xmm1")}
712 ${\$enc_roundx4->(4, "%xmm2")}
713 ${\$enc_roundx4->(5, "%xmm1")}
714 ${\$enc_roundx4->(6, "%xmm2")}
715 ${\$enc_roundx4->(7, "%xmm1")}
716 ${\$enc_roundx4->(8, "%xmm2")}
717 ${\$enc_roundx4->(9, "%xmm1")}
718 ${\$enc_roundlastx4->(10, "%xmm2")}
719
720 vmovdqa $BLOCK1, 0*16(%rsi)
721 vmovdqa $BLOCK2, 1*16(%rsi)
722 vmovdqa $BLOCK3, 2*16(%rsi)
723 vmovdqa $BLOCK4, 3*16(%rsi)
724 ret
725.cfi_endproc
726.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
727___
728}
729aes128gcmsiv_kdf();
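
# Hedged intrinsics sketch (comments only) of the derivation above: the first
# 12 nonce bytes are moved up by four bytes, a 32-bit little-endian counter
# 0..3 is placed in the low dword, and each block is encrypted with the
# expanded AES-128 key. The name kdf128_ref and the rk[11] round-key layout
# are assumptions for illustration; AES-NI intrinsics need <wmmintrin.h> and
# -maes.
#
#   static void kdf128_ref(const uint8_t nonce[16], uint8_t out[64],
#                          const __m128i rk[11]) {
#     const __m128i and_mask = _mm_set_epi32(-1, -1, -1, 0);
#     const __m128i one      = _mm_set_epi32(0, 0, 0, 1);
#     __m128i b[4];
#     b[0] = _mm_and_si128(_mm_shuffle_epi32(
#                _mm_loadu_si128((const __m128i *)nonce), 0x90), and_mask);
#     for (int i = 1; i < 4; i++) b[i] = _mm_add_epi32(b[i - 1], one);
#     for (int i = 0; i < 4; i++) {
#       __m128i s = _mm_xor_si128(b[i], rk[0]);
#       for (int r = 1; r < 10; r++) s = _mm_aesenc_si128(s, rk[r]);
#       s = _mm_aesenclast_si128(s, rk[10]);
#       _mm_storeu_si128((__m128i *)(out + 16 * i), s);
#     }
#   }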
730
731sub aes128gcmsiv_enc_msg_x4 {
732 my $CTR1 = "%xmm0";
733 my $CTR2 = "%xmm1";
734 my $CTR3 = "%xmm2";
735 my $CTR4 = "%xmm3";
736 my $ADDER = "%xmm4";
737
738 my $STATE1 = "%xmm5";
739 my $STATE2 = "%xmm6";
740 my $STATE3 = "%xmm7";
741 my $STATE4 = "%xmm8";
742
743 my $TMP = "%xmm12";
744 my $TMP2 = "%xmm13";
745 my $TMP3 = "%xmm14";
746 my $IV = "%xmm15";
747
748 my $PT = "%rdi";
749 my $CT = "%rsi";
750 my $TAG = "%rdx";
751 my $KS = "%rcx";
752 my $LEN = "%r8";
753
754 my $aes_round = sub {
755 my ($i) = @_;
756 return <<___;
757 vmovdqu ${\eval($i*16)}($KS), $TMP
758 vaesenc $TMP, $STATE1, $STATE1
759 vaesenc $TMP, $STATE2, $STATE2
760 vaesenc $TMP, $STATE3, $STATE3
761 vaesenc $TMP, $STATE4, $STATE4
762___
763 };
764
765 my $aes_lastround = sub {
766 my ($i) = @_;
767 return <<___;
768 vmovdqu ${\eval($i*16)}($KS), $TMP
769 vaesenclast $TMP, $STATE1, $STATE1
770 vaesenclast $TMP, $STATE2, $STATE2
771 vaesenclast $TMP, $STATE3, $STATE3
772 vaesenclast $TMP, $STATE4, $STATE4
773___
774 };
775
776# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
777# unsigned char* TAG, unsigned char* KS,
778# size_t byte_len);
779# parameter 1: %rdi #PT
780# parameter 2: %rsi #CT
781# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
782# parameter 4: %rcx #KS
783# parameter 5: %r8 #LEN MSG_length in bytes
784 $code.=<<___;
785.globl aes128gcmsiv_enc_msg_x4
786.type aes128gcmsiv_enc_msg_x4,\@function,5
787.align 16
788aes128gcmsiv_enc_msg_x4:
789.cfi_startproc
790 test $LEN, $LEN
791 jnz .L128_enc_msg_x4_start
792 ret
793
794.L128_enc_msg_x4_start:
795 pushq %r12
796.cfi_push %r12
797 pushq %r13
798.cfi_push %r13
799
800 shrq \$4, $LEN # LEN = num of blocks
801 movq $LEN, %r10
802 shlq \$62, %r10
803 shrq \$62, %r10
804
805 # make IV from TAG
806 vmovdqa ($TAG), $IV
807 vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00]
808
809 vmovdqu four(%rip), $ADDER # Register to increment counters
810 vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
811 vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
812 vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
813 vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
814
815 shrq \$2, $LEN
816 je .L128_enc_msg_x4_check_remainder
817
818 subq \$64, $CT
819 subq \$64, $PT
820
821.L128_enc_msg_x4_loop1:
822 addq \$64, $CT
823 addq \$64, $PT
824
825 vmovdqa $CTR1, $STATE1
826 vmovdqa $CTR2, $STATE2
827 vmovdqa $CTR3, $STATE3
828 vmovdqa $CTR4, $STATE4
829
830 vpxor ($KS), $STATE1, $STATE1
831 vpxor ($KS), $STATE2, $STATE2
832 vpxor ($KS), $STATE3, $STATE3
833 vpxor ($KS), $STATE4, $STATE4
834
835 ${\$aes_round->(1)}
836 vpaddd $ADDER, $CTR1, $CTR1
837 ${\$aes_round->(2)}
838 vpaddd $ADDER, $CTR2, $CTR2
839 ${\$aes_round->(3)}
840 vpaddd $ADDER, $CTR3, $CTR3
841 ${\$aes_round->(4)}
842 vpaddd $ADDER, $CTR4, $CTR4
843
844 ${\$aes_round->(5)}
845 ${\$aes_round->(6)}
846 ${\$aes_round->(7)}
847 ${\$aes_round->(8)}
848 ${\$aes_round->(9)}
849 ${\$aes_lastround->(10)}
850
851 # XOR with Plaintext
852 vpxor 0*16($PT), $STATE1, $STATE1
853 vpxor 1*16($PT), $STATE2, $STATE2
854 vpxor 2*16($PT), $STATE3, $STATE3
855 vpxor 3*16($PT), $STATE4, $STATE4
856
857 subq \$1, $LEN
858
859 vmovdqu $STATE1, 0*16($CT)
860 vmovdqu $STATE2, 1*16($CT)
861 vmovdqu $STATE3, 2*16($CT)
862 vmovdqu $STATE4, 3*16($CT)
863
864 jne .L128_enc_msg_x4_loop1
865
866 addq \$64,$CT
867 addq \$64,$PT
868
869.L128_enc_msg_x4_check_remainder:
870 cmpq \$0, %r10
871 je .L128_enc_msg_x4_out
872
873.L128_enc_msg_x4_loop2:
874 # enc each block separately
  # CTR1 holds the next counter value to use (even if the main loop was skipped)
876 vmovdqa $CTR1, $STATE1
877 vpaddd one(%rip), $CTR1, $CTR1 # inc counter
878
879 vpxor ($KS), $STATE1, $STATE1
880 vaesenc 16($KS), $STATE1, $STATE1
881 vaesenc 32($KS), $STATE1, $STATE1
882 vaesenc 48($KS), $STATE1, $STATE1
883 vaesenc 64($KS), $STATE1, $STATE1
884 vaesenc 80($KS), $STATE1, $STATE1
885 vaesenc 96($KS), $STATE1, $STATE1
886 vaesenc 112($KS), $STATE1, $STATE1
887 vaesenc 128($KS), $STATE1, $STATE1
888 vaesenc 144($KS), $STATE1, $STATE1
889 vaesenclast 160($KS), $STATE1, $STATE1
890
891 # XOR with plaintext
892 vpxor ($PT), $STATE1, $STATE1
893 vmovdqu $STATE1, ($CT)
894
895 addq \$16, $PT
896 addq \$16, $CT
897
898 subq \$1, %r10
899 jne .L128_enc_msg_x4_loop2
900
901.L128_enc_msg_x4_out:
902 popq %r13
903.cfi_pop %r13
904 popq %r12
905.cfi_pop %r12
906 ret
907.cfi_endproc
908.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
909___
910}
911aes128gcmsiv_enc_msg_x4();
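
# Illustrative model (comments only) of the counter mode above: the initial
# counter block is the tag with its most-significant bit forced to 1
# (OR_MASK), and only the low 32 bits are incremented per block. A hedged
# sketch of one scalar tail-loop block, with hypothetical names:
#
#   static __m128i ctr_block_encrypt_ref(__m128i ctr, __m128i pt,
#                                        const __m128i rk[11]) {
#     __m128i s = _mm_xor_si128(ctr, rk[0]);
#     for (int r = 1; r < 10; r++) s = _mm_aesenc_si128(s, rk[r]);
#     return _mm_xor_si128(pt, _mm_aesenclast_si128(s, rk[10]));
#   }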
912
913sub aes128gcmsiv_enc_msg_x8 {
914 my $STATE1 = "%xmm1";
915 my $STATE2 = "%xmm2";
916 my $STATE3 = "%xmm3";
917 my $STATE4 = "%xmm4";
918 my $STATE5 = "%xmm5";
919 my $STATE6 = "%xmm6";
920 my $STATE7 = "%xmm7";
921 my $STATE8 = "%xmm8";
922
923 my $CTR1 = "%xmm0";
924 my $CTR2 = "%xmm9";
925 my $CTR3 = "%xmm10";
926 my $CTR4 = "%xmm11";
927 my $CTR5 = "%xmm12";
928 my $CTR6 = "%xmm13";
929 my $CTR7 = "%xmm14";
930 my $SCHED = "%xmm15";
931
932 my $TMP1 = "%xmm1";
933 my $TMP2 = "%xmm2";
934
935 my $PT = "%rdi";
936 my $CT = "%rsi";
937 my $TAG = "%rdx";
938 my $KS = "%rcx";
939 my $LEN = "%r8";
940
941 my $aes_round8 = sub {
942 my ($i) = @_;
943 return <<___;
944 vmovdqu ${\eval($i*16)}($KS), $SCHED
945 vaesenc $SCHED, $STATE1, $STATE1
946 vaesenc $SCHED, $STATE2, $STATE2
947 vaesenc $SCHED, $STATE3, $STATE3
948 vaesenc $SCHED, $STATE4, $STATE4
949 vaesenc $SCHED, $STATE5, $STATE5
950 vaesenc $SCHED, $STATE6, $STATE6
951 vaesenc $SCHED, $STATE7, $STATE7
952 vaesenc $SCHED, $STATE8, $STATE8
953___
954 };
955
956 my $aes_lastround8 = sub {
957 my ($i) = @_;
958 return <<___;
959 vmovdqu ${\eval($i*16)}($KS), $SCHED
960 vaesenclast $SCHED, $STATE1, $STATE1
961 vaesenclast $SCHED, $STATE2, $STATE2
962 vaesenclast $SCHED, $STATE3, $STATE3
963 vaesenclast $SCHED, $STATE4, $STATE4
964 vaesenclast $SCHED, $STATE5, $STATE5
965 vaesenclast $SCHED, $STATE6, $STATE6
966 vaesenclast $SCHED, $STATE7, $STATE7
967 vaesenclast $SCHED, $STATE8, $STATE8
968___
969 };
970
# void aes128gcmsiv_enc_msg_x8(unsigned char* PT,
972# unsigned char* CT,
973# unsigned char* TAG,
974# unsigned char* KS,
975# size_t byte_len);
976# parameter 1: %rdi #PT
977# parameter 2: %rsi #CT
978# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
979# parameter 4: %rcx #KS
980# parameter 5: %r8 #LEN MSG_length in bytes
981 $code.=<<___;
982.globl aes128gcmsiv_enc_msg_x8
983.type aes128gcmsiv_enc_msg_x8,\@function,5
984.align 16
985aes128gcmsiv_enc_msg_x8:
986.cfi_startproc
987 test $LEN, $LEN
988 jnz .L128_enc_msg_x8_start
989 ret
990
991.L128_enc_msg_x8_start:
992 pushq %r12
993.cfi_push %r12
994 pushq %r13
995.cfi_push %r13
996 pushq %rbp
997.cfi_push %rbp
998 movq %rsp, %rbp
999.cfi_def_cfa_register rbp
1000
  # Reserve 128 bytes of 64-byte-aligned scratch space on the stack
1002 subq \$128, %rsp
1003 andq \$-64, %rsp
1004
1005 shrq \$4, $LEN # LEN = num of blocks
1006 movq $LEN, %r10
1007 shlq \$61, %r10
1008 shrq \$61, %r10
1009
1010 # make IV from TAG
1011 vmovdqu ($TAG), $TMP1
1012 vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
1013
1014 # store counter8 in the stack
1015 vpaddd seven(%rip), $TMP1, $CTR1
1016 vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07]
1017 vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
1018 vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
1019 vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
1020 vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
1021 vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
1022 vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
1023 vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
1024
1025 shrq \$3, $LEN
1026 je .L128_enc_msg_x8_check_remainder
1027
1028 subq \$128, $CT
1029 subq \$128, $PT
1030
1031.L128_enc_msg_x8_loop1:
1032 addq \$128, $CT
1033 addq \$128, $PT
1034
1035 vmovdqa $CTR1, $STATE1
1036 vmovdqa $CTR2, $STATE2
1037 vmovdqa $CTR3, $STATE3
1038 vmovdqa $CTR4, $STATE4
1039 vmovdqa $CTR5, $STATE5
1040 vmovdqa $CTR6, $STATE6
1041 vmovdqa $CTR7, $STATE7
1042 # move from stack
1043 vmovdqu (%rsp), $STATE8
1044
1045 vpxor ($KS), $STATE1, $STATE1
1046 vpxor ($KS), $STATE2, $STATE2
1047 vpxor ($KS), $STATE3, $STATE3
1048 vpxor ($KS), $STATE4, $STATE4
1049 vpxor ($KS), $STATE5, $STATE5
1050 vpxor ($KS), $STATE6, $STATE6
1051 vpxor ($KS), $STATE7, $STATE7
1052 vpxor ($KS), $STATE8, $STATE8
1053
1054 ${\$aes_round8->(1)}
1055 vmovdqu (%rsp), $CTR7 # deal with CTR8
1056 vpaddd eight(%rip), $CTR7, $CTR7
1057 vmovdqu $CTR7, (%rsp)
1058 ${\$aes_round8->(2)}
1059 vpsubd one(%rip), $CTR7, $CTR7
1060 ${\$aes_round8->(3)}
1061 vpaddd eight(%rip), $CTR1, $CTR1
1062 ${\$aes_round8->(4)}
1063 vpaddd eight(%rip), $CTR2, $CTR2
1064 ${\$aes_round8->(5)}
1065 vpaddd eight(%rip), $CTR3, $CTR3
1066 ${\$aes_round8->(6)}
1067 vpaddd eight(%rip), $CTR4, $CTR4
1068 ${\$aes_round8->(7)}
1069 vpaddd eight(%rip), $CTR5, $CTR5
1070 ${\$aes_round8->(8)}
1071 vpaddd eight(%rip), $CTR6, $CTR6
1072 ${\$aes_round8->(9)}
1073 ${\$aes_lastround8->(10)}
1074
1075 # XOR with Plaintext
1076 vpxor 0*16($PT), $STATE1, $STATE1
1077 vpxor 1*16($PT), $STATE2, $STATE2
1078 vpxor 2*16($PT), $STATE3, $STATE3
1079 vpxor 3*16($PT), $STATE4, $STATE4
1080 vpxor 4*16($PT), $STATE5, $STATE5
1081 vpxor 5*16($PT), $STATE6, $STATE6
1082 vpxor 6*16($PT), $STATE7, $STATE7
1083 vpxor 7*16($PT), $STATE8, $STATE8
1084
1085 dec $LEN
1086
1087 vmovdqu $STATE1, 0*16($CT)
1088 vmovdqu $STATE2, 1*16($CT)
1089 vmovdqu $STATE3, 2*16($CT)
1090 vmovdqu $STATE4, 3*16($CT)
1091 vmovdqu $STATE5, 4*16($CT)
1092 vmovdqu $STATE6, 5*16($CT)
1093 vmovdqu $STATE7, 6*16($CT)
1094 vmovdqu $STATE8, 7*16($CT)
1095
1096 jne .L128_enc_msg_x8_loop1
1097
1098 addq \$128, $CT
1099 addq \$128, $PT
1100
1101.L128_enc_msg_x8_check_remainder:
1102 cmpq \$0, %r10
1103 je .L128_enc_msg_x8_out
1104
1105.L128_enc_msg_x8_loop2:
1106 # enc each block separately
  # CTR1 holds the next counter value to use (even if the main loop was skipped)
1108 vmovdqa $CTR1, $STATE1
1109 vpaddd one(%rip), $CTR1, $CTR1 # inc counter
1110
1111 vpxor ($KS), $STATE1, $STATE1
1112 vaesenc 16($KS), $STATE1, $STATE1
1113 vaesenc 32($KS), $STATE1, $STATE1
1114 vaesenc 48($KS), $STATE1, $STATE1
1115 vaesenc 64($KS), $STATE1, $STATE1
1116 vaesenc 80($KS), $STATE1, $STATE1
1117 vaesenc 96($KS), $STATE1, $STATE1
1118 vaesenc 112($KS), $STATE1, $STATE1
1119 vaesenc 128($KS), $STATE1, $STATE1
1120 vaesenc 144($KS), $STATE1, $STATE1
1121 vaesenclast 160($KS), $STATE1, $STATE1
1122
1123 # XOR with Plaintext
1124 vpxor ($PT), $STATE1, $STATE1
1125
1126 vmovdqu $STATE1, ($CT)
1127
1128 addq \$16, $PT
1129 addq \$16, $CT
1130
1131 decq %r10
1132 jne .L128_enc_msg_x8_loop2
1133
1134.L128_enc_msg_x8_out:
1135 movq %rbp, %rsp
1136.cfi_def_cfa_register %rsp
1137 popq %rbp
1138.cfi_pop %rbp
1139 popq %r13
1140.cfi_pop %r13
1141 popq %r12
1142.cfi_pop %r12
1143 ret
1144.cfi_endproc
1145.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
1146___
1147}
1148aes128gcmsiv_enc_msg_x8();
1149
1150sub aesgcmsiv_dec {
1151 my ($aes256) = @_;
1152
1153 my $T = "%xmm0";
1154 my $TMP0 = "%xmm1";
1155 my $TMP1 = "%xmm2";
1156 my $TMP2 = "%xmm3";
1157 my $TMP3 = "%xmm4";
1158 my $TMP4 = "%xmm5";
1159 my $TMP5 = "%xmm6";
1160 my $CTR1 = "%xmm7";
1161 my $CTR2 = "%xmm8";
1162 my $CTR3 = "%xmm9";
1163 my $CTR4 = "%xmm10";
1164 my $CTR5 = "%xmm11";
1165 my $CTR6 = "%xmm12";
1166 my $CTR = "%xmm15";
1167 my $CT = "%rdi";
1168 my $PT = "%rsi";
1169 my $POL = "%rdx";
1170 my $Htbl = "%rcx";
1171 my $KS = "%r8";
1172 my $LEN = "%r9";
1173 my $secureBuffer = "%rax";
1174 my $HTABLE_ROUNDS = "%xmm13";
1175
1176 my $labelPrefix = "128";
1177 if ($aes256) {
1178 $labelPrefix = "256";
1179 }
1180
1181 my $aes_round_dec = sub {
1182 my ($i) = @_;
1183 return <<___;
1184 vmovdqu ${\eval($i*16)}($KS), $TMP3
1185 vaesenc $TMP3, $CTR1, $CTR1
1186 vaesenc $TMP3, $CTR2, $CTR2
1187 vaesenc $TMP3, $CTR3, $CTR3
1188 vaesenc $TMP3, $CTR4, $CTR4
1189 vaesenc $TMP3, $CTR5, $CTR5
1190 vaesenc $TMP3, $CTR6, $CTR6
1191___
1192 };
1193
1194 my $aes_lastround_dec = sub {
1195 my ($i) = @_;
1196 return <<___;
1197 vmovdqu ${\eval($i*16)}($KS), $TMP3
1198 vaesenclast $TMP3, $CTR1, $CTR1
1199 vaesenclast $TMP3, $CTR2, $CTR2
1200 vaesenclast $TMP3, $CTR3, $CTR3
1201 vaesenclast $TMP3, $CTR4, $CTR4
1202 vaesenclast $TMP3, $CTR5, $CTR5
1203 vaesenclast $TMP3, $CTR6, $CTR6
1204___
1205 };
1206
1207 my $schoolbook = sub {
1208 my ($i) = @_;
1209 return <<___;
1210 vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
1211 vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
1212
1213 vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
1214 vpxor $TMP3, $TMP0, $TMP0
1215 vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
1216 vpxor $TMP3, $TMP1, $TMP1
1217 vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
1218 vpxor $TMP3, $TMP2, $TMP2
1219 vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
1220 vpxor $TMP3, $TMP0, $TMP0
1221___
1222 };
1223
1224 if ($aes256) {
1225 $code.=<<___;
1226.globl aes256gcmsiv_dec
1227.type aes256gcmsiv_dec,\@function,6
1228.align 16
1229aes256gcmsiv_dec:
1230___
1231 } else {
1232 $code.=<<___;
1233.globl aes128gcmsiv_dec
1234.type aes128gcmsiv_dec,\@function,6
1235.align 16
1236aes128gcmsiv_dec:
1237___
1238 }
1239
1240 $code.=<<___;
1241.cfi_startproc
1242 test \$~15, $LEN
1243 jnz .L${labelPrefix}_dec_start
1244 ret
1245
1246.L${labelPrefix}_dec_start:
1247 vzeroupper
1248 vmovdqa ($POL), $T
1249 movq $POL, $secureBuffer
1250
1251 leaq 32($secureBuffer), $secureBuffer
1252 leaq 32($Htbl), $Htbl
1253
1254 # make CTRBLKs from given tag.
1255 vmovdqu ($CT,$LEN), $CTR
1256 vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00]
1257 andq \$~15, $LEN
1258
  # If fewer than 6 blocks remain, process them one at a time
1260 cmp \$96, $LEN
1261 jb .L${labelPrefix}_dec_loop2
1262
1263 # Decrypt the first six blocks
1264 sub \$96, $LEN
1265 vmovdqa $CTR, $CTR1
1266 vpaddd one(%rip), $CTR1, $CTR2
1267 vpaddd two(%rip), $CTR1, $CTR3
1268 vpaddd one(%rip), $CTR3, $CTR4
1269 vpaddd two(%rip), $CTR3, $CTR5
1270 vpaddd one(%rip), $CTR5, $CTR6
1271 vpaddd two(%rip), $CTR5, $CTR
1272
1273 vpxor ($KS), $CTR1, $CTR1
1274 vpxor ($KS), $CTR2, $CTR2
1275 vpxor ($KS), $CTR3, $CTR3
1276 vpxor ($KS), $CTR4, $CTR4
1277 vpxor ($KS), $CTR5, $CTR5
1278 vpxor ($KS), $CTR6, $CTR6
1279
1280 ${\$aes_round_dec->(1)}
1281 ${\$aes_round_dec->(2)}
1282 ${\$aes_round_dec->(3)}
1283 ${\$aes_round_dec->(4)}
1284 ${\$aes_round_dec->(5)}
1285 ${\$aes_round_dec->(6)}
1286 ${\$aes_round_dec->(7)}
1287 ${\$aes_round_dec->(8)}
1288 ${\$aes_round_dec->(9)}
1289___
1290
1291if ($aes256) {
1292$code.=<<___;
1293 ${\$aes_round_dec->(10)}
1294 ${\$aes_round_dec->(11)}
1295 ${\$aes_round_dec->(12)}
1296 ${\$aes_round_dec->(13)}
1297 ${\$aes_lastround_dec->(14)}
1298___
1299} else {
1300$code.=<<___;
1301 ${\$aes_lastround_dec->(10)}
1302___
1303}
1304
1305$code.=<<___;
1306 # XOR with CT
1307 vpxor 0*16($CT), $CTR1, $CTR1
1308 vpxor 1*16($CT), $CTR2, $CTR2
1309 vpxor 2*16($CT), $CTR3, $CTR3
1310 vpxor 3*16($CT), $CTR4, $CTR4
1311 vpxor 4*16($CT), $CTR5, $CTR5
1312 vpxor 5*16($CT), $CTR6, $CTR6
1313
1314 vmovdqu $CTR1, 0*16($PT)
1315 vmovdqu $CTR2, 1*16($PT)
1316 vmovdqu $CTR3, 2*16($PT)
1317 vmovdqu $CTR4, 3*16($PT)
1318 vmovdqu $CTR5, 4*16($PT)
1319 vmovdqu $CTR6, 5*16($PT)
1320
1321 addq \$96, $CT
1322 addq \$96, $PT
1323 jmp .L${labelPrefix}_dec_loop1
1324
1325# Decrypt 6 blocks each time while hashing previous 6 blocks
1326.align 64
1327.L${labelPrefix}_dec_loop1:
1328 cmp \$96, $LEN
1329 jb .L${labelPrefix}_dec_finish_96
1330 sub \$96, $LEN
1331
1332 vmovdqa $CTR6, $TMP5
1333 vmovdqa $CTR5, 1*16-32($secureBuffer)
1334 vmovdqa $CTR4, 2*16-32($secureBuffer)
1335 vmovdqa $CTR3, 3*16-32($secureBuffer)
1336 vmovdqa $CTR2, 4*16-32($secureBuffer)
1337 vmovdqa $CTR1, 5*16-32($secureBuffer)
1338
1339 vmovdqa $CTR, $CTR1
1340 vpaddd one(%rip), $CTR1, $CTR2
1341 vpaddd two(%rip), $CTR1, $CTR3
1342 vpaddd one(%rip), $CTR3, $CTR4
1343 vpaddd two(%rip), $CTR3, $CTR5
1344 vpaddd one(%rip), $CTR5, $CTR6
1345 vpaddd two(%rip), $CTR5, $CTR
1346
1347 vmovdqa ($KS), $TMP3
1348 vpxor $TMP3, $CTR1, $CTR1
1349 vpxor $TMP3, $CTR2, $CTR2
1350 vpxor $TMP3, $CTR3, $CTR3
1351 vpxor $TMP3, $CTR4, $CTR4
1352 vpxor $TMP3, $CTR5, $CTR5
1353 vpxor $TMP3, $CTR6, $CTR6
1354
1355 vmovdqu 0*16-32($Htbl), $TMP3
1356 vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
1357 vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
1358 vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
1359 vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
1360 vpxor $TMP3, $TMP0, $TMP0
1361
1362 ${\$aes_round_dec->(1)}
1363 ${\$schoolbook->(1)}
1364
1365 ${\$aes_round_dec->(2)}
1366 ${\$schoolbook->(2)}
1367
1368 ${\$aes_round_dec->(3)}
1369 ${\$schoolbook->(3)}
1370
1371 ${\$aes_round_dec->(4)}
1372 ${\$schoolbook->(4)}
1373
1374 ${\$aes_round_dec->(5)}
1375 ${\$aes_round_dec->(6)}
1376 ${\$aes_round_dec->(7)}
1377
1378 vmovdqa 5*16-32($secureBuffer), $TMP5
1379 vpxor $T, $TMP5, $TMP5
1380 vmovdqu 5*16-32($Htbl), $TMP4
1381
1382 vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
1383 vpxor $TMP3, $TMP0, $TMP0
1384 vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
1385 vpxor $TMP3, $TMP1, $TMP1
1386 vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
1387 vpxor $TMP3, $TMP2, $TMP2
1388 vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
1389 vpxor $TMP3, $TMP0, $TMP0
1390
1391 ${\$aes_round_dec->(8)}
1392
1393 vpsrldq \$8, $TMP0, $TMP3
1394 vpxor $TMP3, $TMP1, $TMP4
1395 vpslldq \$8, $TMP0, $TMP3
1396 vpxor $TMP3, $TMP2, $T
1397
1398 vmovdqa poly(%rip), $TMP2
1399
1400 ${\$aes_round_dec->(9)}
1401___
1402
1403if ($aes256) {
1404$code.=<<___;
1405 ${\$aes_round_dec->(10)}
1406 ${\$aes_round_dec->(11)}
1407 ${\$aes_round_dec->(12)}
1408 ${\$aes_round_dec->(13)}
1409 vmovdqu 14*16($KS), $TMP5
1410___
1411} else {
1412$code.=<<___;
1413 vmovdqu 10*16($KS), $TMP5
1414___
1415}
1416
1417$code.=<<___;
1418 vpalignr \$8, $T, $T, $TMP1
1419 vpclmulqdq \$0x10, $TMP2, $T, $T
1420 vpxor $T, $TMP1, $T
1421
1422 vpxor 0*16($CT), $TMP5, $TMP3
1423 vaesenclast $TMP3, $CTR1, $CTR1
1424 vpxor 1*16($CT), $TMP5, $TMP3
1425 vaesenclast $TMP3, $CTR2, $CTR2
1426 vpxor 2*16($CT), $TMP5, $TMP3
1427 vaesenclast $TMP3, $CTR3, $CTR3
1428 vpxor 3*16($CT), $TMP5, $TMP3
1429 vaesenclast $TMP3, $CTR4, $CTR4
1430 vpxor 4*16($CT), $TMP5, $TMP3
1431 vaesenclast $TMP3, $CTR5, $CTR5
1432 vpxor 5*16($CT), $TMP5, $TMP3
1433 vaesenclast $TMP3, $CTR6, $CTR6
1434
1435 vpalignr \$8, $T, $T, $TMP1
1436 vpclmulqdq \$0x10, $TMP2, $T, $T
1437 vpxor $T, $TMP1, $T
1438
1439 vmovdqu $CTR1, 0*16($PT)
1440 vmovdqu $CTR2, 1*16($PT)
1441 vmovdqu $CTR3, 2*16($PT)
1442 vmovdqu $CTR4, 3*16($PT)
1443 vmovdqu $CTR5, 4*16($PT)
1444 vmovdqu $CTR6, 5*16($PT)
1445
1446 vpxor $TMP4, $T, $T
1447
1448 lea 96($CT), $CT
1449 lea 96($PT), $PT
1450 jmp .L${labelPrefix}_dec_loop1
1451
1452.L${labelPrefix}_dec_finish_96:
1453 vmovdqa $CTR6, $TMP5
1454 vmovdqa $CTR5, 1*16-32($secureBuffer)
1455 vmovdqa $CTR4, 2*16-32($secureBuffer)
1456 vmovdqa $CTR3, 3*16-32($secureBuffer)
1457 vmovdqa $CTR2, 4*16-32($secureBuffer)
1458 vmovdqa $CTR1, 5*16-32($secureBuffer)
1459
1460 vmovdqu 0*16-32($Htbl), $TMP3
1461 vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
1462 vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
1463 vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
1464 vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
1465 vpxor $TMP3, $TMP0, $TMP0
1466
1467 ${\$schoolbook->(1)}
1468 ${\$schoolbook->(2)}
1469 ${\$schoolbook->(3)}
1470 ${\$schoolbook->(4)}
1471
1472 vmovdqu 5*16-32($secureBuffer), $TMP5
1473 vpxor $T, $TMP5, $TMP5
1474 vmovdqu 5*16-32($Htbl), $TMP4
1475 vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
1476 vpxor $TMP3, $TMP1, $TMP1
1477 vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
1478 vpxor $TMP3, $TMP2, $TMP2
1479 vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
1480 vpxor $TMP3, $TMP0, $TMP0
1481 vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
1482 vpxor $TMP3, $TMP0, $TMP0
1483
1484 vpsrldq \$8, $TMP0, $TMP3
1485 vpxor $TMP3, $TMP1, $TMP4
1486 vpslldq \$8, $TMP0, $TMP3
1487 vpxor $TMP3, $TMP2, $T
1488
1489 vmovdqa poly(%rip), $TMP2
1490
1491 vpalignr \$8, $T, $T, $TMP1
1492 vpclmulqdq \$0x10, $TMP2, $T, $T
1493 vpxor $T, $TMP1, $T
1494
1495 vpalignr \$8, $T, $T, $TMP1
1496 vpclmulqdq \$0x10, $TMP2, $T, $T
1497 vpxor $T, $TMP1, $T
1498
1499 vpxor $TMP4, $T, $T
1500
1501.L${labelPrefix}_dec_loop2:
  # Decrypt any remaining whole blocks, one at a time.

  # If no whole blocks remain, we are done.
1505 cmp \$16, $LEN
1506 jb .L${labelPrefix}_dec_out
1507 sub \$16, $LEN
1508
1509 vmovdqa $CTR, $TMP1
1510 vpaddd one(%rip), $CTR, $CTR
1511
1512 vpxor 0*16($KS), $TMP1, $TMP1
1513 vaesenc 1*16($KS), $TMP1, $TMP1
1514 vaesenc 2*16($KS), $TMP1, $TMP1
1515 vaesenc 3*16($KS), $TMP1, $TMP1
1516 vaesenc 4*16($KS), $TMP1, $TMP1
1517 vaesenc 5*16($KS), $TMP1, $TMP1
1518 vaesenc 6*16($KS), $TMP1, $TMP1
1519 vaesenc 7*16($KS), $TMP1, $TMP1
1520 vaesenc 8*16($KS), $TMP1, $TMP1
1521 vaesenc 9*16($KS), $TMP1, $TMP1
1522___
1523if ($aes256) {
1524$code.=<<___;
1525 vaesenc 10*16($KS), $TMP1, $TMP1
1526 vaesenc 11*16($KS), $TMP1, $TMP1
1527 vaesenc 12*16($KS), $TMP1, $TMP1
1528 vaesenc 13*16($KS), $TMP1, $TMP1
1529 vaesenclast 14*16($KS), $TMP1, $TMP1
1530___
1531} else {
1532$code.=<<___;
1533 vaesenclast 10*16($KS), $TMP1, $TMP1
1534___
1535}
1536
1537$code.=<<___;
1538 vpxor ($CT), $TMP1, $TMP1
1539 vmovdqu $TMP1, ($PT)
1540 addq \$16, $CT
1541 addq \$16, $PT
1542
1543 vpxor $TMP1, $T, $T
1544 vmovdqa -32($Htbl), $TMP0
1545 call GFMUL
1546
1547 jmp .L${labelPrefix}_dec_loop2
1548
1549.L${labelPrefix}_dec_out:
1550 vmovdqu $T, ($POL)
1551 ret
1552.cfi_endproc
1553___
1554
1555 if ($aes256) {
1556 $code.=<<___;
1557.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
1558___
1559 } else {
1560 $code.=<<___;
1561.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
1562___
1563 }
1564}
1565
1566aesgcmsiv_dec(0); # emit 128-bit version
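
# Illustrative model (comments only) of the decryption routine emitted above
# (and again below for AES-256): each ciphertext block C[i] is XORed with the
# encryption of counter block ((tag | 1<<127) + i), with the addition confined
# to the low 32 bits, and the POLYVAL state at ($POL) is advanced over the
# resulting plaintext blocks as T = (T ^ P[i]) * H. The six-way main loop
# computes the same result using the precomputed powers of H in Htbl.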
1567
1568sub aes128gcmsiv_ecb_enc_block {
1569 my $STATE_1 = "%xmm1";
1570 my $KSp = "%rdx";
1571
1572 # parameter 1: PT %rdi (pointer to 128 bit)
1573 # parameter 2: CT %rsi (pointer to 128 bit)
1574 # parameter 3: ks %rdx (pointer to ks)
1575 $code.=<<___;
1576.globl aes128gcmsiv_ecb_enc_block
1577.type aes128gcmsiv_ecb_enc_block,\@function,3
1578.align 16
1579aes128gcmsiv_ecb_enc_block:
1580.cfi_startproc
1581 vmovdqa (%rdi), $STATE_1
1582
1583 vpxor ($KSp), $STATE_1, $STATE_1
1584 vaesenc 1*16($KSp), $STATE_1, $STATE_1
1585 vaesenc 2*16($KSp), $STATE_1, $STATE_1
1586 vaesenc 3*16($KSp), $STATE_1, $STATE_1
1587 vaesenc 4*16($KSp), $STATE_1, $STATE_1
1588 vaesenc 5*16($KSp), $STATE_1, $STATE_1
1589 vaesenc 6*16($KSp), $STATE_1, $STATE_1
1590 vaesenc 7*16($KSp), $STATE_1, $STATE_1
1591 vaesenc 8*16($KSp), $STATE_1, $STATE_1
1592 vaesenc 9*16($KSp), $STATE_1, $STATE_1
1593 vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV
1594
1595 vmovdqa $STATE_1, (%rsi)
1596
1597 ret
1598.cfi_endproc
1599.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
1600___
1601}
1602aes128gcmsiv_ecb_enc_block();
1603
1604sub aes256gcmsiv_aes_ks_enc_x1 {
1605 my $KS = "%rdx";
1606 my $KEYp = "%rcx";
1607 my $CON_MASK = "%xmm0";
1608 my $MASK_256 = "%xmm15";
1609 my $KEY_1 = "%xmm1";
1610 my $KEY_2 = "%xmm3";
1611 my $BLOCK1 = "%xmm8";
1612 my $AUX_REG = "%xmm14";
1613 my $PT = "%rdi";
1614 my $CT = "%rsi";
1615
1616 my $round_double = sub {
1617 my ($i, $j) = @_;
1618 return <<___;
1619 vpshufb %xmm15, %xmm3, %xmm2
1620 vaesenclast %xmm0, %xmm2, %xmm2
1621 vpslld \$1, %xmm0, %xmm0
1622 vpslldq \$4, %xmm1, %xmm4
1623 vpxor %xmm4, %xmm1, %xmm1
1624 vpslldq \$4, %xmm4, %xmm4
1625 vpxor %xmm4, %xmm1, %xmm1
1626 vpslldq \$4, %xmm4, %xmm4
1627 vpxor %xmm4, %xmm1, %xmm1
1628 vpxor %xmm2, %xmm1, %xmm1
1629 vaesenc %xmm1, $BLOCK1, $BLOCK1
1630 vmovdqu %xmm1, ${\eval(16*$i)}($KS)
1631
1632 vpshufd \$0xff, %xmm1, %xmm2
1633 vaesenclast %xmm14, %xmm2, %xmm2
1634 vpslldq \$4, %xmm3, %xmm4
1635 vpxor %xmm4, %xmm3, %xmm3
1636 vpslldq \$4, %xmm4, %xmm4
1637 vpxor %xmm4, %xmm3, %xmm3
1638 vpslldq \$4, %xmm4, %xmm4
1639 vpxor %xmm4, %xmm3, %xmm3
1640 vpxor %xmm2, %xmm3, %xmm3
1641 vaesenc %xmm3, $BLOCK1, $BLOCK1
1642 vmovdqu %xmm3, ${\eval(16*$j)}($KS)
1643___
1644 };
1645
1646 my $round_last = sub {
1647 my ($i) = @_;
1648 return <<___;
1649 vpshufb %xmm15, %xmm3, %xmm2
1650 vaesenclast %xmm0, %xmm2, %xmm2
1651 vpslldq \$4, %xmm1, %xmm4
1652 vpxor %xmm4, %xmm1, %xmm1
1653 vpslldq \$4, %xmm4, %xmm4
1654 vpxor %xmm4, %xmm1, %xmm1
1655 vpslldq \$4, %xmm4, %xmm4
1656 vpxor %xmm4, %xmm1, %xmm1
1657 vpxor %xmm2, %xmm1, %xmm1
1658 vaesenclast %xmm1, $BLOCK1, $BLOCK1
1659 vmovdqu %xmm1, ${\eval(16*$i)}($KS)
1660___
1661 };
1662
1663 # parameter 1: %rdi Pointer to PT1
1664 # parameter 2: %rsi Pointer to CT1
1665 # parameter 3: %rdx Pointer to KS
1666 # parameter 4: %rcx Pointer to initial key
1667 $code.=<<___;
1668.globl aes256gcmsiv_aes_ks_enc_x1
1669.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
1670.align 16
1671aes256gcmsiv_aes_ks_enc_x1:
1672.cfi_startproc
1673 vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1
1674 vmovdqa mask(%rip), $MASK_256 # MASK_256
1675 vmovdqa ($PT), $BLOCK1
1676 vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key
1677 vmovdqa 16($KEYp), $KEY_2
1678 vpxor $KEY_1, $BLOCK1, $BLOCK1
1679 vaesenc $KEY_2, $BLOCK1, $BLOCK1
1680 vmovdqu $KEY_1, ($KS) # First round key
1681 vmovdqu $KEY_2, 16($KS)
1682 vpxor $AUX_REG, $AUX_REG, $AUX_REG
1683
1684 ${\$round_double->(2, 3)}
1685 ${\$round_double->(4, 5)}
1686 ${\$round_double->(6, 7)}
1687 ${\$round_double->(8, 9)}
1688 ${\$round_double->(10, 11)}
1689 ${\$round_double->(12, 13)}
1690 ${\$round_last->(14)}
1691 vmovdqa $BLOCK1, ($CT)
1692 ret
1693.cfi_endproc
1694.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
1695___
1696}
1697aes256gcmsiv_aes_ks_enc_x1();
1698
1699sub aes256gcmsiv_ecb_enc_block {
1700 my $STATE_1 = "%xmm1";
1701 my $PT = "%rdi";
1702 my $CT = "%rsi";
1703 my $KSp = "%rdx";
1704
1705 # parameter 1: PT %rdi (pointer to 128 bit)
1706 # parameter 2: CT %rsi (pointer to 128 bit)
1707 # parameter 3: ks %rdx (pointer to ks)
1708 $code.=<<___;
1709.globl aes256gcmsiv_ecb_enc_block
1710.type aes256gcmsiv_ecb_enc_block,\@function,3
1711.align 16
1712aes256gcmsiv_ecb_enc_block:
1713.cfi_startproc
1714 vmovdqa (%rdi), $STATE_1
1715 vpxor ($KSp), $STATE_1, $STATE_1
1716 vaesenc 1*16($KSp), $STATE_1, $STATE_1
1717 vaesenc 2*16($KSp), $STATE_1, $STATE_1
1718 vaesenc 3*16($KSp), $STATE_1, $STATE_1
1719 vaesenc 4*16($KSp), $STATE_1, $STATE_1
1720 vaesenc 5*16($KSp), $STATE_1, $STATE_1
1721 vaesenc 6*16($KSp), $STATE_1, $STATE_1
1722 vaesenc 7*16($KSp), $STATE_1, $STATE_1
1723 vaesenc 8*16($KSp), $STATE_1, $STATE_1
1724 vaesenc 9*16($KSp), $STATE_1, $STATE_1
1725 vaesenc 10*16($KSp), $STATE_1, $STATE_1
1726 vaesenc 11*16($KSp), $STATE_1, $STATE_1
1727 vaesenc 12*16($KSp), $STATE_1, $STATE_1
1728 vaesenc 13*16($KSp), $STATE_1, $STATE_1
1729 vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV
1730 vmovdqa $STATE_1, (%rsi)
1731 ret
1732.cfi_endproc
1733.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
1734___
1735}
1736aes256gcmsiv_ecb_enc_block();
1737
1738sub aes256gcmsiv_enc_msg_x4 {
1739 my $CTR1 = "%xmm0";
1740 my $CTR2 = "%xmm1";
1741 my $CTR3 = "%xmm2";
1742 my $CTR4 = "%xmm3";
1743 my $ADDER = "%xmm4";
1744
1745 my $STATE1 = "%xmm5";
1746 my $STATE2 = "%xmm6";
1747 my $STATE3 = "%xmm7";
1748 my $STATE4 = "%xmm8";
1749
1750 my $TMP = "%xmm12";
1751 my $TMP2 = "%xmm13";
1752 my $TMP3 = "%xmm14";
1753 my $IV = "%xmm15";
1754
1755 my $PT = "%rdi";
1756 my $CT = "%rsi";
1757 my $TAG = "%rdx";
1758 my $KS = "%rcx";
1759 my $LEN = "%r8";
1760
1761 my $aes_round = sub {
1762 my ($i) = @_;
1763 return <<___;
1764 vmovdqu ${\eval($i*16)}($KS), $TMP
1765 vaesenc $TMP, $STATE1, $STATE1
1766 vaesenc $TMP, $STATE2, $STATE2
1767 vaesenc $TMP, $STATE3, $STATE3
1768 vaesenc $TMP, $STATE4, $STATE4
1769___
1770 };
1771
1772 my $aes_lastround = sub {
1773 my ($i) = @_;
1774 return <<___;
1775 vmovdqu ${\eval($i*16)}($KS), $TMP
1776 vaesenclast $TMP, $STATE1, $STATE1
1777 vaesenclast $TMP, $STATE2, $STATE2
1778 vaesenclast $TMP, $STATE3, $STATE3
1779 vaesenclast $TMP, $STATE4, $STATE4
1780___
1781 };
1782
1783 # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
1784 # unsigned char* TAG, unsigned char* KS,
1785 # size_t byte_len);
1786 # parameter 1: %rdi #PT
1787 # parameter 2: %rsi #CT
1788 # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
1789 # parameter 4: %rcx #KS
1790 # parameter 5: %r8 #LEN MSG_length in bytes
1791 $code.=<<___;
1792.globl aes256gcmsiv_enc_msg_x4
1793.type aes256gcmsiv_enc_msg_x4,\@function,5
1794.align 16
1795aes256gcmsiv_enc_msg_x4:
1796.cfi_startproc
1797 test $LEN, $LEN
1798 jnz .L256_enc_msg_x4_start
1799 ret
1800
1801.L256_enc_msg_x4_start:
1802 movq $LEN, %r10
1803 shrq \$4, $LEN # LEN = num of blocks
1804 shlq \$60, %r10
1805 jz .L256_enc_msg_x4_start2
1806 addq \$1, $LEN
1807
1808.L256_enc_msg_x4_start2:
1809 movq $LEN, %r10
1810 shlq \$62, %r10
1811 shrq \$62, %r10
1812
1813 # make IV from TAG
1814 vmovdqa ($TAG), $IV
1815 vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]
1816
1817 vmovdqa four(%rip), $ADDER # Register to increment counters
1818 vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
1819 vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
1820 vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
1821 vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
1822
1823 shrq \$2, $LEN
1824 je .L256_enc_msg_x4_check_remainder
1825
1826 subq \$64, $CT
1827 subq \$64, $PT
1828
1829.L256_enc_msg_x4_loop1:
1830 addq \$64, $CT
1831 addq \$64, $PT
1832
1833 vmovdqa $CTR1, $STATE1
1834 vmovdqa $CTR2, $STATE2
1835 vmovdqa $CTR3, $STATE3
1836 vmovdqa $CTR4, $STATE4
1837
1838 vpxor ($KS), $STATE1, $STATE1
1839 vpxor ($KS), $STATE2, $STATE2
1840 vpxor ($KS), $STATE3, $STATE3
1841 vpxor ($KS), $STATE4, $STATE4
1842
1843 ${\$aes_round->(1)}
1844 vpaddd $ADDER, $CTR1, $CTR1
1845 ${\$aes_round->(2)}
1846 vpaddd $ADDER, $CTR2, $CTR2
1847 ${\$aes_round->(3)}
1848 vpaddd $ADDER, $CTR3, $CTR3
1849 ${\$aes_round->(4)}
1850 vpaddd $ADDER, $CTR4, $CTR4
1851
1852 ${\$aes_round->(5)}
1853 ${\$aes_round->(6)}
1854 ${\$aes_round->(7)}
1855 ${\$aes_round->(8)}
1856 ${\$aes_round->(9)}
1857 ${\$aes_round->(10)}
1858 ${\$aes_round->(11)}
1859 ${\$aes_round->(12)}
1860 ${\$aes_round->(13)}
1861 ${\$aes_lastround->(14)}
1862
1863 # XOR with Plaintext
1864 vpxor 0*16($PT), $STATE1, $STATE1
1865 vpxor 1*16($PT), $STATE2, $STATE2
1866 vpxor 2*16($PT), $STATE3, $STATE3
1867 vpxor 3*16($PT), $STATE4, $STATE4
1868
1869 subq \$1, $LEN
1870
1871 vmovdqu $STATE1, 0*16($CT)
1872 vmovdqu $STATE2, 1*16($CT)
1873 vmovdqu $STATE3, 2*16($CT)
1874 vmovdqu $STATE4, 3*16($CT)
1875
1876 jne .L256_enc_msg_x4_loop1
1877
1878 addq \$64, $CT
1879 addq \$64, $PT
1880
1881.L256_enc_msg_x4_check_remainder:
1882 cmpq \$0, %r10
1883 je .L256_enc_msg_x4_out
1884
1885.L256_enc_msg_x4_loop2:
1886 # encrypt each block separately
  # CTR1 holds the next counter value to use (even if the main loop was skipped)
1888
1889 vmovdqa $CTR1, $STATE1
1890 vpaddd one(%rip), $CTR1, $CTR1 # inc counter
1891 vpxor ($KS), $STATE1, $STATE1
1892 vaesenc 16($KS), $STATE1, $STATE1
1893 vaesenc 32($KS), $STATE1, $STATE1
1894 vaesenc 48($KS), $STATE1, $STATE1
1895 vaesenc 64($KS), $STATE1, $STATE1
1896 vaesenc 80($KS), $STATE1, $STATE1
1897 vaesenc 96($KS), $STATE1, $STATE1
1898 vaesenc 112($KS), $STATE1, $STATE1
1899 vaesenc 128($KS), $STATE1, $STATE1
1900 vaesenc 144($KS), $STATE1, $STATE1
1901 vaesenc 160($KS), $STATE1, $STATE1
1902 vaesenc 176($KS), $STATE1, $STATE1
1903 vaesenc 192($KS), $STATE1, $STATE1
1904 vaesenc 208($KS), $STATE1, $STATE1
1905 vaesenclast 224($KS), $STATE1, $STATE1
1906
1907 # XOR with Plaintext
1908 vpxor ($PT), $STATE1, $STATE1
1909
1910 vmovdqu $STATE1, ($CT)
1911
1912 addq \$16, $PT
1913 addq \$16, $CT
1914
1915 subq \$1, %r10
1916 jne .L256_enc_msg_x4_loop2
1917
1918.L256_enc_msg_x4_out:
1919 ret
1920.cfi_endproc
1921.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
1922___
1923}
1924aes256gcmsiv_enc_msg_x4();
1925
sub aes256gcmsiv_enc_msg_x8 {
1927 my $STATE1 = "%xmm1";
1928 my $STATE2 = "%xmm2";
1929 my $STATE3 = "%xmm3";
1930 my $STATE4 = "%xmm4";
1931 my $STATE5 = "%xmm5";
1932 my $STATE6 = "%xmm6";
1933 my $STATE7 = "%xmm7";
1934 my $STATE8 = "%xmm8";
1935 my $CTR1 = "%xmm0";
1936 my $CTR2 = "%xmm9";
1937 my $CTR3 = "%xmm10";
1938 my $CTR4 = "%xmm11";
1939 my $CTR5 = "%xmm12";
1940 my $CTR6 = "%xmm13";
1941 my $CTR7 = "%xmm14";
1942 my $TMP1 = "%xmm1";
1943 my $TMP2 = "%xmm2";
1944 my $KS = "%rcx";
1945 my $LEN = "%r8";
1946 my $PT = "%rdi";
1947 my $CT = "%rsi";
1948 my $TAG = "%rdx";
1949 my $SCHED = "%xmm15";
1950
1951 my $aes_round8 = sub {
1952 my ($i) = @_;
1953 return <<___;
1954 vmovdqu ${\eval($i*16)}($KS), $SCHED
1955 vaesenc $SCHED, $STATE1, $STATE1
1956 vaesenc $SCHED, $STATE2, $STATE2
1957 vaesenc $SCHED, $STATE3, $STATE3
1958 vaesenc $SCHED, $STATE4, $STATE4
1959 vaesenc $SCHED, $STATE5, $STATE5
1960 vaesenc $SCHED, $STATE6, $STATE6
1961 vaesenc $SCHED, $STATE7, $STATE7
1962 vaesenc $SCHED, $STATE8, $STATE8
1963___
1964 };
1965
1966 my $aes_lastround8 = sub {
1967 my ($i) = @_;
1968 return <<___;
1969 vmovdqu ${\eval($i*16)}($KS), $SCHED
1970 vaesenclast $SCHED, $STATE1, $STATE1
1971 vaesenclast $SCHED, $STATE2, $STATE2
1972 vaesenclast $SCHED, $STATE3, $STATE3
1973 vaesenclast $SCHED, $STATE4, $STATE4
1974 vaesenclast $SCHED, $STATE5, $STATE5
1975 vaesenclast $SCHED, $STATE6, $STATE6
1976 vaesenclast $SCHED, $STATE7, $STATE7
1977 vaesenclast $SCHED, $STATE8, $STATE8
1978___
1979 };
1980
  # void aes256gcmsiv_enc_msg_x8(unsigned char* PT,
1982 # unsigned char* CT,
1983 # unsigned char* TAG,
1984 # unsigned char* KS,
1985 # size_t byte_len);
1986 # parameter 1: %rdi #PT
1987 # parameter 2: %rsi #CT
1988 # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
1989 # parameter 4: %rcx #KS
1990 # parameter 5: %r8 #LEN MSG_length in bytes
1991 $code.=<<___;
1992.globl aes256gcmsiv_enc_msg_x8
1993.type aes256gcmsiv_enc_msg_x8,\@function,5
1994.align 16
1995aes256gcmsiv_enc_msg_x8:
1996.cfi_startproc
1997 test $LEN, $LEN
1998 jnz .L256_enc_msg_x8_start
1999 ret
2000
2001.L256_enc_msg_x8_start:
  # Compute a 64-byte-aligned scratch pointer just below the stack pointer
2003 movq %rsp, %r11
2004 subq \$16, %r11
2005 andq \$-64, %r11
2006
2007 movq $LEN, %r10
2008 shrq \$4, $LEN # LEN = num of blocks
2009 shlq \$60, %r10
2010 jz .L256_enc_msg_x8_start2
2011 addq \$1, $LEN
2012
2013.L256_enc_msg_x8_start2:
2014 movq $LEN, %r10
2015 shlq \$61, %r10
2016 shrq \$61, %r10
2017
2018 # Make IV from TAG
2019 vmovdqa ($TAG), $TMP1
2020 vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
2021
2022 # store counter8 on the stack
2023 vpaddd seven(%rip), $TMP1, $CTR1
2024 vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07]
2025 vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
2026 vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
2027 vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
2028 vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
2029 vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
2030 vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
2031 vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
2032
2033 shrq \$3, $LEN
2034 jz .L256_enc_msg_x8_check_remainder
2035
2036 subq \$128, $CT
2037 subq \$128, $PT
2038
2039.L256_enc_msg_x8_loop1:
2040 addq \$128, $CT
2041 addq \$128, $PT
2042
2043 vmovdqa $CTR1, $STATE1
2044 vmovdqa $CTR2, $STATE2
2045 vmovdqa $CTR3, $STATE3
2046 vmovdqa $CTR4, $STATE4
2047 vmovdqa $CTR5, $STATE5
2048 vmovdqa $CTR6, $STATE6
2049 vmovdqa $CTR7, $STATE7
2050 # move from stack
2051 vmovdqa (%r11), $STATE8
2052
2053 vpxor ($KS), $STATE1, $STATE1
2054 vpxor ($KS), $STATE2, $STATE2
2055 vpxor ($KS), $STATE3, $STATE3
2056 vpxor ($KS), $STATE4, $STATE4
2057 vpxor ($KS), $STATE5, $STATE5
2058 vpxor ($KS), $STATE6, $STATE6
2059 vpxor ($KS), $STATE7, $STATE7
2060 vpxor ($KS), $STATE8, $STATE8
2061
2062 ${\$aes_round8->(1)}
2063 vmovdqa (%r11), $CTR7 # deal with CTR8
2064 vpaddd eight(%rip), $CTR7, $CTR7
2065 vmovdqa $CTR7, (%r11)
2066 ${\$aes_round8->(2)}
2067 vpsubd one(%rip), $CTR7, $CTR7
2068 ${\$aes_round8->(3)}
2069 vpaddd eight(%rip), $CTR1, $CTR1
2070 ${\$aes_round8->(4)}
2071 vpaddd eight(%rip), $CTR2, $CTR2
2072 ${\$aes_round8->(5)}
2073 vpaddd eight(%rip), $CTR3, $CTR3
2074 ${\$aes_round8->(6)}
2075 vpaddd eight(%rip), $CTR4, $CTR4
2076 ${\$aes_round8->(7)}
2077 vpaddd eight(%rip), $CTR5, $CTR5
2078 ${\$aes_round8->(8)}
2079 vpaddd eight(%rip), $CTR6, $CTR6
2080 ${\$aes_round8->(9)}
2081 ${\$aes_round8->(10)}
2082 ${\$aes_round8->(11)}
2083 ${\$aes_round8->(12)}
2084 ${\$aes_round8->(13)}
2085 ${\$aes_lastround8->(14)}
2086
2087 # XOR with Plaintext
2088 vpxor 0*16($PT), $STATE1, $STATE1
2089 vpxor 1*16($PT), $STATE2, $STATE2
2090 vpxor 2*16($PT), $STATE3, $STATE3
2091 vpxor 3*16($PT), $STATE4, $STATE4
2092 vpxor 4*16($PT), $STATE5, $STATE5
2093 vpxor 5*16($PT), $STATE6, $STATE6
2094 vpxor 6*16($PT), $STATE7, $STATE7
2095 vpxor 7*16($PT), $STATE8, $STATE8
2096
2097 subq \$1, $LEN
2098
2099 vmovdqu $STATE1, 0*16($CT)
2100 vmovdqu $STATE2, 1*16($CT)
2101 vmovdqu $STATE3, 2*16($CT)
2102 vmovdqu $STATE4, 3*16($CT)
2103 vmovdqu $STATE5, 4*16($CT)
2104 vmovdqu $STATE6, 5*16($CT)
2105 vmovdqu $STATE7, 6*16($CT)
2106 vmovdqu $STATE8, 7*16($CT)
2107
2108 jne .L256_enc_msg_x8_loop1
2109
2110 addq \$128, $CT
2111 addq \$128, $PT
2112
2113.L256_enc_msg_x8_check_remainder:
2114 cmpq \$0, %r10
2115 je .L256_enc_msg_x8_out
2116
2117.L256_enc_msg_x8_loop2:
2118 # encrypt each block separately
  # CTR1 holds the next counter value to use (even if the main loop was skipped)
2120 vmovdqa $CTR1, $STATE1
2121 vpaddd one(%rip), $CTR1, $CTR1
2122
2123 vpxor ($KS), $STATE1, $STATE1
2124 vaesenc 16($KS), $STATE1, $STATE1
2125 vaesenc 32($KS), $STATE1, $STATE1
2126 vaesenc 48($KS), $STATE1, $STATE1
2127 vaesenc 64($KS), $STATE1, $STATE1
2128 vaesenc 80($KS), $STATE1, $STATE1
2129 vaesenc 96($KS), $STATE1, $STATE1
2130 vaesenc 112($KS), $STATE1, $STATE1
2131 vaesenc 128($KS), $STATE1, $STATE1
2132 vaesenc 144($KS), $STATE1, $STATE1
2133 vaesenc 160($KS), $STATE1, $STATE1
2134 vaesenc 176($KS), $STATE1, $STATE1
2135 vaesenc 192($KS), $STATE1, $STATE1
2136 vaesenc 208($KS), $STATE1, $STATE1
2137 vaesenclast 224($KS), $STATE1, $STATE1
2138
2139 # XOR with Plaintext
2140 vpxor ($PT), $STATE1, $STATE1
2141
2142 vmovdqu $STATE1, ($CT)
2143
2144 addq \$16, $PT
2145 addq \$16, $CT
2146 subq \$1, %r10
2147 jnz .L256_enc_msg_x8_loop2
2148
2149.L256_enc_msg_x8_out:
2150 ret
2151
2152.cfi_endproc
2153.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
2154___
2155}
2156aes256gcmsiv_enc_msg_x8();
2157aesgcmsiv_dec(1);
2158
2159sub aes256gcmsiv_kdf {
2160 my $ONE = "%xmm8";
2161 my $BLOCK1 = "%xmm4";
2162 my $BLOCK2 = "%xmm6";
2163 my $BLOCK3 = "%xmm7";
2164 my $BLOCK4 = "%xmm11";
2165 my $BLOCK5 = "%xmm12";
2166 my $BLOCK6 = "%xmm13";
2167
2168 my $enc_roundx6 = sub {
2169 my ($i, $j) = @_;
2170 return <<___;
2171 vmovdqa ${\eval($i*16)}(%rdx), $j
2172 vaesenc $j, $BLOCK1, $BLOCK1
2173 vaesenc $j, $BLOCK2, $BLOCK2
2174 vaesenc $j, $BLOCK3, $BLOCK3
2175 vaesenc $j, $BLOCK4, $BLOCK4
2176 vaesenc $j, $BLOCK5, $BLOCK5
2177 vaesenc $j, $BLOCK6, $BLOCK6
2178___
2179 };
2180
2181 my $enc_roundlastx6 = sub {
2182 my ($i, $j) = @_;
2183 return <<___;
2184 vmovdqa ${\eval($i*16)}(%rdx), $j
2185 vaesenclast $j, $BLOCK1, $BLOCK1
2186 vaesenclast $j, $BLOCK2, $BLOCK2
2187 vaesenclast $j, $BLOCK3, $BLOCK3
2188 vaesenclast $j, $BLOCK4, $BLOCK4
2189 vaesenclast $j, $BLOCK5, $BLOCK5
2190 vaesenclast $j, $BLOCK6, $BLOCK6
2191___
2192 };
2193
2194 # void aes256gcmsiv_kdf(const uint8_t nonce[16],
2195 # uint8_t *out_key_material,
2196 # const uint8_t *key_schedule);
2197 $code.=<<___;
2198.globl aes256gcmsiv_kdf
2199.type aes256gcmsiv_kdf,\@function,3
2200.align 16
2201aes256gcmsiv_kdf:
2202.cfi_startproc
2203# parameter 1: %rdi Pointer to NONCE
2204# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
2206
2207 vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
2208 vmovdqa 0*16(%rdi), $BLOCK1
2209 vmovdqa and_mask(%rip), $BLOCK4
2210 vmovdqa one(%rip), $ONE
2211 vpshufd \$0x90, $BLOCK1, $BLOCK1
2212 vpand $BLOCK4, $BLOCK1, $BLOCK1
2213 vpaddd $ONE, $BLOCK1, $BLOCK2
2214 vpaddd $ONE, $BLOCK2, $BLOCK3
2215 vpaddd $ONE, $BLOCK3, $BLOCK4
2216 vpaddd $ONE, $BLOCK4, $BLOCK5
2217 vpaddd $ONE, $BLOCK5, $BLOCK6
2218
2219 vpxor %xmm1, $BLOCK1, $BLOCK1
2220 vpxor %xmm1, $BLOCK2, $BLOCK2
2221 vpxor %xmm1, $BLOCK3, $BLOCK3
2222 vpxor %xmm1, $BLOCK4, $BLOCK4
2223 vpxor %xmm1, $BLOCK5, $BLOCK5
2224 vpxor %xmm1, $BLOCK6, $BLOCK6
2225
2226 ${\$enc_roundx6->(1, "%xmm1")}
2227 ${\$enc_roundx6->(2, "%xmm2")}
2228 ${\$enc_roundx6->(3, "%xmm1")}
2229 ${\$enc_roundx6->(4, "%xmm2")}
2230 ${\$enc_roundx6->(5, "%xmm1")}
2231 ${\$enc_roundx6->(6, "%xmm2")}
2232 ${\$enc_roundx6->(7, "%xmm1")}
2233 ${\$enc_roundx6->(8, "%xmm2")}
2234 ${\$enc_roundx6->(9, "%xmm1")}
2235 ${\$enc_roundx6->(10, "%xmm2")}
2236 ${\$enc_roundx6->(11, "%xmm1")}
2237 ${\$enc_roundx6->(12, "%xmm2")}
2238 ${\$enc_roundx6->(13, "%xmm1")}
2239 ${\$enc_roundlastx6->(14, "%xmm2")}
2240
2241 vmovdqa $BLOCK1, 0*16(%rsi)
2242 vmovdqa $BLOCK2, 1*16(%rsi)
2243 vmovdqa $BLOCK3, 2*16(%rsi)
2244 vmovdqa $BLOCK4, 3*16(%rsi)
2245 vmovdqa $BLOCK5, 4*16(%rsi)
2246 vmovdqa $BLOCK6, 5*16(%rsi)
2247 ret
2248.cfi_endproc
2249.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
2250___
2251}
2252aes256gcmsiv_kdf();
2253
2254print $code;
2255
2256close STDOUT;