#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Started as a transliteration of the original code to "perlasm",
# this module has undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allowed
#   skipping one shiftrows(), reducing the bit-sliced key schedule and
#   speeding up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.88		+11%
# Atom		17.1		16.4		+4%
# Silvermont	-		12.9
# Goldmont	-		8.85
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations account for no more
#	than 1% of total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short
# inputs. Conversion time in CPU cycles and its ratio to CPU cycles
# spent in the 8x block function is:
#
#		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than the ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
# Silvermont	14.0
# Goldmont	10.2
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
# <appro@openssl.org>

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
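
# A scalar transcription of the Mul_GF4 dataflow above, for reference
# only (not used by the generator). Each argument models one bit-plane
# register, with `&` and `^` standing in for pand/pxor; the comments map
# each step back to the SSE sequence. This is a sketch to make the
# register choreography easier to follow, assuming the tower-field
# representation of the Käsper-Schwabe construction.
sub _mul_gf4_demo {
my ($x0,$x1,$y0,$y1)=@_;
	my $t0 = ($y0 ^ $y1) & $x0;	# movdqa/pxor/pand
	$x0 ^= $x1;			# pxor	$x1, $x0
	$x1 &= $y0;			# pand	$y0, $x1
	$x0 &= $y1;			# pand	$y1, $x0
	$x0 ^= $x1;			# pxor	$x1, $x0
	$x1 ^= $t0;			# pxor	$t0, $x1
	return ($x0,$x1);		# product, one GF(4) element per bit lane
}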

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}
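
# Scalar model of pshufb, for reference only (not used by the
# generator): destination byte i is src[mask[i] & 0x0f], or zero when
# bit 7 of mask[i] is set. In bit-sliced representation ShiftRows is
# therefore just one byte permutation per register, with the round-key
# XOR interleaved above to hide latency.
sub _pshufb_demo {
my ($src,$mask)=@_;	# two 16-byte strings
	my @s=unpack("C16",$src);
	return pack("C16",
		map { $_ & 0x80 ? 0 : $s[$_ & 0x0f] } unpack("C16",$mask));
}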

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}
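
# Scalar model of the two pshufd immediates used above, for reference
# only (not used by the generator). With dwords numbered 0..3 from the
# least significant end, destination dword i is src[(imm >> 2*i) & 3];
# immediate 0x93 yields [x3,x0,x1,x2], i.e. a 32-bit rotation, and 0x4E
# yields [x2,x3,x0,x1], i.e. a 64-bit rotation - which is how the
# "<<< 32" and "<<< 64" in the comments are realized.
sub _pshufd_demo {
my ($src,$imm)=@_;	# 16-byte string, 8-bit immediate
	my @d=unpack("V4",$src);
	return pack("V4",map { $d[($imm >> (2*$_)) & 3] } 0..3);
}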

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
	my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
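
# The factorization above can be checked directly with this
# reference-only snippet (not used by the generator): multiplying the
# MixColumns polynomial 03x^3+01x^2+01x+02 by 04x^2+05 modulo x^4+1,
# with coefficients in GF(2^8) mod x^8+x^4+x^3+x+1, yields the
# InvMixColumns polynomial 0bx^3+0dx^2+09x+0e.
sub _gf8_mul_demo {
my ($a,$b)=@_;		# two field elements, 0..255
	my $r=0;
	for (0..7) {	# schoolbook multiply with on-the-fly reduction
		$r ^= $a if $b & 1;
		$b >>= 1;
		$a = (($a >> 7) & 1) ? (($a << 1) ^ 0x11b) : ($a << 1);
	}
	return $r;
}
# e.g. the x^0 coefficient of the product is
# _gf8_mul_demo(0x02,0x05) ^ _gf8_mul_demo(0x01,0x04) == 0x0e.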

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
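
# Scalar model of swapmove, for reference only (not used by the
# generator): it exchanges the bit positions selected by $mask between
# $a and ($b >> $n) - the classic "delta swap" used to transpose bit
# matrices. The SSE sequence above is the same computation spread over
# a scratch register.
sub _swapmove_demo {
my ($a,$b,$n,$mask)=@_;
	my $t = (($b >> $n) ^ $a) & $mask;	# psrlq/pxor/pand
	return ($a ^ $t, $b ^ ($t << $n));	# pxor/psllq/pxor
}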
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
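
# What bitslice() computes, modeled on scalars for reference (not used
# by the generator): an 8x8 bit-matrix transposition. Up to the
# register ordering used here, bit j of input byte i ends up as bit i
# of output byte j; the three swapmove passes above (shifts 1/2/4 with
# masks of the 0x55../0x33../0x0f.. form) do the same thing for all
# 16 byte columns of the eight registers at once.
sub _transpose8_demo {
my @in=@_[0..7];	# eight 8-bit rows
	my @out=(0) x 8;
	for my $i (0..7) {
		for my $j (0..7) {
			$out[$j] |= (($in[$i] >> $j) & 1) << $i;
		}
	}
	return @out;
}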

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
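
# Scalar model of the pand/pcmpeqb trick above, for reference only (not
# used by the generator): one round key is expanded into eight "bit
# plane" registers, where byte j of plane i is 0xff iff bit i of key
# byte j is set. Planes 0, 1, 5 and 6 are then inverted via pxor with
# the all-ones register, which matches the set bits of the S-box affine
# constant 0x63 and folds it into the schedule.
sub _bitplanes_demo {
my @k=unpack("C16",shift);	# 16 key bytes (already .LM0-permuted)
	return map {
		my $i=$_;
		pack("C16",map { (($_ >> $i) & 1) ? 0xff : 0x00 } @k);
	} 0..7;
}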
1048
1049if (0 && !$win64) { # following four functions are unsupported interface
1050 # used for benchmarking...
1051$code.=<<___;
1052.globl bsaes_enc_key_convert
1053.type bsaes_enc_key_convert,\@function,2
1054.align 16
1055bsaes_enc_key_convert:
1056 mov 240($inp),%r10d # pass rounds
1057 mov $inp,%rcx # pass key
1058 mov $out,%rax # pass key schedule
1059 call _bsaes_key_convert
1060 pxor %xmm6,%xmm7 # fix up last round key
1061 movdqa %xmm7,(%rax) # save last round key
1062 ret
1063.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1064
1065.globl bsaes_encrypt_128
1066.type bsaes_encrypt_128,\@function,4
1067.align 16
1068bsaes_encrypt_128:
1069.Lenc128_loop:
1070 movdqu 0x00($inp), @XMM[0] # load input
1071 movdqu 0x10($inp), @XMM[1]
1072 movdqu 0x20($inp), @XMM[2]
1073 movdqu 0x30($inp), @XMM[3]
1074 movdqu 0x40($inp), @XMM[4]
1075 movdqu 0x50($inp), @XMM[5]
1076 movdqu 0x60($inp), @XMM[6]
1077 movdqu 0x70($inp), @XMM[7]
1078 mov $key, %rax # pass the $key
1079 lea 0x80($inp), $inp
1080 mov \$10,%r10d
1081
1082 call _bsaes_encrypt8
1083
1084 movdqu @XMM[0], 0x00($out) # write output
1085 movdqu @XMM[1], 0x10($out)
1086 movdqu @XMM[4], 0x20($out)
1087 movdqu @XMM[6], 0x30($out)
1088 movdqu @XMM[3], 0x40($out)
1089 movdqu @XMM[7], 0x50($out)
1090 movdqu @XMM[2], 0x60($out)
1091 movdqu @XMM[5], 0x70($out)
1092 lea 0x80($out), $out
1093 sub \$0x80,$len
1094 ja .Lenc128_loop
1095 ret
1096.size bsaes_encrypt_128,.-bsaes_encrypt_128
1097
1098.globl bsaes_dec_key_convert
1099.type bsaes_dec_key_convert,\@function,2
1100.align 16
1101bsaes_dec_key_convert:
1102 mov 240($inp),%r10d # pass rounds
1103 mov $inp,%rcx # pass key
1104 mov $out,%rax # pass key schedule
1105 call _bsaes_key_convert
1106 pxor ($out),%xmm7 # fix up round 0 key
1107 movdqa %xmm6,(%rax) # save last round key
1108 movdqa %xmm7,($out)
1109 ret
1110.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1111
1112.globl bsaes_decrypt_128
1113.type bsaes_decrypt_128,\@function,4
1114.align 16
1115bsaes_decrypt_128:
1116.Ldec128_loop:
1117 movdqu 0x00($inp), @XMM[0] # load input
1118 movdqu 0x10($inp), @XMM[1]
1119 movdqu 0x20($inp), @XMM[2]
1120 movdqu 0x30($inp), @XMM[3]
1121 movdqu 0x40($inp), @XMM[4]
1122 movdqu 0x50($inp), @XMM[5]
1123 movdqu 0x60($inp), @XMM[6]
1124 movdqu 0x70($inp), @XMM[7]
1125 mov $key, %rax # pass the $key
1126 lea 0x80($inp), $inp
1127 mov \$10,%r10d
1128
1129 call _bsaes_decrypt8
1130
1131 movdqu @XMM[0], 0x00($out) # write output
1132 movdqu @XMM[1], 0x10($out)
1133 movdqu @XMM[6], 0x20($out)
1134 movdqu @XMM[4], 0x30($out)
1135 movdqu @XMM[2], 0x40($out)
1136 movdqu @XMM[7], 0x50($out)
1137 movdqu @XMM[3], 0x60($out)
1138 movdqu @XMM[5], 0x70($out)
1139 lea 0x80($out), $out
1140 sub \$0x80,$len
1141 ja .Ldec128_loop
1142 ret
1143.size bsaes_decrypt_128,.-bsaes_decrypt_128
1144___
1145}
1146{
1147######################################################################
1148#
1149# OpenSSL interface
1150#
1151my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1152 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1153my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1154
1155if ($ecb) {
1156$code.=<<___;
1157.globl bsaes_ecb_encrypt_blocks
1158.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1159.align 16
1160bsaes_ecb_encrypt_blocks:
1161 mov %rsp, %rax
1162.Lecb_enc_prologue:
1163 push %rbp
1164 push %rbx
1165 push %r12
1166 push %r13
1167 push %r14
1168 push %r15
1169 lea -0x48(%rsp),%rsp
1170___
1171$code.=<<___ if ($win64);
1172 lea -0xa0(%rsp), %rsp
1173 movaps %xmm6, 0x40(%rsp)
1174 movaps %xmm7, 0x50(%rsp)
1175 movaps %xmm8, 0x60(%rsp)
1176 movaps %xmm9, 0x70(%rsp)
1177 movaps %xmm10, 0x80(%rsp)
1178 movaps %xmm11, 0x90(%rsp)
1179 movaps %xmm12, 0xa0(%rsp)
1180 movaps %xmm13, 0xb0(%rsp)
1181 movaps %xmm14, 0xc0(%rsp)
1182 movaps %xmm15, 0xd0(%rsp)
1183.Lecb_enc_body:
1184___
1185$code.=<<___;
1186 mov %rsp,%rbp # backup %rsp
1187 mov 240($arg4),%eax # rounds
1188 mov $arg1,$inp # backup arguments
1189 mov $arg2,$out
1190 mov $arg3,$len
1191 mov $arg4,$key
1192 cmp \$8,$arg3
1193 jb .Lecb_enc_short
1194
1195 mov %eax,%ebx # backup rounds
1196 shl \$7,%rax # 128 bytes per inner round key
1197 sub \$`128-32`,%rax # size of bit-sliced key schedule
1198 sub %rax,%rsp
1199 mov %rsp,%rax # pass key schedule
1200 mov $key,%rcx # pass key
1201 mov %ebx,%r10d # pass rounds
1202 call _bsaes_key_convert
1203 pxor %xmm6,%xmm7 # fix up last round key
1204 movdqa %xmm7,(%rax) # save last round key
1205
1206 sub \$8,$len
1207.Lecb_enc_loop:
1208 movdqu 0x00($inp), @XMM[0] # load input
1209 movdqu 0x10($inp), @XMM[1]
1210 movdqu 0x20($inp), @XMM[2]
1211 movdqu 0x30($inp), @XMM[3]
1212 movdqu 0x40($inp), @XMM[4]
1213 movdqu 0x50($inp), @XMM[5]
1214 mov %rsp, %rax # pass key schedule
1215 movdqu 0x60($inp), @XMM[6]
1216 mov %ebx,%r10d # pass rounds
1217 movdqu 0x70($inp), @XMM[7]
1218 lea 0x80($inp), $inp
1219
1220 call _bsaes_encrypt8
1221
1222 movdqu @XMM[0], 0x00($out) # write output
1223 movdqu @XMM[1], 0x10($out)
1224 movdqu @XMM[4], 0x20($out)
1225 movdqu @XMM[6], 0x30($out)
1226 movdqu @XMM[3], 0x40($out)
1227 movdqu @XMM[7], 0x50($out)
1228 movdqu @XMM[2], 0x60($out)
1229 movdqu @XMM[5], 0x70($out)
1230 lea 0x80($out), $out
1231 sub \$8,$len
1232 jnc .Lecb_enc_loop
1233
1234 add \$8,$len
1235 jz .Lecb_enc_done
1236
1237 movdqu 0x00($inp), @XMM[0] # load input
1238 mov %rsp, %rax # pass key schedule
1239 mov %ebx,%r10d # pass rounds
1240 cmp \$2,$len
1241 jb .Lecb_enc_one
1242 movdqu 0x10($inp), @XMM[1]
1243 je .Lecb_enc_two
1244 movdqu 0x20($inp), @XMM[2]
1245 cmp \$4,$len
1246 jb .Lecb_enc_three
1247 movdqu 0x30($inp), @XMM[3]
1248 je .Lecb_enc_four
1249 movdqu 0x40($inp), @XMM[4]
1250 cmp \$6,$len
1251 jb .Lecb_enc_five
1252 movdqu 0x50($inp), @XMM[5]
1253 je .Lecb_enc_six
1254 movdqu 0x60($inp), @XMM[6]
1255 call _bsaes_encrypt8
1256 movdqu @XMM[0], 0x00($out) # write output
1257 movdqu @XMM[1], 0x10($out)
1258 movdqu @XMM[4], 0x20($out)
1259 movdqu @XMM[6], 0x30($out)
1260 movdqu @XMM[3], 0x40($out)
1261 movdqu @XMM[7], 0x50($out)
1262 movdqu @XMM[2], 0x60($out)
1263 jmp .Lecb_enc_done
1264.align 16
1265.Lecb_enc_six:
1266 call _bsaes_encrypt8
1267 movdqu @XMM[0], 0x00($out) # write output
1268 movdqu @XMM[1], 0x10($out)
1269 movdqu @XMM[4], 0x20($out)
1270 movdqu @XMM[6], 0x30($out)
1271 movdqu @XMM[3], 0x40($out)
1272 movdqu @XMM[7], 0x50($out)
1273 jmp .Lecb_enc_done
1274.align 16
1275.Lecb_enc_five:
1276 call _bsaes_encrypt8
1277 movdqu @XMM[0], 0x00($out) # write output
1278 movdqu @XMM[1], 0x10($out)
1279 movdqu @XMM[4], 0x20($out)
1280 movdqu @XMM[6], 0x30($out)
1281 movdqu @XMM[3], 0x40($out)
1282 jmp .Lecb_enc_done
1283.align 16
1284.Lecb_enc_four:
1285 call _bsaes_encrypt8
1286 movdqu @XMM[0], 0x00($out) # write output
1287 movdqu @XMM[1], 0x10($out)
1288 movdqu @XMM[4], 0x20($out)
1289 movdqu @XMM[6], 0x30($out)
1290 jmp .Lecb_enc_done
1291.align 16
1292.Lecb_enc_three:
1293 call _bsaes_encrypt8
1294 movdqu @XMM[0], 0x00($out) # write output
1295 movdqu @XMM[1], 0x10($out)
1296 movdqu @XMM[4], 0x20($out)
1297 jmp .Lecb_enc_done
1298.align 16
1299.Lecb_enc_two:
1300 call _bsaes_encrypt8
1301 movdqu @XMM[0], 0x00($out) # write output
1302 movdqu @XMM[1], 0x10($out)
1303 jmp .Lecb_enc_done
1304.align 16
1305.Lecb_enc_one:
1306 call _bsaes_encrypt8
1307 movdqu @XMM[0], 0x00($out) # write output
1308 jmp .Lecb_enc_done
1309.align 16
1310.Lecb_enc_short:
1311 lea ($inp), $arg1
1312 lea ($out), $arg2
1313 lea ($key), $arg3
1314 call asm_AES_encrypt
1315 lea 16($inp), $inp
1316 lea 16($out), $out
1317 dec $len
1318 jnz .Lecb_enc_short
1319
1320.Lecb_enc_done:
1321 lea (%rsp),%rax
1322 pxor %xmm0, %xmm0
1323.Lecb_enc_bzero: # wipe key schedule [if any]
1324 movdqa %xmm0, 0x00(%rax)
1325 movdqa %xmm0, 0x10(%rax)
1326 lea 0x20(%rax), %rax
1327 cmp %rax, %rbp
1328 jb .Lecb_enc_bzero
1329
Robert Sloana94fe052017-02-21 08:49:28 -08001330 lea 0x78(%rbp),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001331___
1332$code.=<<___ if ($win64);
1333 movaps 0x40(%rbp), %xmm6
1334 movaps 0x50(%rbp), %xmm7
1335 movaps 0x60(%rbp), %xmm8
1336 movaps 0x70(%rbp), %xmm9
1337 movaps 0x80(%rbp), %xmm10
1338 movaps 0x90(%rbp), %xmm11
1339 movaps 0xa0(%rbp), %xmm12
1340 movaps 0xb0(%rbp), %xmm13
1341 movaps 0xc0(%rbp), %xmm14
1342 movaps 0xd0(%rbp), %xmm15
Robert Sloana94fe052017-02-21 08:49:28 -08001343 lea 0xa0(%rax), %rax
1344.Lecb_enc_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001345___
1346$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08001347 mov -48(%rax), %r15
1348 mov -40(%rax), %r14
1349 mov -32(%rax), %r13
1350 mov -24(%rax), %r12
1351 mov -16(%rax), %rbx
1352 mov -8(%rax), %rbp
1353 lea (%rax), %rsp # restore %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001354.Lecb_enc_epilogue:
1355 ret
1356.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1357
1358.globl bsaes_ecb_decrypt_blocks
1359.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1360.align 16
1361bsaes_ecb_decrypt_blocks:
1362 mov %rsp, %rax
1363.Lecb_dec_prologue:
1364 push %rbp
1365 push %rbx
1366 push %r12
1367 push %r13
1368 push %r14
1369 push %r15
1370 lea -0x48(%rsp),%rsp
1371___
1372$code.=<<___ if ($win64);
1373 lea -0xa0(%rsp), %rsp
1374 movaps %xmm6, 0x40(%rsp)
1375 movaps %xmm7, 0x50(%rsp)
1376 movaps %xmm8, 0x60(%rsp)
1377 movaps %xmm9, 0x70(%rsp)
1378 movaps %xmm10, 0x80(%rsp)
1379 movaps %xmm11, 0x90(%rsp)
1380 movaps %xmm12, 0xa0(%rsp)
1381 movaps %xmm13, 0xb0(%rsp)
1382 movaps %xmm14, 0xc0(%rsp)
1383 movaps %xmm15, 0xd0(%rsp)
1384.Lecb_dec_body:
1385___
1386$code.=<<___;
1387 mov %rsp,%rbp # backup %rsp
1388 mov 240($arg4),%eax # rounds
1389 mov $arg1,$inp # backup arguments
1390 mov $arg2,$out
1391 mov $arg3,$len
1392 mov $arg4,$key
1393 cmp \$8,$arg3
1394 jb .Lecb_dec_short
1395
1396 mov %eax,%ebx # backup rounds
1397 shl \$7,%rax # 128 bytes per inner round key
1398 sub \$`128-32`,%rax # size of bit-sliced key schedule
1399 sub %rax,%rsp
1400 mov %rsp,%rax # pass key schedule
1401 mov $key,%rcx # pass key
1402 mov %ebx,%r10d # pass rounds
1403 call _bsaes_key_convert
1404 pxor (%rsp),%xmm7 # fix up 0 round key
1405 movdqa %xmm6,(%rax) # save last round key
1406 movdqa %xmm7,(%rsp)
1407
1408 sub \$8,$len
1409.Lecb_dec_loop:
1410 movdqu 0x00($inp), @XMM[0] # load input
1411 movdqu 0x10($inp), @XMM[1]
1412 movdqu 0x20($inp), @XMM[2]
1413 movdqu 0x30($inp), @XMM[3]
1414 movdqu 0x40($inp), @XMM[4]
1415 movdqu 0x50($inp), @XMM[5]
1416 mov %rsp, %rax # pass key schedule
1417 movdqu 0x60($inp), @XMM[6]
1418 mov %ebx,%r10d # pass rounds
1419 movdqu 0x70($inp), @XMM[7]
1420 lea 0x80($inp), $inp
1421
1422 call _bsaes_decrypt8
1423
1424 movdqu @XMM[0], 0x00($out) # write output
1425 movdqu @XMM[1], 0x10($out)
1426 movdqu @XMM[6], 0x20($out)
1427 movdqu @XMM[4], 0x30($out)
1428 movdqu @XMM[2], 0x40($out)
1429 movdqu @XMM[7], 0x50($out)
1430 movdqu @XMM[3], 0x60($out)
1431 movdqu @XMM[5], 0x70($out)
1432 lea 0x80($out), $out
1433 sub \$8,$len
1434 jnc .Lecb_dec_loop
1435
1436 add \$8,$len
1437 jz .Lecb_dec_done
1438
1439 movdqu 0x00($inp), @XMM[0] # load input
1440 mov %rsp, %rax # pass key schedule
1441 mov %ebx,%r10d # pass rounds
1442 cmp \$2,$len
1443 jb .Lecb_dec_one
1444 movdqu 0x10($inp), @XMM[1]
1445 je .Lecb_dec_two
1446 movdqu 0x20($inp), @XMM[2]
1447 cmp \$4,$len
1448 jb .Lecb_dec_three
1449 movdqu 0x30($inp), @XMM[3]
1450 je .Lecb_dec_four
1451 movdqu 0x40($inp), @XMM[4]
1452 cmp \$6,$len
1453 jb .Lecb_dec_five
1454 movdqu 0x50($inp), @XMM[5]
1455 je .Lecb_dec_six
1456 movdqu 0x60($inp), @XMM[6]
1457 call _bsaes_decrypt8
1458 movdqu @XMM[0], 0x00($out) # write output
1459 movdqu @XMM[1], 0x10($out)
1460 movdqu @XMM[6], 0x20($out)
1461 movdqu @XMM[4], 0x30($out)
1462 movdqu @XMM[2], 0x40($out)
1463 movdqu @XMM[7], 0x50($out)
1464 movdqu @XMM[3], 0x60($out)
1465 jmp .Lecb_dec_done
1466.align 16
1467.Lecb_dec_six:
1468 call _bsaes_decrypt8
1469 movdqu @XMM[0], 0x00($out) # write output
1470 movdqu @XMM[1], 0x10($out)
1471 movdqu @XMM[6], 0x20($out)
1472 movdqu @XMM[4], 0x30($out)
1473 movdqu @XMM[2], 0x40($out)
1474 movdqu @XMM[7], 0x50($out)
1475 jmp .Lecb_dec_done
1476.align 16
1477.Lecb_dec_five:
1478 call _bsaes_decrypt8
1479 movdqu @XMM[0], 0x00($out) # write output
1480 movdqu @XMM[1], 0x10($out)
1481 movdqu @XMM[6], 0x20($out)
1482 movdqu @XMM[4], 0x30($out)
1483 movdqu @XMM[2], 0x40($out)
1484 jmp .Lecb_dec_done
1485.align 16
1486.Lecb_dec_four:
1487 call _bsaes_decrypt8
1488 movdqu @XMM[0], 0x00($out) # write output
1489 movdqu @XMM[1], 0x10($out)
1490 movdqu @XMM[6], 0x20($out)
1491 movdqu @XMM[4], 0x30($out)
1492 jmp .Lecb_dec_done
1493.align 16
1494.Lecb_dec_three:
1495 call _bsaes_decrypt8
1496 movdqu @XMM[0], 0x00($out) # write output
1497 movdqu @XMM[1], 0x10($out)
1498 movdqu @XMM[6], 0x20($out)
1499 jmp .Lecb_dec_done
1500.align 16
1501.Lecb_dec_two:
1502 call _bsaes_decrypt8
1503 movdqu @XMM[0], 0x00($out) # write output
1504 movdqu @XMM[1], 0x10($out)
1505 jmp .Lecb_dec_done
1506.align 16
1507.Lecb_dec_one:
1508 call _bsaes_decrypt8
1509 movdqu @XMM[0], 0x00($out) # write output
1510 jmp .Lecb_dec_done
1511.align 16
1512.Lecb_dec_short:
1513 lea ($inp), $arg1
1514 lea ($out), $arg2
1515 lea ($key), $arg3
1516 call asm_AES_decrypt
1517 lea 16($inp), $inp
1518 lea 16($out), $out
1519 dec $len
1520 jnz .Lecb_dec_short
1521
1522.Lecb_dec_done:
1523 lea (%rsp),%rax
1524 pxor %xmm0, %xmm0
1525.Lecb_dec_bzero: # wipe key schedule [if any]
1526 movdqa %xmm0, 0x00(%rax)
1527 movdqa %xmm0, 0x10(%rax)
1528 lea 0x20(%rax), %rax
1529 cmp %rax, %rbp
1530 jb .Lecb_dec_bzero
1531
Robert Sloana94fe052017-02-21 08:49:28 -08001532 lea 0x78(%rbp),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001533___
1534$code.=<<___ if ($win64);
1535 movaps 0x40(%rbp), %xmm6
1536 movaps 0x50(%rbp), %xmm7
1537 movaps 0x60(%rbp), %xmm8
1538 movaps 0x70(%rbp), %xmm9
1539 movaps 0x80(%rbp), %xmm10
1540 movaps 0x90(%rbp), %xmm11
1541 movaps 0xa0(%rbp), %xmm12
1542 movaps 0xb0(%rbp), %xmm13
1543 movaps 0xc0(%rbp), %xmm14
1544 movaps 0xd0(%rbp), %xmm15
Robert Sloana94fe052017-02-21 08:49:28 -08001545 lea 0xa0(%rax), %rax
1546.Lecb_dec_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001547___
1548$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08001549 mov -48(%rax), %r15
1550 mov -40(%rax), %r14
1551 mov -32(%rax), %r13
1552 mov -24(%rax), %r12
1553 mov -16(%rax), %rbx
1554 mov -8(%rax), %rbp
1555 lea (%rax), %rsp # restore %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001556.Lecb_dec_epilogue:
1557 ret
1558.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1559___
1560}
1561$code.=<<___;
1562.extern asm_AES_cbc_encrypt
1563.globl bsaes_cbc_encrypt
1564.type bsaes_cbc_encrypt,\@abi-omnipotent
1565.align 16
1566bsaes_cbc_encrypt:
1567___
1568$code.=<<___ if ($win64);
1569 mov 48(%rsp),$arg6 # pull direction flag
1570___
1571$code.=<<___;
1572 cmp \$0,$arg6
1573 jne asm_AES_cbc_encrypt
1574 cmp \$128,$arg3
1575 jb asm_AES_cbc_encrypt
1576
1577 mov %rsp, %rax
1578.Lcbc_dec_prologue:
1579 push %rbp
1580 push %rbx
1581 push %r12
1582 push %r13
1583 push %r14
1584 push %r15
1585 lea -0x48(%rsp), %rsp
1586___
1587$code.=<<___ if ($win64);
1588 mov 0xa0(%rsp),$arg5 # pull ivp
1589 lea -0xa0(%rsp), %rsp
1590 movaps %xmm6, 0x40(%rsp)
1591 movaps %xmm7, 0x50(%rsp)
1592 movaps %xmm8, 0x60(%rsp)
1593 movaps %xmm9, 0x70(%rsp)
1594 movaps %xmm10, 0x80(%rsp)
1595 movaps %xmm11, 0x90(%rsp)
1596 movaps %xmm12, 0xa0(%rsp)
1597 movaps %xmm13, 0xb0(%rsp)
1598 movaps %xmm14, 0xc0(%rsp)
1599 movaps %xmm15, 0xd0(%rsp)
1600.Lcbc_dec_body:
1601___
1602$code.=<<___;
1603 mov %rsp, %rbp # backup %rsp
1604 mov 240($arg4), %eax # rounds
1605 mov $arg1, $inp # backup arguments
1606 mov $arg2, $out
1607 mov $arg3, $len
1608 mov $arg4, $key
1609 mov $arg5, %rbx
1610 shr \$4, $len # bytes to blocks
1611
1612 mov %eax, %edx # rounds
1613 shl \$7, %rax # 128 bytes per inner round key
1614 sub \$`128-32`, %rax # size of bit-sliced key schedule
1615 sub %rax, %rsp
1616
1617 mov %rsp, %rax # pass key schedule
1618 mov $key, %rcx # pass key
1619 mov %edx, %r10d # pass rounds
1620 call _bsaes_key_convert
1621 pxor (%rsp),%xmm7 # fix up 0 round key
1622 movdqa %xmm6,(%rax) # save last round key
1623 movdqa %xmm7,(%rsp)
1624
1625 movdqu (%rbx), @XMM[15] # load IV
1626 sub \$8,$len
1627.Lcbc_dec_loop:
1628 movdqu 0x00($inp), @XMM[0] # load input
1629 movdqu 0x10($inp), @XMM[1]
1630 movdqu 0x20($inp), @XMM[2]
1631 movdqu 0x30($inp), @XMM[3]
1632 movdqu 0x40($inp), @XMM[4]
1633 movdqu 0x50($inp), @XMM[5]
1634 mov %rsp, %rax # pass key schedule
1635 movdqu 0x60($inp), @XMM[6]
1636 mov %edx,%r10d # pass rounds
1637 movdqu 0x70($inp), @XMM[7]
1638 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1639
1640 call _bsaes_decrypt8
1641
1642 pxor 0x20(%rbp), @XMM[0] # ^= IV
1643 movdqu 0x00($inp), @XMM[8] # re-load input
1644 movdqu 0x10($inp), @XMM[9]
1645 pxor @XMM[8], @XMM[1]
1646 movdqu 0x20($inp), @XMM[10]
1647 pxor @XMM[9], @XMM[6]
1648 movdqu 0x30($inp), @XMM[11]
1649 pxor @XMM[10], @XMM[4]
1650 movdqu 0x40($inp), @XMM[12]
1651 pxor @XMM[11], @XMM[2]
1652 movdqu 0x50($inp), @XMM[13]
1653 pxor @XMM[12], @XMM[7]
1654 movdqu 0x60($inp), @XMM[14]
1655 pxor @XMM[13], @XMM[3]
1656 movdqu 0x70($inp), @XMM[15] # IV
1657 pxor @XMM[14], @XMM[5]
1658 movdqu @XMM[0], 0x00($out) # write output
1659 lea 0x80($inp), $inp
1660 movdqu @XMM[1], 0x10($out)
1661 movdqu @XMM[6], 0x20($out)
1662 movdqu @XMM[4], 0x30($out)
1663 movdqu @XMM[2], 0x40($out)
1664 movdqu @XMM[7], 0x50($out)
1665 movdqu @XMM[3], 0x60($out)
1666 movdqu @XMM[5], 0x70($out)
1667 lea 0x80($out), $out
1668 sub \$8,$len
1669 jnc .Lcbc_dec_loop
1670
1671 add \$8,$len
1672 jz .Lcbc_dec_done
1673
1674 movdqu 0x00($inp), @XMM[0] # load input
1675 mov %rsp, %rax # pass key schedule
1676 mov %edx, %r10d # pass rounds
1677 cmp \$2,$len
1678 jb .Lcbc_dec_one
1679 movdqu 0x10($inp), @XMM[1]
1680 je .Lcbc_dec_two
1681 movdqu 0x20($inp), @XMM[2]
1682 cmp \$4,$len
1683 jb .Lcbc_dec_three
1684 movdqu 0x30($inp), @XMM[3]
1685 je .Lcbc_dec_four
1686 movdqu 0x40($inp), @XMM[4]
1687 cmp \$6,$len
1688 jb .Lcbc_dec_five
1689 movdqu 0x50($inp), @XMM[5]
1690 je .Lcbc_dec_six
1691 movdqu 0x60($inp), @XMM[6]
1692 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1693 call _bsaes_decrypt8
1694 pxor 0x20(%rbp), @XMM[0] # ^= IV
1695 movdqu 0x00($inp), @XMM[8] # re-load input
1696 movdqu 0x10($inp), @XMM[9]
1697 pxor @XMM[8], @XMM[1]
1698 movdqu 0x20($inp), @XMM[10]
1699 pxor @XMM[9], @XMM[6]
1700 movdqu 0x30($inp), @XMM[11]
1701 pxor @XMM[10], @XMM[4]
1702 movdqu 0x40($inp), @XMM[12]
1703 pxor @XMM[11], @XMM[2]
1704 movdqu 0x50($inp), @XMM[13]
1705 pxor @XMM[12], @XMM[7]
1706 movdqu 0x60($inp), @XMM[15] # IV
1707 pxor @XMM[13], @XMM[3]
1708 movdqu @XMM[0], 0x00($out) # write output
1709 movdqu @XMM[1], 0x10($out)
1710 movdqu @XMM[6], 0x20($out)
1711 movdqu @XMM[4], 0x30($out)
1712 movdqu @XMM[2], 0x40($out)
1713 movdqu @XMM[7], 0x50($out)
1714 movdqu @XMM[3], 0x60($out)
1715 jmp .Lcbc_dec_done
1716.align 16
1717.Lcbc_dec_six:
1718 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1719 call _bsaes_decrypt8
1720 pxor 0x20(%rbp), @XMM[0] # ^= IV
1721 movdqu 0x00($inp), @XMM[8] # re-load input
1722 movdqu 0x10($inp), @XMM[9]
1723 pxor @XMM[8], @XMM[1]
1724 movdqu 0x20($inp), @XMM[10]
1725 pxor @XMM[9], @XMM[6]
1726 movdqu 0x30($inp), @XMM[11]
1727 pxor @XMM[10], @XMM[4]
1728 movdqu 0x40($inp), @XMM[12]
1729 pxor @XMM[11], @XMM[2]
1730 movdqu 0x50($inp), @XMM[15] # IV
1731 pxor @XMM[12], @XMM[7]
1732 movdqu @XMM[0], 0x00($out) # write output
1733 movdqu @XMM[1], 0x10($out)
1734 movdqu @XMM[6], 0x20($out)
1735 movdqu @XMM[4], 0x30($out)
1736 movdqu @XMM[2], 0x40($out)
1737 movdqu @XMM[7], 0x50($out)
1738 jmp .Lcbc_dec_done
1739.align 16
1740.Lcbc_dec_five:
1741 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1742 call _bsaes_decrypt8
1743 pxor 0x20(%rbp), @XMM[0] # ^= IV
1744 movdqu 0x00($inp), @XMM[8] # re-load input
1745 movdqu 0x10($inp), @XMM[9]
1746 pxor @XMM[8], @XMM[1]
1747 movdqu 0x20($inp), @XMM[10]
1748 pxor @XMM[9], @XMM[6]
1749 movdqu 0x30($inp), @XMM[11]
1750 pxor @XMM[10], @XMM[4]
1751 movdqu 0x40($inp), @XMM[15] # IV
1752 pxor @XMM[11], @XMM[2]
1753 movdqu @XMM[0], 0x00($out) # write output
1754 movdqu @XMM[1], 0x10($out)
1755 movdqu @XMM[6], 0x20($out)
1756 movdqu @XMM[4], 0x30($out)
1757 movdqu @XMM[2], 0x40($out)
1758 jmp .Lcbc_dec_done
1759.align 16
1760.Lcbc_dec_four:
1761 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1762 call _bsaes_decrypt8
1763 pxor 0x20(%rbp), @XMM[0] # ^= IV
1764 movdqu 0x00($inp), @XMM[8] # re-load input
1765 movdqu 0x10($inp), @XMM[9]
1766 pxor @XMM[8], @XMM[1]
1767 movdqu 0x20($inp), @XMM[10]
1768 pxor @XMM[9], @XMM[6]
1769 movdqu 0x30($inp), @XMM[15] # IV
1770 pxor @XMM[10], @XMM[4]
1771 movdqu @XMM[0], 0x00($out) # write output
1772 movdqu @XMM[1], 0x10($out)
1773 movdqu @XMM[6], 0x20($out)
1774 movdqu @XMM[4], 0x30($out)
1775 jmp .Lcbc_dec_done
1776.align 16
1777.Lcbc_dec_three:
1778 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1779 call _bsaes_decrypt8
1780 pxor 0x20(%rbp), @XMM[0] # ^= IV
1781 movdqu 0x00($inp), @XMM[8] # re-load input
1782 movdqu 0x10($inp), @XMM[9]
1783 pxor @XMM[8], @XMM[1]
1784 movdqu 0x20($inp), @XMM[15] # IV
1785 pxor @XMM[9], @XMM[6]
1786 movdqu @XMM[0], 0x00($out) # write output
1787 movdqu @XMM[1], 0x10($out)
1788 movdqu @XMM[6], 0x20($out)
1789 jmp .Lcbc_dec_done
1790.align 16
1791.Lcbc_dec_two:
1792 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1793 call _bsaes_decrypt8
1794 pxor 0x20(%rbp), @XMM[0] # ^= IV
1795 movdqu 0x00($inp), @XMM[8] # re-load input
1796 movdqu 0x10($inp), @XMM[15] # IV
1797 pxor @XMM[8], @XMM[1]
1798 movdqu @XMM[0], 0x00($out) # write output
1799 movdqu @XMM[1], 0x10($out)
1800 jmp .Lcbc_dec_done
1801.align 16
1802.Lcbc_dec_one:
1803 lea ($inp), $arg1
1804 lea 0x20(%rbp), $arg2 # buffer output
1805 lea ($key), $arg3
1806 call asm_AES_decrypt # doesn't touch %xmm
1807 pxor 0x20(%rbp), @XMM[15] # ^= IV
1808 movdqu @XMM[15], ($out) # write output
1809 movdqa @XMM[0], @XMM[15] # IV
1810
1811.Lcbc_dec_done:
1812 movdqu @XMM[15], (%rbx) # return IV
1813 lea (%rsp), %rax
1814 pxor %xmm0, %xmm0
1815.Lcbc_dec_bzero: # wipe key schedule [if any]
1816 movdqa %xmm0, 0x00(%rax)
1817 movdqa %xmm0, 0x10(%rax)
1818 lea 0x20(%rax), %rax
1819 cmp %rax, %rbp
1820 ja .Lcbc_dec_bzero
1821
Robert Sloana94fe052017-02-21 08:49:28 -08001822 lea 0x78(%rbp),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001823___
1824$code.=<<___ if ($win64);
1825 movaps 0x40(%rbp), %xmm6
1826 movaps 0x50(%rbp), %xmm7
1827 movaps 0x60(%rbp), %xmm8
1828 movaps 0x70(%rbp), %xmm9
1829 movaps 0x80(%rbp), %xmm10
1830 movaps 0x90(%rbp), %xmm11
1831 movaps 0xa0(%rbp), %xmm12
1832 movaps 0xb0(%rbp), %xmm13
1833 movaps 0xc0(%rbp), %xmm14
1834 movaps 0xd0(%rbp), %xmm15
Robert Sloana94fe052017-02-21 08:49:28 -08001835 lea 0xa0(%rax), %rax
1836.Lcbc_dec_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001837___
1838$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08001839 mov -48(%rax), %r15
1840 mov -40(%rax), %r14
1841 mov -32(%rax), %r13
1842 mov -24(%rax), %r12
1843 mov -16(%rax), %rbx
1844 mov -8(%rax), %rbp
1845 lea (%rax), %rsp # restore %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001846.Lcbc_dec_epilogue:
1847 ret
1848.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1849
1850.globl bsaes_ctr32_encrypt_blocks
1851.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1852.align 16
1853bsaes_ctr32_encrypt_blocks:
1854 mov %rsp, %rax
1855.Lctr_enc_prologue:
1856 push %rbp
1857 push %rbx
1858 push %r12
1859 push %r13
1860 push %r14
1861 push %r15
1862 lea -0x48(%rsp), %rsp
1863___
1864$code.=<<___ if ($win64);
1865 mov 0xa0(%rsp),$arg5 # pull ivp
1866 lea -0xa0(%rsp), %rsp
1867 movaps %xmm6, 0x40(%rsp)
1868 movaps %xmm7, 0x50(%rsp)
1869 movaps %xmm8, 0x60(%rsp)
1870 movaps %xmm9, 0x70(%rsp)
1871 movaps %xmm10, 0x80(%rsp)
1872 movaps %xmm11, 0x90(%rsp)
1873 movaps %xmm12, 0xa0(%rsp)
1874 movaps %xmm13, 0xb0(%rsp)
1875 movaps %xmm14, 0xc0(%rsp)
1876 movaps %xmm15, 0xd0(%rsp)
1877.Lctr_enc_body:
1878___
1879$code.=<<___;
1880 mov %rsp, %rbp # backup %rsp
1881 movdqu ($arg5), %xmm0 # load counter
1882 mov 240($arg4), %eax # rounds
1883 mov $arg1, $inp # backup arguments
1884 mov $arg2, $out
1885 mov $arg3, $len
1886 mov $arg4, $key
1887 movdqa %xmm0, 0x20(%rbp) # copy counter
1888 cmp \$8, $arg3
1889 jb .Lctr_enc_short
1890
1891 mov %eax, %ebx # rounds
1892 shl \$7, %rax # 128 bytes per inner round key
1893 sub \$`128-32`, %rax # size of bit-sliced key schedule
1894 sub %rax, %rsp
1895
1896 mov %rsp, %rax # pass key schedule
1897 mov $key, %rcx # pass key
1898 mov %ebx, %r10d # pass rounds
1899 call _bsaes_key_convert
1900 pxor %xmm6,%xmm7 # fix up last round key
1901 movdqa %xmm7,(%rax) # save last round key
1902
1903 movdqa (%rsp), @XMM[9] # load round0 key
1904 lea .LADD1(%rip), %r11
1905 movdqa 0x20(%rbp), @XMM[0] # counter copy
1906 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1907 pshufb @XMM[8], @XMM[9] # byte swap upper part
1908 pshufb @XMM[8], @XMM[0]
1909 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1910 jmp .Lctr_enc_loop
1911.align 16
1912.Lctr_enc_loop:
1913 movdqa @XMM[0], 0x20(%rbp) # save counter
1914 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1915 movdqa @XMM[0], @XMM[2]
1916 paddd 0x00(%r11), @XMM[1] # .LADD1
1917 movdqa @XMM[0], @XMM[3]
1918 paddd 0x10(%r11), @XMM[2] # .LADD2
1919 movdqa @XMM[0], @XMM[4]
1920 paddd 0x20(%r11), @XMM[3] # .LADD3
1921 movdqa @XMM[0], @XMM[5]
1922 paddd 0x30(%r11), @XMM[4] # .LADD4
1923 movdqa @XMM[0], @XMM[6]
1924 paddd 0x40(%r11), @XMM[5] # .LADD5
1925 movdqa @XMM[0], @XMM[7]
1926 paddd 0x50(%r11), @XMM[6] # .LADD6
1927 paddd 0x60(%r11), @XMM[7] # .LADD7
1928
1929 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1930 # to flip byte order in 32-bit counter
1931 movdqa (%rsp), @XMM[9] # round 0 key
1932 lea 0x10(%rsp), %rax # pass key schedule
1933 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1934 pxor @XMM[9], @XMM[0] # xor with round0 key
1935 pxor @XMM[9], @XMM[1]
1936 pxor @XMM[9], @XMM[2]
1937 pxor @XMM[9], @XMM[3]
1938 pshufb @XMM[8], @XMM[0]
1939 pshufb @XMM[8], @XMM[1]
1940 pxor @XMM[9], @XMM[4]
1941 pxor @XMM[9], @XMM[5]
1942 pshufb @XMM[8], @XMM[2]
1943 pshufb @XMM[8], @XMM[3]
1944 pxor @XMM[9], @XMM[6]
1945 pxor @XMM[9], @XMM[7]
1946 pshufb @XMM[8], @XMM[4]
1947 pshufb @XMM[8], @XMM[5]
1948 pshufb @XMM[8], @XMM[6]
1949 pshufb @XMM[8], @XMM[7]
1950 lea .LBS0(%rip), %r11 # constants table
1951 mov %ebx,%r10d # pass rounds
1952
1953 call _bsaes_encrypt8_bitslice
1954
1955 sub \$8,$len
1956 jc .Lctr_enc_loop_done
1957
1958 movdqu 0x00($inp), @XMM[8] # load input
1959 movdqu 0x10($inp), @XMM[9]
1960 movdqu 0x20($inp), @XMM[10]
1961 movdqu 0x30($inp), @XMM[11]
1962 movdqu 0x40($inp), @XMM[12]
1963 movdqu 0x50($inp), @XMM[13]
1964 movdqu 0x60($inp), @XMM[14]
1965 movdqu 0x70($inp), @XMM[15]
1966 lea 0x80($inp),$inp
1967 pxor @XMM[0], @XMM[8]
1968 movdqa 0x20(%rbp), @XMM[0] # load counter
1969 pxor @XMM[9], @XMM[1]
1970 movdqu @XMM[8], 0x00($out) # write output
1971 pxor @XMM[10], @XMM[4]
1972 movdqu @XMM[1], 0x10($out)
1973 pxor @XMM[11], @XMM[6]
1974 movdqu @XMM[4], 0x20($out)
1975 pxor @XMM[12], @XMM[3]
1976 movdqu @XMM[6], 0x30($out)
1977 pxor @XMM[13], @XMM[7]
1978 movdqu @XMM[3], 0x40($out)
1979 pxor @XMM[14], @XMM[2]
1980 movdqu @XMM[7], 0x50($out)
1981 pxor @XMM[15], @XMM[5]
1982 movdqu @XMM[2], 0x60($out)
1983 lea .LADD1(%rip), %r11
1984 movdqu @XMM[5], 0x70($out)
1985 lea 0x80($out), $out
1986 paddd 0x70(%r11), @XMM[0] # .LADD8
1987 jnz .Lctr_enc_loop
1988
1989 jmp .Lctr_enc_done
1990.align 16
1991.Lctr_enc_loop_done:
1992 add \$8, $len
1993 movdqu 0x00($inp), @XMM[8] # load input
1994 pxor @XMM[8], @XMM[0]
1995 movdqu @XMM[0], 0x00($out) # write output
1996 cmp \$2,$len
1997 jb .Lctr_enc_done
1998 movdqu 0x10($inp), @XMM[9]
1999 pxor @XMM[9], @XMM[1]
2000 movdqu @XMM[1], 0x10($out)
2001 je .Lctr_enc_done
2002 movdqu 0x20($inp), @XMM[10]
2003 pxor @XMM[10], @XMM[4]
2004 movdqu @XMM[4], 0x20($out)
2005 cmp \$4,$len
2006 jb .Lctr_enc_done
2007 movdqu 0x30($inp), @XMM[11]
2008 pxor @XMM[11], @XMM[6]
2009 movdqu @XMM[6], 0x30($out)
2010 je .Lctr_enc_done
2011 movdqu 0x40($inp), @XMM[12]
2012 pxor @XMM[12], @XMM[3]
2013 movdqu @XMM[3], 0x40($out)
2014 cmp \$6,$len
2015 jb .Lctr_enc_done
2016 movdqu 0x50($inp), @XMM[13]
2017 pxor @XMM[13], @XMM[7]
2018 movdqu @XMM[7], 0x50($out)
2019 je .Lctr_enc_done
2020 movdqu 0x60($inp), @XMM[14]
2021 pxor @XMM[14], @XMM[2]
2022 movdqu @XMM[2], 0x60($out)
2023 jmp .Lctr_enc_done
2024
.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lctr_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
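# Each block i is xored with tweak[i] before and after the block cipher;
# tweak[i+1] is tweak[i] multiplied by x in GF(2^128) (IEEE P1619
# little-endian convention): shift the 128-bit value left by one bit
# and, if a bit fell off the top, xor 0x87 into the lowest byte.
# A reference sketch in Perl, for illustration only [not used by the
# generated code]:
#
#	sub xts_mul_x {				# 16-byte string in/out
#		my @b = unpack("C16",shift);	# little-endian bytes
#		my $carry = 0;
#		for (0..15) {
#			my $t = ($b[$_]<<1)|$carry;
#			$carry = $t>>8;
#			$b[$_] = $t&0xff;
#		}
#		$b[0] ^= 0x87 if ($carry);	# reduce by x^128+x^7+x^2+x+1
#		return pack("C16",@b);
#	}
#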
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;

$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp
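					# e.g. 10 rounds reserve
					# 10*128-96 = 1184 bytes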

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
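					# the conversion leaves a correction
					# constant in %xmm6; fold it into the
					# last round key before saving it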
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
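# Generate tweak[0..6] into @XMM[0..6]/(%rsp) while the running tweak
# stays in @XMM[7]. The GF(2^128) doubling is done SIMD-style: pcmpgtd
# broadcasts the sign bit of every dword, pshufd $0x13 moves the bit-63
# and bit-127 masks into dwords 2 and 0, and the pand with .Lxts_magic
# [0x87,0,1,0] turns them into the carry into bit 64 and the 0x87
# reduction; paddq performs the left shift by one of both 64-bit halves.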
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

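	# ciphertext stealing: copy the partial plaintext bytes over the
	# head of the last complete ciphertext block, emit the displaced
	# ciphertext bytes as the final partial block, then re-encrypt
	# the merged block under the final tweak @XMM[7]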
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
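					# unlike encryption, the round-0 key
					# is fixed up in place and the last
					# round key is stored unmodified at
					# the far end of the schedule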
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len
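					# when the input is not a multiple of
					# 16, the last complete block is
					# withheld from the bulk loops and
					# decrypted in the stealing tail below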

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

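	# ciphertext stealing for decryption: save the current tweak in
	# @XMM[6] and double it once more; decrypt the last complete
	# ciphertext block under the doubled tweak, swap in the partial
	# block byte by byte, then decrypt the merged block under the
	# saved tweak @XMM[6]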
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
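		# dwords [0x87,0,1,0]: combined with the pshufd'd sign
		# mask, dword 0 supplies the x^128 = x^7+x^2+x+1 reduction
		# and dword 2 carries bit 63 into bit 64 of the doubled tweak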
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
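#
# HandlerData[] carries three .rva entries per routine: the start of the
# body (stack frame fully set up), the start of the epilogue, and the
# "tail" label placed after the xmm restore on win64. Comparing the
# faulting Rip against these tells the handler how much of the frame is
# live and therefore which registers still need to be recovered.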
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<=prologue label
	jbe	.Lin_prologue

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# tail label
	cmp	%r10,%rbx		# context->Rip>=tail label
	jae	.Lin_tail

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer

.Lin_tail:
	mov	-48(%rax),%rbp
	mov	-40(%rax),%rbx
	mov	-32(%rax),%r12
	mov	-24(%rax),%r13
	mov	-16(%rax),%r14
	mov	-8(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
	.rva	.Lecb_enc_tail
	.long	0
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
	.rva	.Lecb_dec_tail
	.long	0
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
	.rva	.Lcbc_dec_tail
	.long	0
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
	.rva	.Lctr_enc_tail
	.long	0
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
	.rva	.Lxts_enc_tail
	.long	0
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
	.rva	.Lxts_dec_tail
	.long	0
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;