#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#       16-byte     64-byte     256-byte    1-KB        8-KB
# ECB   4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR   5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC   4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM   5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB   5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB   5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor         3x      6x      8x
# theoretical asymptotic limit          1.67    0.83    0.625
# measured performance for 8KB block    1.05    0.86    0.84
#
# "as if" interleave factor             4.7x    5.8x    6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt                           1.16    0.93    0.74
# CTR                                   1.14    0.91    0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt CTR     XTS     ECB     OCB
# Westmere      3.77/1.25       1.25    1.25    1.26
# * Bridge      5.07/0.74       0.75    0.90    0.85    0.98
# Haswell       4.44/0.63       0.63    0.73    0.63    0.70
# Skylake       2.62/0.63       0.63    0.63    0.63
# Silvermont    5.75/3.54       3.56    4.12    3.87(*) 4.11
# Knights L     2.54/0.77       0.78    0.85    -       1.50
# Goldmont      3.82/1.26       1.26    1.29    1.29    1.50
# Bulldozer     5.77/0.70       0.72    0.90    0.70    0.95
# Ryzen         2.71/0.35       0.35    0.44    0.38    0.49
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "[flavour] output". A lone first argument that contains
# a dot is taken to be the output file name, leaving $flavour undefined.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 argument registers are selected for nasm/masm/mingw64 flavours or
# when the output file name ends in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl, first next to this script and then under
# ../../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything written to STDOUT is piped through the
# translator. Stop immediately if the pipe cannot be set up rather than
# carrying on with an unusable STDOUT.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys. Both branches currently select the
# same instruction; the conditional is kept as the $PREFIX-specific hook.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
 ("%rdi","%rsi","%rdx","%rcx"); # Unix order

# $code accumulates the generated assembly text.
$code=".text\n";
$code.=".extern OPENSSL_ia32cap_P\n";

$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8"; # cbc, ctr, ...

$rnds_="%r10d"; # backup copy for $rounds
$key_="%r11"; # backup copy for $key

# %xmm register layout
$rndkey0="%xmm0"; $rndkey1="%xmm1";
$inout0="%xmm2"; $inout1="%xmm3";
$inout2="%xmm4"; $inout3="%xmm5";
$inout4="%xmm6"; $inout5="%xmm7";
$inout6="%xmm8"; $inout7="%xmm9";

# Note that $in2/$in1/$in0/$iv alias $inout4..$inout7.
$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
$in0="%xmm8"; $iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p, $key, $rounds [, $inout [, $ivec]])
#
#   $p      - "enc" or "dec"; selects aes${p} / aes${p}last.
#   $key    - register name holding the key schedule pointer. The emitted
#             code advances it, so it is not preserved.
#   $rounds - register name holding the loop count. The emitted code
#             decrements it to zero, so it is not preserved.
#   $inout  - register name of the block being processed; defaults to
#             $inout0 when omitted.
#   $ivec   - optional register name. When given, the round-0 key is
#             xor-ed into $ivec and $ivec is then xor-ed into $inout,
#             instead of xor-ing the round-0 key into $inout directly.
#
# The generated text is appended to $code. $sn counts invocations so that
# every expansion gets its own unique .Loop_* label.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
++$sn;
# Load the first two round keys.
$code.=<<___;
 $movkey ($key),$rndkey0
 $movkey 16($key),$rndkey1
___
# Round 0 (key whitening), with or without folding in $ivec.
$code.=<<___ if (defined($ivec));
 xorps $rndkey0,$ivec
 lea 32($key),$key
 xorps $ivec,$inout
___
$code.=<<___ if (!defined($ivec));
 lea 32($key),$key
 xorps $rndkey0,$inout
___
# One round per iteration; the next round key is fetched while the
# current round is in flight. The key loaded by the last iteration is
# consumed by the final aes${p}last.
$code.=<<___;
.Loop_${p}1_$sn:
 aes${p} $rndkey1,$inout
 dec $rounds
 $movkey ($key),$rndkey1
 lea 16($key),$key
 jnz .Loop_${p}1_$sn # loop body is 16 bytes
 aes${p}last $rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the two public single-block entry points. The argument registers
# are the first three entries of @_4args, so they follow whichever calling
# convention was selected for this build (Win64 or Unix). Each function
# loads one block, reads the round count from offset 240 of the key
# structure, runs the inlined single-block cipher, wipes the round-key
# registers, stores the result and finally wipes the data register.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl ${PREFIX}_encrypt
.type ${PREFIX}_encrypt,\@abi-omnipotent
.align 16
${PREFIX}_encrypt:
 movups ($inp),$inout0 # load input
 mov 240($key),$rounds # key->rounds
___
 &aesni_generate1("enc",$key,$rounds);
$code.=<<___;
 pxor $rndkey0,$rndkey0 # clear register bank
 pxor $rndkey1,$rndkey1
 movups $inout0,($out) # output
 pxor $inout0,$inout0
 ret
.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl ${PREFIX}_decrypt
.type ${PREFIX}_decrypt,\@abi-omnipotent
.align 16
${PREFIX}_decrypt:
 movups ($inp),$inout0 # load input
 mov 240($key),$rounds # key->rounds
___
 &aesni_generate1("dec",$key,$rounds);
$code.=<<___;
 pxor $rndkey0,$rndkey0 # clear register bank
 pxor $rndkey1,$rndkey1
 movups $inout0,($out) # output
 pxor $inout0,$inout0
 ret
.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont's account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals the corresponding instruction's latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# aesni_generate2($dir): append the private routine _aesni_${dir}rypt2 to
# $code, where $dir is "enc" or "dec".
#
# The routine processes $inout0 and $inout1 in parallel.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
#
# Layout of the emitted code: $rounds is scaled by 16 and $key is moved
# forward by 32 plus that amount; %rax is then negated and used as an
# index that climbs towards zero. Every loop iteration applies two
# rounds, alternating between $rndkey1 and $rndkey0, and the "jnz" acts
# on the result of the "add \$32,%rax". After the loop one more aes${dir}
# round and the closing aes${dir}last are applied.
$code.=<<___;
.type _aesni_${dir}rypt2,\@abi-omnipotent
.align 16
_aesni_${dir}rypt2:
 $movkey ($key),$rndkey0
 shl \$4,$rounds
 $movkey 16($key),$rndkey1
 xorps $rndkey0,$inout0
 xorps $rndkey0,$inout1
 $movkey 32($key),$rndkey0
 lea 32($key,$rounds),$key
 neg %rax # $rounds
 add \$16,%rax

.L${dir}_loop2:
 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 $movkey ($key,%rax),$rndkey1
 add \$32,%rax
 aes${dir} $rndkey0,$inout0
 aes${dir} $rndkey0,$inout1
 $movkey -16($key,%rax),$rndkey0
 jnz .L${dir}_loop2

 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir}last $rndkey0,$inout0
 aes${dir}last $rndkey0,$inout1
 ret
.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# aesni_generate3($dir): append the private routine _aesni_${dir}rypt3 to
# $code, where $dir is "enc" or "dec".
#
# The routine processes $inout0..$inout2 in parallel.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
#
# Layout of the emitted code: $rounds is scaled by 16 and $key is moved
# forward by 32 plus that amount; %rax is then negated and used as an
# index that climbs towards zero. Every loop iteration applies two
# rounds, alternating between $rndkey1 and $rndkey0, and the "jnz" acts
# on the result of the "add \$32,%rax". After the loop one more aes${dir}
# round and the closing aes${dir}last are applied.
$code.=<<___;
.type _aesni_${dir}rypt3,\@abi-omnipotent
.align 16
_aesni_${dir}rypt3:
 $movkey ($key),$rndkey0
 shl \$4,$rounds
 $movkey 16($key),$rndkey1
 xorps $rndkey0,$inout0
 xorps $rndkey0,$inout1
 xorps $rndkey0,$inout2
 $movkey 32($key),$rndkey0
 lea 32($key,$rounds),$key
 neg %rax # $rounds
 add \$16,%rax

.L${dir}_loop3:
 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir} $rndkey1,$inout2
 $movkey ($key,%rax),$rndkey1
 add \$32,%rax
 aes${dir} $rndkey0,$inout0
 aes${dir} $rndkey0,$inout1
 aes${dir} $rndkey0,$inout2
 $movkey -16($key,%rax),$rndkey0
 jnz .L${dir}_loop3

 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir} $rndkey1,$inout2
 aes${dir}last $rndkey0,$inout0
 aes${dir}last $rndkey0,$inout1
 aes${dir}last $rndkey0,$inout2
 ret
.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir): append the private routine _aesni_${dir}rypt4 to
# $code, where $dir is "enc" or "dec".
#
# The routine processes $inout0..$inout3 in parallel.
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
#
# Same shape as the 2x/3x routines: %rax is a negative index climbing
# towards zero and every loop iteration applies two rounds. The raw
# ".byte 0x0f,0x1f,0x00" in the prologue is emitted verbatim.
# NOTE(review): it looks like a 3-byte NOP used as padding - confirm.
$code.=<<___;
.type _aesni_${dir}rypt4,\@abi-omnipotent
.align 16
_aesni_${dir}rypt4:
 $movkey ($key),$rndkey0
 shl \$4,$rounds
 $movkey 16($key),$rndkey1
 xorps $rndkey0,$inout0
 xorps $rndkey0,$inout1
 xorps $rndkey0,$inout2
 xorps $rndkey0,$inout3
 $movkey 32($key),$rndkey0
 lea 32($key,$rounds),$key
 neg %rax # $rounds
 .byte 0x0f,0x1f,0x00
 add \$16,%rax

.L${dir}_loop4:
 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir} $rndkey1,$inout2
 aes${dir} $rndkey1,$inout3
 $movkey ($key,%rax),$rndkey1
 add \$32,%rax
 aes${dir} $rndkey0,$inout0
 aes${dir} $rndkey0,$inout1
 aes${dir} $rndkey0,$inout2
 aes${dir} $rndkey0,$inout3
 $movkey -16($key,%rax),$rndkey0
 jnz .L${dir}_loop4

 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir} $rndkey1,$inout2
 aes${dir} $rndkey1,$inout3
 aes${dir}last $rndkey0,$inout0
 aes${dir}last $rndkey0,$inout1
 aes${dir}last $rndkey0,$inout2
 aes${dir}last $rndkey0,$inout3
 ret
.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# aesni_generate6($dir): append the private routine _aesni_${dir}rypt6 to
# $code, where $dir is "enc" or "dec".
#
# The routine processes $inout0..$inout5 in parallel.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
#
# Unlike the smaller variants, the prologue interleaves the round-0 xor
# of the later blocks with the first aes${dir} round of $inout0..$inout2,
# and then jumps into the middle of the loop (.L${dir}_loop6_enter) where
# that same round is applied to $inout3..$inout5. From there on every
# loop iteration applies two rounds to all six blocks, with %rax as a
# negative index climbing towards zero.
$code.=<<___;
.type _aesni_${dir}rypt6,\@abi-omnipotent
.align 16
_aesni_${dir}rypt6:
 $movkey ($key),$rndkey0
 shl \$4,$rounds
 $movkey 16($key),$rndkey1
 xorps $rndkey0,$inout0
 pxor $rndkey0,$inout1
 pxor $rndkey0,$inout2
 aes${dir} $rndkey1,$inout0
 lea 32($key,$rounds),$key
 neg %rax # $rounds
 aes${dir} $rndkey1,$inout1
 pxor $rndkey0,$inout3
 pxor $rndkey0,$inout4
 aes${dir} $rndkey1,$inout2
 pxor $rndkey0,$inout5
 $movkey ($key,%rax),$rndkey0
 add \$16,%rax
 jmp .L${dir}_loop6_enter
.align 16
.L${dir}_loop6:
 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir} $rndkey1,$inout2
.L${dir}_loop6_enter:
 aes${dir} $rndkey1,$inout3
 aes${dir} $rndkey1,$inout4
 aes${dir} $rndkey1,$inout5
 $movkey ($key,%rax),$rndkey1
 add \$32,%rax
 aes${dir} $rndkey0,$inout0
 aes${dir} $rndkey0,$inout1
 aes${dir} $rndkey0,$inout2
 aes${dir} $rndkey0,$inout3
 aes${dir} $rndkey0,$inout4
 aes${dir} $rndkey0,$inout5
 $movkey -16($key,%rax),$rndkey0
 jnz .L${dir}_loop6

 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir} $rndkey1,$inout2
 aes${dir} $rndkey1,$inout3
 aes${dir} $rndkey1,$inout4
 aes${dir} $rndkey1,$inout5
 aes${dir}last $rndkey0,$inout0
 aes${dir}last $rndkey0,$inout1
 aes${dir}last $rndkey0,$inout2
 aes${dir}last $rndkey0,$inout3
 aes${dir}last $rndkey0,$inout4
 aes${dir}last $rndkey0,$inout5
 ret
.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# aesni_generate8($dir): append the private routine _aesni_${dir}rypt8 to
# $code, where $dir is "enc" or "dec".
#
# The routine processes $inout0..$inout7 in parallel.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
#
# The prologue interleaves the round-0 xor of the later blocks with the
# first aes${dir} round of $inout0 and $inout1, and then jumps into the
# loop at .L${dir}_loop8_inner where that same round is applied to
# $inout2..$inout7. From there on every loop iteration applies two rounds
# to all eight blocks, with %rax as a negative index climbing towards
# zero. The .L${dir}_loop8_enter label is not referenced inside this
# routine. NOTE(review): presumably it is an entry point for code
# elsewhere in the file - confirm before removing it.
$code.=<<___;
.type _aesni_${dir}rypt8,\@abi-omnipotent
.align 16
_aesni_${dir}rypt8:
 $movkey ($key),$rndkey0
 shl \$4,$rounds
 $movkey 16($key),$rndkey1
 xorps $rndkey0,$inout0
 xorps $rndkey0,$inout1
 pxor $rndkey0,$inout2
 pxor $rndkey0,$inout3
 pxor $rndkey0,$inout4
 lea 32($key,$rounds),$key
 neg %rax # $rounds
 aes${dir} $rndkey1,$inout0
 pxor $rndkey0,$inout5
 pxor $rndkey0,$inout6
 aes${dir} $rndkey1,$inout1
 pxor $rndkey0,$inout7
 $movkey ($key,%rax),$rndkey0
 add \$16,%rax
 jmp .L${dir}_loop8_inner
.align 16
.L${dir}_loop8:
 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
.L${dir}_loop8_inner:
 aes${dir} $rndkey1,$inout2
 aes${dir} $rndkey1,$inout3
 aes${dir} $rndkey1,$inout4
 aes${dir} $rndkey1,$inout5
 aes${dir} $rndkey1,$inout6
 aes${dir} $rndkey1,$inout7
.L${dir}_loop8_enter:
 $movkey ($key,%rax),$rndkey1
 add \$32,%rax
 aes${dir} $rndkey0,$inout0
 aes${dir} $rndkey0,$inout1
 aes${dir} $rndkey0,$inout2
 aes${dir} $rndkey0,$inout3
 aes${dir} $rndkey0,$inout4
 aes${dir} $rndkey0,$inout5
 aes${dir} $rndkey0,$inout6
 aes${dir} $rndkey0,$inout7
 $movkey -16($key,%rax),$rndkey0
 jnz .L${dir}_loop8

 aes${dir} $rndkey1,$inout0
 aes${dir} $rndkey1,$inout1
 aes${dir} $rndkey1,$inout2
 aes${dir} $rndkey1,$inout3
 aes${dir} $rndkey1,$inout4
 aes${dir} $rndkey1,$inout5
 aes${dir} $rndkey1,$inout6
 aes${dir} $rndkey1,$inout7
 aes${dir}last $rndkey0,$inout0
 aes${dir}last $rndkey0,$inout1
 aes${dir}last $rndkey0,$inout2
 aes${dir}last $rndkey0,$inout3
 aes${dir}last $rndkey0,$inout4
 aes${dir}last $rndkey0,$inout5
 aes${dir}last $rndkey0,$inout6
 aes${dir}last $rndkey0,$inout7
 ret
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Emit the private interleaved helpers in order of interleave factor.
# The decrypt variants are always generated; the encrypt variants only
# when $PREFIX is "aesni".
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aesni");
&aesni_generate8("dec");

591if ($PREFIX eq "aesni") {
592########################################################################
593# void aesni_ecb_encrypt (const void *in, void *out,
594# size_t length, const AES_KEY *key,
595# int enc);
596$code.=<<___;
597.globl aesni_ecb_encrypt
598.type aesni_ecb_encrypt,\@function,5
599.align 16
600aesni_ecb_encrypt:
601___
602$code.=<<___ if ($win64);
603 lea -0x58(%rsp),%rsp
Adam Langleye9ada862015-05-11 17:20:37 -0700604 movaps %xmm6,(%rsp) # offload $inout4..7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800605 movaps %xmm7,0x10(%rsp)
606 movaps %xmm8,0x20(%rsp)
607 movaps %xmm9,0x30(%rsp)
608.Lecb_enc_body:
609___
610$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -0700611 and \$-16,$len # if ($len<16)
612 jz .Lecb_ret # return
Adam Langleyd9e397b2015-01-22 14:27:53 -0800613
614 mov 240($key),$rounds # key->rounds
615 $movkey ($key),$rndkey0
616 mov $key,$key_ # backup $key
617 mov $rounds,$rnds_ # backup $rounds
618 test %r8d,%r8d # 5th argument
619 jz .Lecb_decrypt
620#--------------------------- ECB ENCRYPT ------------------------------#
Adam Langleye9ada862015-05-11 17:20:37 -0700621 cmp \$0x80,$len # if ($len<8*16)
622 jb .Lecb_enc_tail # short input
Adam Langleyd9e397b2015-01-22 14:27:53 -0800623
Adam Langleye9ada862015-05-11 17:20:37 -0700624 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800625 movdqu 0x10($inp),$inout1
626 movdqu 0x20($inp),$inout2
627 movdqu 0x30($inp),$inout3
628 movdqu 0x40($inp),$inout4
629 movdqu 0x50($inp),$inout5
630 movdqu 0x60($inp),$inout6
631 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700632 lea 0x80($inp),$inp # $inp+=8*16
633 sub \$0x80,$len # $len-=8*16 (can be zero)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800634 jmp .Lecb_enc_loop8_enter
635.align 16
636.Lecb_enc_loop8:
Adam Langleye9ada862015-05-11 17:20:37 -0700637 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800638 mov $key_,$key # restore $key
Adam Langleye9ada862015-05-11 17:20:37 -0700639 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800640 mov $rnds_,$rounds # restore $rounds
641 movups $inout1,0x10($out)
642 movdqu 0x10($inp),$inout1
643 movups $inout2,0x20($out)
644 movdqu 0x20($inp),$inout2
645 movups $inout3,0x30($out)
646 movdqu 0x30($inp),$inout3
647 movups $inout4,0x40($out)
648 movdqu 0x40($inp),$inout4
649 movups $inout5,0x50($out)
650 movdqu 0x50($inp),$inout5
651 movups $inout6,0x60($out)
652 movdqu 0x60($inp),$inout6
653 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700654 lea 0x80($out),$out # $out+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800655 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700656 lea 0x80($inp),$inp # $inp+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800657.Lecb_enc_loop8_enter:
658
659 call _aesni_encrypt8
660
661 sub \$0x80,$len
Adam Langleye9ada862015-05-11 17:20:37 -0700662 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -0800663
Adam Langleye9ada862015-05-11 17:20:37 -0700664 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800665 mov $key_,$key # restore $key
666 movups $inout1,0x10($out)
667 mov $rnds_,$rounds # restore $rounds
668 movups $inout2,0x20($out)
669 movups $inout3,0x30($out)
670 movups $inout4,0x40($out)
671 movups $inout5,0x50($out)
672 movups $inout6,0x60($out)
673 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700674 lea 0x80($out),$out # $out+=8*16
675 add \$0x80,$len # restore real remaining $len
676 jz .Lecb_ret # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800677
Adam Langleye9ada862015-05-11 17:20:37 -0700678.Lecb_enc_tail: # $len is less than 8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800679 movups ($inp),$inout0
680 cmp \$0x20,$len
681 jb .Lecb_enc_one
682 movups 0x10($inp),$inout1
683 je .Lecb_enc_two
684 movups 0x20($inp),$inout2
685 cmp \$0x40,$len
686 jb .Lecb_enc_three
687 movups 0x30($inp),$inout3
688 je .Lecb_enc_four
689 movups 0x40($inp),$inout4
690 cmp \$0x60,$len
691 jb .Lecb_enc_five
692 movups 0x50($inp),$inout5
693 je .Lecb_enc_six
694 movdqu 0x60($inp),$inout6
Adam Langleye9ada862015-05-11 17:20:37 -0700695 xorps $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800696 call _aesni_encrypt8
Adam Langleye9ada862015-05-11 17:20:37 -0700697 movups $inout0,($out) # store 7 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800698 movups $inout1,0x10($out)
699 movups $inout2,0x20($out)
700 movups $inout3,0x30($out)
701 movups $inout4,0x40($out)
702 movups $inout5,0x50($out)
703 movups $inout6,0x60($out)
704 jmp .Lecb_ret
705.align 16
706.Lecb_enc_one:
707___
708 &aesni_generate1("enc",$key,$rounds);
709$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -0700710 movups $inout0,($out) # store one output block
Adam Langleyd9e397b2015-01-22 14:27:53 -0800711 jmp .Lecb_ret
712.align 16
713.Lecb_enc_two:
714 call _aesni_encrypt2
Adam Langleye9ada862015-05-11 17:20:37 -0700715 movups $inout0,($out) # store 2 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800716 movups $inout1,0x10($out)
717 jmp .Lecb_ret
718.align 16
719.Lecb_enc_three:
720 call _aesni_encrypt3
Adam Langleye9ada862015-05-11 17:20:37 -0700721 movups $inout0,($out) # store 3 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800722 movups $inout1,0x10($out)
723 movups $inout2,0x20($out)
724 jmp .Lecb_ret
725.align 16
726.Lecb_enc_four:
727 call _aesni_encrypt4
Adam Langleye9ada862015-05-11 17:20:37 -0700728 movups $inout0,($out) # store 4 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800729 movups $inout1,0x10($out)
730 movups $inout2,0x20($out)
731 movups $inout3,0x30($out)
732 jmp .Lecb_ret
733.align 16
734.Lecb_enc_five:
735 xorps $inout5,$inout5
736 call _aesni_encrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700737 movups $inout0,($out) # store 5 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800738 movups $inout1,0x10($out)
739 movups $inout2,0x20($out)
740 movups $inout3,0x30($out)
741 movups $inout4,0x40($out)
742 jmp .Lecb_ret
743.align 16
744.Lecb_enc_six:
745 call _aesni_encrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700746 movups $inout0,($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800747 movups $inout1,0x10($out)
748 movups $inout2,0x20($out)
749 movups $inout3,0x30($out)
750 movups $inout4,0x40($out)
751 movups $inout5,0x50($out)
752 jmp .Lecb_ret
753 #--------------------------- ECB DECRYPT ------------------------------#
754.align 16
755.Lecb_decrypt:
Adam Langleye9ada862015-05-11 17:20:37 -0700756 cmp \$0x80,$len # if ($len<8*16)
757 jb .Lecb_dec_tail # short input
Adam Langleyd9e397b2015-01-22 14:27:53 -0800758
Adam Langleye9ada862015-05-11 17:20:37 -0700759 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800760 movdqu 0x10($inp),$inout1
761 movdqu 0x20($inp),$inout2
762 movdqu 0x30($inp),$inout3
763 movdqu 0x40($inp),$inout4
764 movdqu 0x50($inp),$inout5
765 movdqu 0x60($inp),$inout6
766 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700767 lea 0x80($inp),$inp # $inp+=8*16
768 sub \$0x80,$len # $len-=8*16 (can be zero)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800769 jmp .Lecb_dec_loop8_enter
770.align 16
771.Lecb_dec_loop8:
Adam Langleye9ada862015-05-11 17:20:37 -0700772 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800773 mov $key_,$key # restore $key
Adam Langleye9ada862015-05-11 17:20:37 -0700774 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800775 mov $rnds_,$rounds # restore $rounds
776 movups $inout1,0x10($out)
777 movdqu 0x10($inp),$inout1
778 movups $inout2,0x20($out)
779 movdqu 0x20($inp),$inout2
780 movups $inout3,0x30($out)
781 movdqu 0x30($inp),$inout3
782 movups $inout4,0x40($out)
783 movdqu 0x40($inp),$inout4
784 movups $inout5,0x50($out)
785 movdqu 0x50($inp),$inout5
786 movups $inout6,0x60($out)
787 movdqu 0x60($inp),$inout6
788 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700789 lea 0x80($out),$out # $out+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800790 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700791 lea 0x80($inp),$inp # $inp+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800792.Lecb_dec_loop8_enter:
793
794 call _aesni_decrypt8
795
796 $movkey ($key_),$rndkey0
797 sub \$0x80,$len
Adam Langleye9ada862015-05-11 17:20:37 -0700798 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -0800799
Adam Langleye9ada862015-05-11 17:20:37 -0700800 movups $inout0,($out) # store 8 output blocks
801 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800802 mov $key_,$key # restore $key
803 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700804 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800805 mov $rnds_,$rounds # restore $rounds
806 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700807 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800808 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700809 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800810 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700811 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -0800812 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700813 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800814 movups $inout6,0x60($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700815 pxor $inout6,$inout6
Adam Langleyd9e397b2015-01-22 14:27:53 -0800816 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700817 pxor $inout7,$inout7
818 lea 0x80($out),$out # $out+=8*16
819 add \$0x80,$len # restore real remaining $len
820 jz .Lecb_ret # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800821
822.Lecb_dec_tail:
823 movups ($inp),$inout0
824 cmp \$0x20,$len
825 jb .Lecb_dec_one
826 movups 0x10($inp),$inout1
827 je .Lecb_dec_two
828 movups 0x20($inp),$inout2
829 cmp \$0x40,$len
830 jb .Lecb_dec_three
831 movups 0x30($inp),$inout3
832 je .Lecb_dec_four
833 movups 0x40($inp),$inout4
834 cmp \$0x60,$len
835 jb .Lecb_dec_five
836 movups 0x50($inp),$inout5
837 je .Lecb_dec_six
838 movups 0x60($inp),$inout6
839 $movkey ($key),$rndkey0
Adam Langleye9ada862015-05-11 17:20:37 -0700840 xorps $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800841 call _aesni_decrypt8
Adam Langleye9ada862015-05-11 17:20:37 -0700842 movups $inout0,($out) # store 7 output blocks
843 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800844 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700845 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800846 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700847 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800848 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700849 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800850 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700851 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -0800852 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700853 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800854 movups $inout6,0x60($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700855 pxor $inout6,$inout6
856 pxor $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800857 jmp .Lecb_ret
858.align 16
859.Lecb_dec_one:
860___
861 &aesni_generate1("dec",$key,$rounds);
862$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -0700863 movups $inout0,($out) # store one output block
864 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800865 jmp .Lecb_ret
866.align 16
867.Lecb_dec_two:
868 call _aesni_decrypt2
Adam Langleye9ada862015-05-11 17:20:37 -0700869 movups $inout0,($out) # store 2 output blocks
870 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800871 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700872 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800873 jmp .Lecb_ret
874.align 16
875.Lecb_dec_three:
876 call _aesni_decrypt3
Adam Langleye9ada862015-05-11 17:20:37 -0700877 movups $inout0,($out) # store 3 output blocks
878 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800879 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700880 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800881 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700882 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800883 jmp .Lecb_ret
884.align 16
885.Lecb_dec_four:
886 call _aesni_decrypt4
Adam Langleye9ada862015-05-11 17:20:37 -0700887 movups $inout0,($out) # store 4 output blocks
888 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800889 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700890 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800891 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700892 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800893 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700894 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800895 jmp .Lecb_ret
896.align 16
897.Lecb_dec_five:
898 xorps $inout5,$inout5
899 call _aesni_decrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700900 movups $inout0,($out) # store 5 output blocks
901 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800902 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700903 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800904 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700905 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800906 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700907 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800908 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700909 pxor $inout4,$inout4
910 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800911 jmp .Lecb_ret
912.align 16
913.Lecb_dec_six:
914 call _aesni_decrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700915 movups $inout0,($out) # store 6 output blocks
916 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800917 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700918 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800919 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700920 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800921 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700922 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800923 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700924 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -0800925 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700926 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800927
928.Lecb_ret:
Adam Langleye9ada862015-05-11 17:20:37 -0700929 xorps $rndkey0,$rndkey0 # %xmm0
930 pxor $rndkey1,$rndkey1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800931___
932$code.=<<___ if ($win64);
933 movaps (%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -0700934 movaps %xmm0,(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -0800935 movaps 0x10(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -0700936 movaps %xmm0,0x10(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800937 movaps 0x20(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -0700938 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800939 movaps 0x30(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -0700940 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800941 lea 0x58(%rsp),%rsp
942.Lecb_enc_ret:
943___
944$code.=<<___;
945 ret
946.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
947___
948
949{
950######################################################################
951# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
952# size_t blocks, const AES_KEY *key,
953# const char *ivec,char *cmac);
954#
955# Handles only complete blocks, operates on 64-bit counter and
956# does not update *ivec! Nor does it finalize CMAC value
957# (see engine/eng_aesni.c for details)
958#
959{
# Register assignments shared by the CCM64 routines generated in this scope:
# the sixth argument (pointer to the running CMAC) and three XMM registers
# that hold the counter increment, the counter block and the byte-swap mask.
960my $cmac="%r9"; # 6th argument
961
962my $increment="%xmm9";
963my $iv="%xmm6";
964my $bswap_mask="%xmm7";
965
# aesni_ccm64_encrypt_blocks: emit the global symbol, its type, the
# alignment directive and the entry label.
966$code.=<<___;
967.globl aesni_ccm64_encrypt_blocks
968.type aesni_ccm64_encrypt_blocks,\@function,6
969.align 16
970aesni_ccm64_encrypt_blocks:
971___
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9 (annotated
# below as $iv, $bswap_mask, $in0 and $increment) so that the epilogue
# further down can restore them before returning.
972$code.=<<___ if ($win64);
973 lea -0x58(%rsp),%rsp
Adam Langleye9ada862015-05-11 17:20:37 -0700974 movaps %xmm6,(%rsp) # $iv
975 movaps %xmm7,0x10(%rsp) # $bswap_mask
976 movaps %xmm8,0x20(%rsp) # $in0
977 movaps %xmm9,0x30(%rsp) # $increment
Adam Langleyd9e397b2015-01-22 14:27:53 -0800978.Lccm64_enc_body:
979___
# Main body.
#
# Setup: read key->rounds, load the counter block from ($ivp) and the
# running MAC from ($cmac), and fetch the .Lincrement64 and .Lbswap_mask
# constants.  $key_ keeps the start of the key schedule while $key is moved
# to its end, so the round loop can walk the schedule with an index in %rax
# that is advanced by 32 until it reaches zero.  $iv is held byte-swapped
# and each copy is swapped back into $inout0 before being encrypted,
# presumably so that paddq can increment the counter directly.
# NOTE(review): the raw %rax and %r10 operands presumably alias $rounds and
# $rnds_, which are defined outside this chunk -- confirm before editing.
#
# .Lccm64_enc_outer handles one 16-byte block per iteration: the counter
# ($inout0) and cmac^inp ($inout1) run through the same aesenc sequence side
# by side, the output block inp^E(iv) is stored to ($out), $iv is advanced
# by $increment and $len (counted in blocks) is decremented.  The jnz at the
# bottom of the loop relies on the flags left by that earlier dec, so the
# instructions in between must not be replaced by flag-setting ones.
#
# After the loop the MAC is written back to ($cmac) and the working
# registers are zeroed.
980$code.=<<___;
981 mov 240($key),$rounds # key->rounds
982 movdqu ($ivp),$iv
983 movdqa .Lincrement64(%rip),$increment
984 movdqa .Lbswap_mask(%rip),$bswap_mask
985
986 shl \$4,$rounds
987 mov \$16,$rnds_
988 lea 0($key),$key_
989 movdqu ($cmac),$inout1
990 movdqa $iv,$inout0
991 lea 32($key,$rounds),$key # end of key schedule
992 pshufb $bswap_mask,$iv
993 sub %rax,%r10 # twisted $rounds
994 jmp .Lccm64_enc_outer
995.align 16
996.Lccm64_enc_outer:
997 $movkey ($key_),$rndkey0
998 mov %r10,%rax
999 movups ($inp),$in0 # load inp
1000
1001 xorps $rndkey0,$inout0 # counter
1002 $movkey 16($key_),$rndkey1
1003 xorps $in0,$rndkey0
1004 xorps $rndkey0,$inout1 # cmac^=inp
1005 $movkey 32($key_),$rndkey0
1006
1007.Lccm64_enc2_loop:
1008 aesenc $rndkey1,$inout0
1009 aesenc $rndkey1,$inout1
1010 $movkey ($key,%rax),$rndkey1
1011 add \$32,%rax
1012 aesenc $rndkey0,$inout0
1013 aesenc $rndkey0,$inout1
1014 $movkey -16($key,%rax),$rndkey0
1015 jnz .Lccm64_enc2_loop
1016 aesenc $rndkey1,$inout0
1017 aesenc $rndkey1,$inout1
1018 paddq $increment,$iv
Adam Langleye9ada862015-05-11 17:20:37 -07001019 dec $len # $len-- ($len is in blocks)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001020 aesenclast $rndkey0,$inout0
1021 aesenclast $rndkey0,$inout1
1022
1023 lea 16($inp),$inp
1024 xorps $inout0,$in0 # inp ^= E(iv)
1025 movdqa $iv,$inout0
1026 movups $in0,($out) # save output
1027 pshufb $bswap_mask,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07001028 lea 16($out),$out # $out+=16
1029 jnz .Lccm64_enc_outer # loop if ($len!=0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001030
Adam Langleye9ada862015-05-11 17:20:37 -07001031 pxor $rndkey0,$rndkey0 # clear register bank
1032 pxor $rndkey1,$rndkey1
1033 pxor $inout0,$inout0
1034 movups $inout1,($cmac) # store resulting mac
1035 pxor $inout1,$inout1
1036 pxor $in0,$in0
1037 pxor $iv,$iv
Adam Langleyd9e397b2015-01-22 14:27:53 -08001038___
# Win64 only: restore %xmm6-%xmm9 from the save area, overwrite each save
# slot with %xmm0 and release the 0x58-byte frame.
1039$code.=<<___ if ($win64);
1040 movaps (%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -07001041 movaps %xmm0,(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -08001042 movaps 0x10(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -07001043 movaps %xmm0,0x10(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001044 movaps 0x20(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -07001045 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001046 movaps 0x30(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -07001047 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001048 lea 0x58(%rsp),%rsp
1049.Lccm64_enc_ret:
1050___
# Return to the caller and record the symbol size.
1051$code.=<<___;
1052 ret
1053.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1054___
1055######################################################################
# aesni_ccm64_decrypt_blocks: decryption counterpart of
# aesni_ccm64_encrypt_blocks.  Emit the global symbol, its type, the
# alignment directive and the entry label.
1056$code.=<<___;
1057.globl aesni_ccm64_decrypt_blocks
1058.type aesni_ccm64_decrypt_blocks,\@function,6
1059.align 16
1060aesni_ccm64_decrypt_blocks:
1061___
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9 so that the
# epilogue further down can restore them before returning.
# NOTE(review): the $in8 annotation on the %xmm8 spill looks like a typo for
# $in0 -- the matching prologue of aesni_ccm64_encrypt_blocks labels the same
# slot $in0.  It is left as is here because it lives inside the emitted text.
1062$code.=<<___ if ($win64);
1063 lea -0x58(%rsp),%rsp
Adam Langleye9ada862015-05-11 17:20:37 -07001064 movaps %xmm6,(%rsp) # $iv
1065 movaps %xmm7,0x10(%rsp) # $bswap_mask
1066 movaps %xmm8,0x20(%rsp) # $in8
1067 movaps %xmm9,0x30(%rsp) # $increment
Adam Langleyd9e397b2015-01-22 14:27:53 -08001068.Lccm64_dec_body:
1069___
# Setup: read key->rounds, load the counter block from ($ivp) and the
# running MAC from ($cmac), fetch the .Lincrement64 and .Lbswap_mask
# constants, copy the counter into $inout0, stash $rounds and $key in
# $rnds_ and $key_, and byte-swap $iv.
1070$code.=<<___;
1071 mov 240($key),$rounds # key->rounds
1072 movups ($ivp),$iv
1073 movdqu ($cmac),$inout1
1074 movdqa .Lincrement64(%rip),$increment
1075 movdqa .Lbswap_mask(%rip),$bswap_mask
1076
1077 movaps $iv,$inout0
1078 mov $rounds,$rnds_
1079 mov $key,$key_
1080 pshufb $bswap_mask,$iv
1081___
# Presumably emits a single-block encryption of the first counter block in
# $inout0 before the loop starts (aesni_generate1 is defined outside this
# chunk -- confirm).
1082 &aesni_generate1("enc",$key,$rounds);
# Main loop.
#
# Loop setup loads the first input block, advances $iv by $increment,
# points $key at the end of the key schedule and keeps the loop index in
# %r10 for reuse on every iteration.
# NOTE(review): the raw %rax and %r10 operands presumably alias $rounds and
# $rnds_, which are defined outside this chunk -- confirm before editing.
#
# .Lccm64_dec_outer first produces and stores the output block inp^E(iv) and
# prepares the next counter in $inout0.  If blocks remain ($len is counted
# in blocks), the stored output is folded into the MAC and the next counter
# and the MAC run through the same aesenc sequence side by side in
# .Lccm64_dec2_loop while the next input block is loaded.  When $len reaches
# zero control leaves through .Lccm64_dec_break, where key->rounds is
# re-read from the saved key pointer $key_.
1083$code.=<<___;
1084 shl \$4,$rnds_
1085 mov \$16,$rounds
1086 movups ($inp),$in0 # load inp
1087 paddq $increment,$iv
Adam Langleye9ada862015-05-11 17:20:37 -07001088 lea 16($inp),$inp # $inp+=16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001089 sub %r10,%rax # twisted $rounds
1090 lea 32($key_,$rnds_),$key # end of key schedule
1091 mov %rax,%r10
1092 jmp .Lccm64_dec_outer
1093.align 16
1094.Lccm64_dec_outer:
1095 xorps $inout0,$in0 # inp ^= E(iv)
1096 movdqa $iv,$inout0
1097 movups $in0,($out) # save output
Adam Langleye9ada862015-05-11 17:20:37 -07001098 lea 16($out),$out # $out+=16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001099 pshufb $bswap_mask,$inout0
1100
Adam Langleye9ada862015-05-11 17:20:37 -07001101 sub \$1,$len # $len-- ($len is in blocks)
1102 jz .Lccm64_dec_break # if ($len==0) break
Adam Langleyd9e397b2015-01-22 14:27:53 -08001103
1104 $movkey ($key_),$rndkey0
1105 mov %r10,%rax
1106 $movkey 16($key_),$rndkey1
1107 xorps $rndkey0,$in0
1108 xorps $rndkey0,$inout0
1109 xorps $in0,$inout1 # cmac^=out
1110 $movkey 32($key_),$rndkey0
1111 jmp .Lccm64_dec2_loop
1112.align 16
1113.Lccm64_dec2_loop:
1114 aesenc $rndkey1,$inout0
1115 aesenc $rndkey1,$inout1
1116 $movkey ($key,%rax),$rndkey1
1117 add \$32,%rax
1118 aesenc $rndkey0,$inout0
1119 aesenc $rndkey0,$inout1
1120 $movkey -16($key,%rax),$rndkey0
1121 jnz .Lccm64_dec2_loop
Adam Langleye9ada862015-05-11 17:20:37 -07001122 movups ($inp),$in0 # load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001123 paddq $increment,$iv
1124 aesenc $rndkey1,$inout0
1125 aesenc $rndkey1,$inout1
1126 aesenclast $rndkey0,$inout0
1127 aesenclast $rndkey0,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001128 lea 16($inp),$inp # $inp+=16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001129 jmp .Lccm64_dec_outer
1130
1131.align 16
1132.Lccm64_dec_break:
1133 #xorps $in0,$inout1 # cmac^=out
1134 mov 240($key_),$rounds
1135___
# Last block: one more single-block encryption, this time applied to the MAC
# in $inout1 with $in0 passed as an extra operand.
# NOTE(review): given the commented-out xorps just above, the helper
# presumably folds $in0 into $inout1 itself -- confirm against
# aesni_generate1, which is defined outside this chunk.
1136 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Write the MAC back to ($cmac) and zero the working registers.
1137$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -07001138 pxor $rndkey0,$rndkey0 # clear register bank
1139 pxor $rndkey1,$rndkey1
1140 pxor $inout0,$inout0
1141 movups $inout1,($cmac) # store resulting mac
1142 pxor $inout1,$inout1
1143 pxor $in0,$in0
1144 pxor $iv,$iv
Adam Langleyd9e397b2015-01-22 14:27:53 -08001145___
# Win64 only: restore %xmm6-%xmm9 from the save area, overwrite each save
# slot with %xmm0 and release the 0x58-byte frame.
1146$code.=<<___ if ($win64);
1147 movaps (%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -07001148 movaps %xmm0,(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -08001149 movaps 0x10(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -07001150 movaps %xmm0,0x10(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001151 movaps 0x20(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -07001152 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001153 movaps 0x30(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -07001154 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001155 lea 0x58(%rsp),%rsp
1156.Lccm64_dec_ret:
1157___
# Return to the caller and record the symbol size.
1158$code.=<<___;
1159 ret
1160.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1161___
1162}
1163######################################################################
1164# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1165# size_t blocks, const AES_KEY *key,
1166# const char *ivec);
1167#
1168# Handles only complete blocks, operates on 32-bit counter and
1169# does not update *ivec! (see crypto/modes/ctr128.c for details)
1170#
1171# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1172# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1173# Keywords are full unroll and modulo-schedule counter calculations
1174# with zero-round key xor.
1175{
1176my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
Robert Sloana94fe052017-02-21 08:49:28 -08001177my ($key0,$ctr)=("%ebp","${ivp}d");
Adam Langleyd9e397b2015-01-22 14:27:53 -08001178my $frame_size = 0x80 + ($win64?160:0);
1179
1180$code.=<<___;
1181.globl aesni_ctr32_encrypt_blocks
1182.type aesni_ctr32_encrypt_blocks,\@function,5
1183.align 16
1184aesni_ctr32_encrypt_blocks:
Robert Sloanab8b8882018-03-26 11:39:51 -07001185.cfi_startproc
Adam Langleye9ada862015-05-11 17:20:37 -07001186 cmp \$1,$len
1187 jne .Lctr32_bulk
1188
1189 # handle single block without allocating stack frame,
1190 # useful when handling edges
1191 movups ($ivp),$inout0
1192 movups ($inp),$inout1
1193 mov 240($key),%edx # key->rounds
1194___
1195 &aesni_generate1("enc",$key,"%edx");
1196$code.=<<___;
1197 pxor $rndkey0,$rndkey0 # clear register bank
1198 pxor $rndkey1,$rndkey1
1199 xorps $inout1,$inout0
1200 pxor $inout1,$inout1
1201 movups $inout0,($out)
1202 xorps $inout0,$inout0
1203 jmp .Lctr32_epilogue
1204
1205.align 16
1206.Lctr32_bulk:
Robert Sloana94fe052017-02-21 08:49:28 -08001207 lea (%rsp),$key_ # use $key_ as frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07001208.cfi_def_cfa_register $key_
Adam Langleyd9e397b2015-01-22 14:27:53 -08001209 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07001210.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001211 sub \$$frame_size,%rsp
1212 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1213___
1214$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001215 movaps %xmm6,-0xa8($key_) # offload everything
1216 movaps %xmm7,-0x98($key_)
1217 movaps %xmm8,-0x88($key_)
1218 movaps %xmm9,-0x78($key_)
1219 movaps %xmm10,-0x68($key_)
1220 movaps %xmm11,-0x58($key_)
1221 movaps %xmm12,-0x48($key_)
1222 movaps %xmm13,-0x38($key_)
1223 movaps %xmm14,-0x28($key_)
1224 movaps %xmm15,-0x18($key_)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001225.Lctr32_body:
1226___
1227$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08001228
Adam Langleye9ada862015-05-11 17:20:37 -07001229 # 8 16-byte words on top of stack are counter values
1230 # xor-ed with zero-round key
Adam Langleyd9e397b2015-01-22 14:27:53 -08001231
1232 movdqu ($ivp),$inout0
1233 movdqu ($key),$rndkey0
1234 mov 12($ivp),$ctr # counter LSB
1235 pxor $rndkey0,$inout0
1236 mov 12($key),$key0 # 0-round key LSB
1237 movdqa $inout0,0x00(%rsp) # populate counter block
1238 bswap $ctr
1239 movdqa $inout0,$inout1
1240 movdqa $inout0,$inout2
1241 movdqa $inout0,$inout3
1242 movdqa $inout0,0x40(%rsp)
1243 movdqa $inout0,0x50(%rsp)
1244 movdqa $inout0,0x60(%rsp)
Adam Langleye9ada862015-05-11 17:20:37 -07001245 mov %rdx,%r10 # about to borrow %rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001246 movdqa $inout0,0x70(%rsp)
1247
1248 lea 1($ctr),%rax
1249 lea 2($ctr),%rdx
1250 bswap %eax
1251 bswap %edx
1252 xor $key0,%eax
1253 xor $key0,%edx
1254 pinsrd \$3,%eax,$inout1
1255 lea 3($ctr),%rax
1256 movdqa $inout1,0x10(%rsp)
1257 pinsrd \$3,%edx,$inout2
1258 bswap %eax
1259 mov %r10,%rdx # restore %rdx
1260 lea 4($ctr),%r10
1261 movdqa $inout2,0x20(%rsp)
1262 xor $key0,%eax
1263 bswap %r10d
1264 pinsrd \$3,%eax,$inout3
1265 xor $key0,%r10d
1266 movdqa $inout3,0x30(%rsp)
1267 lea 5($ctr),%r9
1268 mov %r10d,0x40+12(%rsp)
1269 bswap %r9d
1270 lea 6($ctr),%r10
1271 mov 240($key),$rounds # key->rounds
1272 xor $key0,%r9d
1273 bswap %r10d
1274 mov %r9d,0x50+12(%rsp)
1275 xor $key0,%r10d
1276 lea 7($ctr),%r9
1277 mov %r10d,0x60+12(%rsp)
1278 bswap %r9d
Robert Sloan2424d842017-05-01 07:46:28 -07001279 leaq OPENSSL_ia32cap_P(%rip),%r10
Robert Sloan572a4e22017-04-17 10:52:19 -07001280 mov 4(%r10),%r10d
Adam Langleyd9e397b2015-01-22 14:27:53 -08001281 xor $key0,%r9d
1282 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1283 mov %r9d,0x70+12(%rsp)
1284
1285 $movkey 0x10($key),$rndkey1
1286
1287 movdqa 0x40(%rsp),$inout4
1288 movdqa 0x50(%rsp),$inout5
1289
Adam Langleye9ada862015-05-11 17:20:37 -07001290 cmp \$8,$len # $len is in blocks
1291 jb .Lctr32_tail # short input if ($len<8)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001292
Adam Langleye9ada862015-05-11 17:20:37 -07001293 sub \$6,$len # $len is biased by -6
Adam Langleyd9e397b2015-01-22 14:27:53 -08001294 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
Adam Langleye9ada862015-05-11 17:20:37 -07001295 je .Lctr32_6x # [which denotes Atom Silvermont]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001296
1297 lea 0x80($key),$key # size optimization
Adam Langleye9ada862015-05-11 17:20:37 -07001298 sub \$2,$len # $len is biased by -8
Adam Langleyd9e397b2015-01-22 14:27:53 -08001299 jmp .Lctr32_loop8
1300
1301.align 16
1302.Lctr32_6x:
1303 shl \$4,$rounds
1304 mov \$48,$rnds_
1305 bswap $key0
1306 lea 32($key,$rounds),$key # end of key schedule
1307 sub %rax,%r10 # twisted $rounds
1308 jmp .Lctr32_loop6
1309
1310.align 16
1311.Lctr32_loop6:
Adam Langleye9ada862015-05-11 17:20:37 -07001312 add \$6,$ctr # next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001313 $movkey -48($key,$rnds_),$rndkey0
1314 aesenc $rndkey1,$inout0
1315 mov $ctr,%eax
1316 xor $key0,%eax
1317 aesenc $rndkey1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001318 movbe %eax,`0x00+12`(%rsp) # store next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001319 lea 1($ctr),%eax
1320 aesenc $rndkey1,$inout2
1321 xor $key0,%eax
1322 movbe %eax,`0x10+12`(%rsp)
1323 aesenc $rndkey1,$inout3
1324 lea 2($ctr),%eax
1325 xor $key0,%eax
1326 aesenc $rndkey1,$inout4
1327 movbe %eax,`0x20+12`(%rsp)
1328 lea 3($ctr),%eax
1329 aesenc $rndkey1,$inout5
1330 $movkey -32($key,$rnds_),$rndkey1
1331 xor $key0,%eax
1332
1333 aesenc $rndkey0,$inout0
1334 movbe %eax,`0x30+12`(%rsp)
1335 lea 4($ctr),%eax
1336 aesenc $rndkey0,$inout1
1337 xor $key0,%eax
1338 movbe %eax,`0x40+12`(%rsp)
1339 aesenc $rndkey0,$inout2
1340 lea 5($ctr),%eax
1341 xor $key0,%eax
1342 aesenc $rndkey0,$inout3
1343 movbe %eax,`0x50+12`(%rsp)
1344 mov %r10,%rax # mov $rnds_,$rounds
1345 aesenc $rndkey0,$inout4
1346 aesenc $rndkey0,$inout5
1347 $movkey -16($key,$rnds_),$rndkey0
1348
1349 call .Lenc_loop6
1350
Adam Langleye9ada862015-05-11 17:20:37 -07001351 movdqu ($inp),$inout6 # load 6 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001352 movdqu 0x10($inp),$inout7
1353 movdqu 0x20($inp),$in0
1354 movdqu 0x30($inp),$in1
1355 movdqu 0x40($inp),$in2
1356 movdqu 0x50($inp),$in3
Adam Langleye9ada862015-05-11 17:20:37 -07001357 lea 0x60($inp),$inp # $inp+=6*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001358 $movkey -64($key,$rnds_),$rndkey1
Adam Langleye9ada862015-05-11 17:20:37 -07001359 pxor $inout0,$inout6 # inp^=E(ctr)
1360 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001361 pxor $inout1,$inout7
1362 movaps 0x10(%rsp),$inout1
1363 pxor $inout2,$in0
1364 movaps 0x20(%rsp),$inout2
1365 pxor $inout3,$in1
1366 movaps 0x30(%rsp),$inout3
1367 pxor $inout4,$in2
1368 movaps 0x40(%rsp),$inout4
1369 pxor $inout5,$in3
1370 movaps 0x50(%rsp),$inout5
Adam Langleye9ada862015-05-11 17:20:37 -07001371 movdqu $inout6,($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001372 movdqu $inout7,0x10($out)
1373 movdqu $in0,0x20($out)
1374 movdqu $in1,0x30($out)
1375 movdqu $in2,0x40($out)
1376 movdqu $in3,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001377 lea 0x60($out),$out # $out+=6*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001378
Adam Langleye9ada862015-05-11 17:20:37 -07001379 sub \$6,$len
1380 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1381
1382 add \$6,$len # restore real remaining $len
1383 jz .Lctr32_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001384
1385 lea -48($rnds_),$rounds
1386 lea -80($key,$rnds_),$key # restore $key
1387 neg $rounds
1388 shr \$4,$rounds # restore $rounds
1389 jmp .Lctr32_tail
1390
1391.align 32
1392.Lctr32_loop8:
Adam Langleye9ada862015-05-11 17:20:37 -07001393 add \$8,$ctr # next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001394 movdqa 0x60(%rsp),$inout6
1395 aesenc $rndkey1,$inout0
1396 mov $ctr,%r9d
1397 movdqa 0x70(%rsp),$inout7
1398 aesenc $rndkey1,$inout1
1399 bswap %r9d
1400 $movkey 0x20-0x80($key),$rndkey0
1401 aesenc $rndkey1,$inout2
1402 xor $key0,%r9d
1403 nop
1404 aesenc $rndkey1,$inout3
Adam Langleye9ada862015-05-11 17:20:37 -07001405 mov %r9d,0x00+12(%rsp) # store next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001406 lea 1($ctr),%r9
1407 aesenc $rndkey1,$inout4
1408 aesenc $rndkey1,$inout5
1409 aesenc $rndkey1,$inout6
1410 aesenc $rndkey1,$inout7
1411 $movkey 0x30-0x80($key),$rndkey1
1412___
1413for($i=2;$i<8;$i++) {
1414my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1415$code.=<<___;
1416 bswap %r9d
1417 aesenc $rndkeyx,$inout0
1418 aesenc $rndkeyx,$inout1
1419 xor $key0,%r9d
1420 .byte 0x66,0x90
1421 aesenc $rndkeyx,$inout2
1422 aesenc $rndkeyx,$inout3
1423 mov %r9d,`0x10*($i-1)`+12(%rsp)
1424 lea $i($ctr),%r9
1425 aesenc $rndkeyx,$inout4
1426 aesenc $rndkeyx,$inout5
1427 aesenc $rndkeyx,$inout6
1428 aesenc $rndkeyx,$inout7
1429 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1430___
1431}
1432$code.=<<___;
1433 bswap %r9d
1434 aesenc $rndkey0,$inout0
1435 aesenc $rndkey0,$inout1
1436 aesenc $rndkey0,$inout2
1437 xor $key0,%r9d
Adam Langleye9ada862015-05-11 17:20:37 -07001438 movdqu 0x00($inp),$in0 # start loading input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001439 aesenc $rndkey0,$inout3
1440 mov %r9d,0x70+12(%rsp)
1441 cmp \$11,$rounds
1442 aesenc $rndkey0,$inout4
1443 aesenc $rndkey0,$inout5
1444 aesenc $rndkey0,$inout6
1445 aesenc $rndkey0,$inout7
1446 $movkey 0xa0-0x80($key),$rndkey0
1447
1448 jb .Lctr32_enc_done
1449
1450 aesenc $rndkey1,$inout0
1451 aesenc $rndkey1,$inout1
1452 aesenc $rndkey1,$inout2
1453 aesenc $rndkey1,$inout3
1454 aesenc $rndkey1,$inout4
1455 aesenc $rndkey1,$inout5
1456 aesenc $rndkey1,$inout6
1457 aesenc $rndkey1,$inout7
1458 $movkey 0xb0-0x80($key),$rndkey1
1459
1460 aesenc $rndkey0,$inout0
1461 aesenc $rndkey0,$inout1
1462 aesenc $rndkey0,$inout2
1463 aesenc $rndkey0,$inout3
1464 aesenc $rndkey0,$inout4
1465 aesenc $rndkey0,$inout5
1466 aesenc $rndkey0,$inout6
1467 aesenc $rndkey0,$inout7
1468 $movkey 0xc0-0x80($key),$rndkey0
1469 je .Lctr32_enc_done
1470
1471 aesenc $rndkey1,$inout0
1472 aesenc $rndkey1,$inout1
1473 aesenc $rndkey1,$inout2
1474 aesenc $rndkey1,$inout3
1475 aesenc $rndkey1,$inout4
1476 aesenc $rndkey1,$inout5
1477 aesenc $rndkey1,$inout6
1478 aesenc $rndkey1,$inout7
1479 $movkey 0xd0-0x80($key),$rndkey1
1480
1481 aesenc $rndkey0,$inout0
1482 aesenc $rndkey0,$inout1
1483 aesenc $rndkey0,$inout2
1484 aesenc $rndkey0,$inout3
1485 aesenc $rndkey0,$inout4
1486 aesenc $rndkey0,$inout5
1487 aesenc $rndkey0,$inout6
1488 aesenc $rndkey0,$inout7
1489 $movkey 0xe0-0x80($key),$rndkey0
1490 jmp .Lctr32_enc_done
1491
1492.align 16
1493.Lctr32_enc_done:
1494 movdqu 0x10($inp),$in1
Adam Langleye9ada862015-05-11 17:20:37 -07001495 pxor $rndkey0,$in0 # input^=round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001496 movdqu 0x20($inp),$in2
1497 pxor $rndkey0,$in1
1498 movdqu 0x30($inp),$in3
1499 pxor $rndkey0,$in2
1500 movdqu 0x40($inp),$in4
1501 pxor $rndkey0,$in3
1502 movdqu 0x50($inp),$in5
1503 pxor $rndkey0,$in4
1504 pxor $rndkey0,$in5
1505 aesenc $rndkey1,$inout0
1506 aesenc $rndkey1,$inout1
1507 aesenc $rndkey1,$inout2
1508 aesenc $rndkey1,$inout3
1509 aesenc $rndkey1,$inout4
1510 aesenc $rndkey1,$inout5
1511 aesenc $rndkey1,$inout6
1512 aesenc $rndkey1,$inout7
Adam Langleye9ada862015-05-11 17:20:37 -07001513 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1514 lea 0x80($inp),$inp # $inp+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001515
Adam Langleye9ada862015-05-11 17:20:37 -07001516 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1517 pxor $rndkey0,$rndkey1 # borrowed $rndkey
Adam Langleyd9e397b2015-01-22 14:27:53 -08001518 movdqu 0x70-0x80($inp),$in0
1519 aesenclast $in1,$inout1
1520 pxor $rndkey0,$in0
1521 movdqa 0x00(%rsp),$in1 # load next counter block
1522 aesenclast $in2,$inout2
1523 aesenclast $in3,$inout3
1524 movdqa 0x10(%rsp),$in2
1525 movdqa 0x20(%rsp),$in3
1526 aesenclast $in4,$inout4
1527 aesenclast $in5,$inout5
1528 movdqa 0x30(%rsp),$in4
1529 movdqa 0x40(%rsp),$in5
1530 aesenclast $rndkey1,$inout6
1531 movdqa 0x50(%rsp),$rndkey0
Adam Langleye9ada862015-05-11 17:20:37 -07001532 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
Adam Langleyd9e397b2015-01-22 14:27:53 -08001533 aesenclast $in0,$inout7
1534
Adam Langleye9ada862015-05-11 17:20:37 -07001535 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001536 movdqa $in1,$inout0
1537 movups $inout1,0x10($out)
1538 movdqa $in2,$inout1
1539 movups $inout2,0x20($out)
1540 movdqa $in3,$inout2
1541 movups $inout3,0x30($out)
1542 movdqa $in4,$inout3
1543 movups $inout4,0x40($out)
1544 movdqa $in5,$inout4
1545 movups $inout5,0x50($out)
1546 movdqa $rndkey0,$inout5
1547 movups $inout6,0x60($out)
1548 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001549 lea 0x80($out),$out # $out+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001550
Adam Langleye9ada862015-05-11 17:20:37 -07001551 sub \$8,$len
1552 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1553
Robert Sloanab8b8882018-03-26 11:39:51 -07001554 add \$8,$len # restore real remaining $len
Adam Langleye9ada862015-05-11 17:20:37 -07001555 jz .Lctr32_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001556 lea -0x80($key),$key
1557
1558.Lctr32_tail:
Adam Langleye9ada862015-05-11 17:20:37 -07001559 # note that at this point $inout0..5 are populated with
Robert Sloana94fe052017-02-21 08:49:28 -08001560 # counter values xor-ed with 0-round key
Adam Langleyd9e397b2015-01-22 14:27:53 -08001561 lea 16($key),$key
1562 cmp \$4,$len
1563 jb .Lctr32_loop3
1564 je .Lctr32_loop4
1565
Adam Langleye9ada862015-05-11 17:20:37 -07001566 # if ($len>4) compute 7 E(counter)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001567 shl \$4,$rounds
1568 movdqa 0x60(%rsp),$inout6
1569 pxor $inout7,$inout7
1570
1571 $movkey 16($key),$rndkey0
1572 aesenc $rndkey1,$inout0
1573 aesenc $rndkey1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001574 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
Adam Langleyd9e397b2015-01-22 14:27:53 -08001575 neg %rax
1576 aesenc $rndkey1,$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07001577 add \$16,%rax # prepare for .Lenc_loop8_enter
Adam Langleyd9e397b2015-01-22 14:27:53 -08001578 movups ($inp),$in0
1579 aesenc $rndkey1,$inout3
1580 aesenc $rndkey1,$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07001581 movups 0x10($inp),$in1 # pre-load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001582 movups 0x20($inp),$in2
1583 aesenc $rndkey1,$inout5
1584 aesenc $rndkey1,$inout6
1585
1586 call .Lenc_loop8_enter
1587
1588 movdqu 0x30($inp),$in3
1589 pxor $in0,$inout0
1590 movdqu 0x40($inp),$in0
1591 pxor $in1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001592 movdqu $inout0,($out) # store output
Adam Langleyd9e397b2015-01-22 14:27:53 -08001593 pxor $in2,$inout2
1594 movdqu $inout1,0x10($out)
1595 pxor $in3,$inout3
1596 movdqu $inout2,0x20($out)
1597 pxor $in0,$inout4
1598 movdqu $inout3,0x30($out)
1599 movdqu $inout4,0x40($out)
1600 cmp \$6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07001601 jb .Lctr32_done # $len was 5, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001602
1603 movups 0x50($inp),$in1
1604 xorps $in1,$inout5
1605 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001606 je .Lctr32_done # $len was 6, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001607
1608 movups 0x60($inp),$in2
1609 xorps $in2,$inout6
1610 movups $inout6,0x60($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001611 jmp .Lctr32_done # $len was 7, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001612
1613.align 32
1614.Lctr32_loop4:
1615 aesenc $rndkey1,$inout0
1616 lea 16($key),$key
1617 dec $rounds
1618 aesenc $rndkey1,$inout1
1619 aesenc $rndkey1,$inout2
1620 aesenc $rndkey1,$inout3
1621 $movkey ($key),$rndkey1
1622 jnz .Lctr32_loop4
1623 aesenclast $rndkey1,$inout0
1624 aesenclast $rndkey1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001625 movups ($inp),$in0 # load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001626 movups 0x10($inp),$in1
1627 aesenclast $rndkey1,$inout2
1628 aesenclast $rndkey1,$inout3
1629 movups 0x20($inp),$in2
1630 movups 0x30($inp),$in3
1631
1632 xorps $in0,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07001633 movups $inout0,($out) # store output
Adam Langleyd9e397b2015-01-22 14:27:53 -08001634 xorps $in1,$inout1
1635 movups $inout1,0x10($out)
1636 pxor $in2,$inout2
1637 movdqu $inout2,0x20($out)
1638 pxor $in3,$inout3
1639 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001640 jmp .Lctr32_done # $len was 4, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001641
1642.align 32
1643.Lctr32_loop3:
1644 aesenc $rndkey1,$inout0
1645 lea 16($key),$key
1646 dec $rounds
1647 aesenc $rndkey1,$inout1
1648 aesenc $rndkey1,$inout2
1649 $movkey ($key),$rndkey1
1650 jnz .Lctr32_loop3
1651 aesenclast $rndkey1,$inout0
1652 aesenclast $rndkey1,$inout1
1653 aesenclast $rndkey1,$inout2
1654
Adam Langleye9ada862015-05-11 17:20:37 -07001655 movups ($inp),$in0 # load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001656 xorps $in0,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07001657 movups $inout0,($out) # store output
Adam Langleyd9e397b2015-01-22 14:27:53 -08001658 cmp \$2,$len
Adam Langleye9ada862015-05-11 17:20:37 -07001659 jb .Lctr32_done # $len was 1, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001660
1661 movups 0x10($inp),$in1
1662 xorps $in1,$inout1
1663 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001664 je .Lctr32_done # $len was 2, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001665
1666 movups 0x20($inp),$in2
1667 xorps $in2,$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07001668 movups $inout2,0x20($out) # $len was 3, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001669
Adam Langleyd9e397b2015-01-22 14:27:53 -08001670.Lctr32_done:
Robert Sloanab8b8882018-03-26 11:39:51 -07001671 xorps %xmm0,%xmm0 # clear register bank
Adam Langleye9ada862015-05-11 17:20:37 -07001672 xor $key0,$key0
1673 pxor %xmm1,%xmm1
1674 pxor %xmm2,%xmm2
1675 pxor %xmm3,%xmm3
1676 pxor %xmm4,%xmm4
1677 pxor %xmm5,%xmm5
1678___
1679$code.=<<___ if (!$win64);
1680 pxor %xmm6,%xmm6
1681 pxor %xmm7,%xmm7
1682 movaps %xmm0,0x00(%rsp) # clear stack
1683 pxor %xmm8,%xmm8
1684 movaps %xmm0,0x10(%rsp)
1685 pxor %xmm9,%xmm9
1686 movaps %xmm0,0x20(%rsp)
1687 pxor %xmm10,%xmm10
1688 movaps %xmm0,0x30(%rsp)
1689 pxor %xmm11,%xmm11
1690 movaps %xmm0,0x40(%rsp)
1691 pxor %xmm12,%xmm12
1692 movaps %xmm0,0x50(%rsp)
1693 pxor %xmm13,%xmm13
1694 movaps %xmm0,0x60(%rsp)
1695 pxor %xmm14,%xmm14
1696 movaps %xmm0,0x70(%rsp)
1697 pxor %xmm15,%xmm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001698___
1699$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001700 movaps -0xa8($key_),%xmm6
1701 movaps %xmm0,-0xa8($key_) # clear stack
1702 movaps -0x98($key_),%xmm7
1703 movaps %xmm0,-0x98($key_)
1704 movaps -0x88($key_),%xmm8
1705 movaps %xmm0,-0x88($key_)
1706 movaps -0x78($key_),%xmm9
1707 movaps %xmm0,-0x78($key_)
1708 movaps -0x68($key_),%xmm10
1709 movaps %xmm0,-0x68($key_)
1710 movaps -0x58($key_),%xmm11
1711 movaps %xmm0,-0x58($key_)
1712 movaps -0x48($key_),%xmm12
1713 movaps %xmm0,-0x48($key_)
1714 movaps -0x38($key_),%xmm13
1715 movaps %xmm0,-0x38($key_)
1716 movaps -0x28($key_),%xmm14
1717 movaps %xmm0,-0x28($key_)
1718 movaps -0x18($key_),%xmm15
1719 movaps %xmm0,-0x18($key_)
Adam Langleye9ada862015-05-11 17:20:37 -07001720 movaps %xmm0,0x00(%rsp)
1721 movaps %xmm0,0x10(%rsp)
1722 movaps %xmm0,0x20(%rsp)
1723 movaps %xmm0,0x30(%rsp)
1724 movaps %xmm0,0x40(%rsp)
1725 movaps %xmm0,0x50(%rsp)
1726 movaps %xmm0,0x60(%rsp)
1727 movaps %xmm0,0x70(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001728___
1729$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08001730 mov -8($key_),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07001731.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08001732 lea ($key_),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07001733.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001734.Lctr32_epilogue:
1735 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07001736.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08001737.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1738___
1739}
1740
######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
1746{
# Register and stack-frame assignments shared by the XTS code generators
# that follow.
#   @tweak      six tweak registers, %xmm10..%xmm15
#   $twmask     %xmm8
#   $twres      %xmm9
#   $twtmp      same register as $tweak[4] (%xmm14), used as scratch
#   $key2       %r8
#   $ivp/$len_  both name %r9; the generated code loads the tweak through
#               $ivp before it stores the length backup in $len_
#   $frame_size 0x70 bytes, plus 160 on Win64 (ten 16-byte xmm save slots)
my @tweak=map("%xmm$_",(10..15));
# $tweak[4] is a single element; the original one-element slice @tweak[4]
# yields the same value but triggers Perl's "Scalar value @tweak[4] better
# written as $tweak[4]" warning.
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",$tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
my $key_ = "%rbp";	# override so that we can use %r11 as FP
Adam Langleyd9e397b2015-01-22 14:27:53 -08001752
# Emit the entry sequence of aesni_xts_encrypt: declare the global function
# symbol (annotated as taking 6 arguments), copy the incoming %rsp to %r11 so
# that %r11 serves as frame pointer, push %rbp, reserve $frame_size bytes and
# round %rsp down to a 16-byte boundary. The .cfi_* directives track the frame
# for unwinding.
1753$code.=<<___;
1754.globl aesni_xts_encrypt
1755.type aesni_xts_encrypt,\@function,6
1756.align 16
1757aesni_xts_encrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07001758.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08001759 lea (%rsp),%r11 # frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07001760.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001761 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07001762.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001763 sub \$$frame_size,%rsp
1764 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1765___
# Win64 builds only: spill %xmm6..%xmm15 into ten 16-byte slots addressed
# relative to the frame pointer %r11 (-0xa8 .. -0x18), then emit the
# .Lxts_enc_body label. The same slots are reloaded and wiped before return.
# NOTE(review): presumably required because these registers are callee-saved
# in the Win64 ABI, and the label is presumably consumed by the SEH handler
# elsewhere in the file - confirm.
1766$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001767 movaps %xmm6,-0xa8(%r11) # offload everything
1768 movaps %xmm7,-0x98(%r11)
1769 movaps %xmm8,-0x88(%r11)
1770 movaps %xmm9,-0x78(%r11)
1771 movaps %xmm10,-0x68(%r11)
1772 movaps %xmm11,-0x58(%r11)
1773 movaps %xmm12,-0x48(%r11)
1774 movaps %xmm13,-0x38(%r11)
1775 movaps %xmm14,-0x28(%r11)
1776 movaps %xmm15,-0x18(%r11)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001777.Lxts_enc_body:
1778___
# Load the clear-text tweak through $ivp and fetch the round counts stored at
# offset 240 of both key schedules, then emit the code that turns the tweak
# in $inout0 into the initial XTS tweak using key2.
#
# key2 is addressed through $key2 rather than a literal %r8, matching the
# corresponding sequence in aesni_xts_decrypt. $key2 is "%r8", so the
# generated instruction is unchanged.
$code.=<<___;
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	# NOTE(review): aesni_generate1 is defined elsewhere in this file;
	# presumably it emits a one-block "enc" operation on $inout0 with the
	# key at $key2 and $rounds rounds - confirm against its definition.
	&aesni_generate1("enc",$key2,$rounds,$inout0);
# Emit the state setup that follows tweak generation:
#  - load key1's round-0 key into $rndkey0 and keep a copy of $key in $key_;
#  - copy the key1 round count into $rounds and scale $rnds_ by 16;
#  - back up the byte length in $len_ and round $len down to a multiple of 16;
#  - load the key at 16($key,$rnds_) (the last round key) into $rndkey1 and
#    xor $rndkey0 into it, giving round[0]^round[last];
#  - load the .Lxts_magic constant into $twmask, copy the tweak from $inout0
#    into @tweak[5], and seed $twres with pshufd 0x5f of the tweak for the
#    tweak-update sequence emitted next.
1786$code.=<<___;
1787 $movkey ($key),$rndkey0 # zero round key
1788 mov $key,$key_ # backup $key
1789 mov $rnds_,$rounds # backup $rounds
1790 shl \$4,$rnds_
1791 mov $len,$len_ # backup $len
1792 and \$-16,$len
1793
1794 $movkey 16($key,$rnds_),$rndkey1 # last round key
1795
1796 movdqa .Lxts_magic(%rip),$twmask
1797 movdqa $inout0,@tweak[5]
1798 pshufd \$0x5f,$inout0,$twres
1799 pxor $rndkey0,$rndkey1
1800___
# alternative tweak calculation algorithm is based on suggestions
# by Shay Gueron. psrad doesn't conflict with AES-NI instructions
# and should help in the future...
#
# Emit four unrolled tweak steps, one per register $tweak[0]..$tweak[3].
# Each step copies the running tweak $tweak[5] into $tweak[$i] and xors
# $rndkey0 into that copy, then doubles $tweak[5] with paddq and xors in
# the $twmask-masked sign bits taken from $twres.
# NOTE(review): this looks like the usual XTS "multiply the tweak by x"
# update with .Lxts_magic as the reduction constant - confirm.
#
# The counter is lexical so it no longer leaks into a package global, and
# single elements are written $tweak[...] instead of one-element slices;
# the emitted text is identical.
for my $i (0..3) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	$tweak[5],$tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	$tweak[5],$tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,$tweak[$i]
	pxor	$twtmp,$tweak[5]
___
}
# Emit the bulk-processing part of aesni_xts_encrypt:
#  - finish the initial tweak chain: copy the running tweak into @tweak[4],
#    advance @tweak[5] once more, and park round[0]^round[last] ($rndkey1)
#    at 0x60(%rsp);
#  - if fewer than six blocks remain ($len-=6*16 borrows) jump straight to
#    .Lxts_enc_short;
#  - .Lxts_enc_grandloop / .Lxts_enc_loop6: encrypt six 16-byte blocks per
#    iteration. Inputs are xored with tweak^round[0]; tweak^round[last]
#    values are parked at 0x00..0x50(%rsp) and consumed as the aesenclast
#    operands, and the next six tweaks are computed between the aesenc
#    rounds;
#  - .Lxts_enc_short: restore the remaining length and dispatch 1..4
#    leftover blocks to .Lxts_enc_one/_two/_three/_four, or fall through to
#    the five-block path, which zeroes $inout5 and calls _aesni_encrypt6;
#  - the opening of .Lxts_enc_one (load one block, xor in @tweak[0]); the
#    encryption itself is emitted by the aesni_generate1 call that follows
#    this heredoc.
# Each tail path leaves the next unused tweak in @tweak[0] before jumping to
# .Lxts_enc_done.
# NOTE(review): %rax and %r10 appear literally where 64-bit arithmetic on the
# round counter is emitted; this assumes $rounds/$rnds_ are the 32-bit views
# of those registers - confirm against their definitions.
1816$code.=<<___;
1817 movdqa @tweak[5],@tweak[4]
1818 psrad \$31,$twres
1819 paddq @tweak[5],@tweak[5]
1820 pand $twmask,$twres
1821 pxor $rndkey0,@tweak[4]
1822 pxor $twres,@tweak[5]
1823 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1824
1825 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07001826 jc .Lxts_enc_short # if $len-=6*16 borrowed
Adam Langleyd9e397b2015-01-22 14:27:53 -08001827
1828 mov \$16+96,$rounds
1829 lea 32($key_,$rnds_),$key # end of key schedule
1830 sub %r10,%rax # twisted $rounds
1831 $movkey 16($key_),$rndkey1
1832 mov %rax,%r10 # backup twisted $rounds
1833 lea .Lxts_magic(%rip),%r8
1834 jmp .Lxts_enc_grandloop
1835
1836.align 32
1837.Lxts_enc_grandloop:
1838 movdqu `16*0`($inp),$inout0 # load input
1839 movdqa $rndkey0,$twmask
1840 movdqu `16*1`($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001841 pxor @tweak[0],$inout0 # input^=tweak^round[0]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001842 movdqu `16*2`($inp),$inout2
1843 pxor @tweak[1],$inout1
1844 aesenc $rndkey1,$inout0
1845 movdqu `16*3`($inp),$inout3
1846 pxor @tweak[2],$inout2
1847 aesenc $rndkey1,$inout1
1848 movdqu `16*4`($inp),$inout4
1849 pxor @tweak[3],$inout3
1850 aesenc $rndkey1,$inout2
1851 movdqu `16*5`($inp),$inout5
1852 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1853 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1854 pxor @tweak[4],$inout4
1855 aesenc $rndkey1,$inout3
1856 $movkey 32($key_),$rndkey0
1857 lea `16*6`($inp),$inp
1858 pxor $twmask,$inout5
1859
Robert Sloanab8b8882018-03-26 11:39:51 -07001860 pxor $twres,@tweak[0] # calculate tweaks^round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001861 aesenc $rndkey1,$inout4
1862 pxor $twres,@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07001863 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001864 aesenc $rndkey1,$inout5
1865 $movkey 48($key_),$rndkey1
1866 pxor $twres,@tweak[2]
1867
1868 aesenc $rndkey0,$inout0
1869 pxor $twres,@tweak[3]
1870 movdqa @tweak[1],`16*1`(%rsp)
1871 aesenc $rndkey0,$inout1
1872 pxor $twres,@tweak[4]
1873 movdqa @tweak[2],`16*2`(%rsp)
1874 aesenc $rndkey0,$inout2
1875 aesenc $rndkey0,$inout3
1876 pxor $twres,$twmask
1877 movdqa @tweak[4],`16*4`(%rsp)
1878 aesenc $rndkey0,$inout4
1879 aesenc $rndkey0,$inout5
1880 $movkey 64($key_),$rndkey0
1881 movdqa $twmask,`16*5`(%rsp)
1882 pshufd \$0x5f,@tweak[5],$twres
1883 jmp .Lxts_enc_loop6
1884.align 32
1885.Lxts_enc_loop6:
1886 aesenc $rndkey1,$inout0
1887 aesenc $rndkey1,$inout1
1888 aesenc $rndkey1,$inout2
1889 aesenc $rndkey1,$inout3
1890 aesenc $rndkey1,$inout4
1891 aesenc $rndkey1,$inout5
1892 $movkey -64($key,%rax),$rndkey1
1893 add \$32,%rax
1894
1895 aesenc $rndkey0,$inout0
1896 aesenc $rndkey0,$inout1
1897 aesenc $rndkey0,$inout2
1898 aesenc $rndkey0,$inout3
1899 aesenc $rndkey0,$inout4
1900 aesenc $rndkey0,$inout5
1901 $movkey -80($key,%rax),$rndkey0
1902 jnz .Lxts_enc_loop6
1903
Adam Langleye9ada862015-05-11 17:20:37 -07001904 movdqa (%r8),$twmask # start calculating next tweak
Adam Langleyd9e397b2015-01-22 14:27:53 -08001905 movdqa $twres,$twtmp
1906 paddd $twres,$twres
1907 aesenc $rndkey1,$inout0
1908 paddq @tweak[5],@tweak[5]
1909 psrad \$31,$twtmp
1910 aesenc $rndkey1,$inout1
1911 pand $twmask,$twtmp
1912 $movkey ($key_),@tweak[0] # load round[0]
1913 aesenc $rndkey1,$inout2
1914 aesenc $rndkey1,$inout3
1915 aesenc $rndkey1,$inout4
1916 pxor $twtmp,@tweak[5]
1917 movaps @tweak[0],@tweak[1] # copy round[0]
1918 aesenc $rndkey1,$inout5
1919 $movkey -64($key),$rndkey1
1920
1921 movdqa $twres,$twtmp
1922 aesenc $rndkey0,$inout0
1923 paddd $twres,$twres
1924 pxor @tweak[5],@tweak[0]
1925 aesenc $rndkey0,$inout1
1926 psrad \$31,$twtmp
1927 paddq @tweak[5],@tweak[5]
1928 aesenc $rndkey0,$inout2
1929 aesenc $rndkey0,$inout3
1930 pand $twmask,$twtmp
1931 movaps @tweak[1],@tweak[2]
1932 aesenc $rndkey0,$inout4
1933 pxor $twtmp,@tweak[5]
1934 movdqa $twres,$twtmp
1935 aesenc $rndkey0,$inout5
1936 $movkey -48($key),$rndkey0
1937
1938 paddd $twres,$twres
1939 aesenc $rndkey1,$inout0
1940 pxor @tweak[5],@tweak[1]
1941 psrad \$31,$twtmp
1942 aesenc $rndkey1,$inout1
1943 paddq @tweak[5],@tweak[5]
1944 pand $twmask,$twtmp
1945 aesenc $rndkey1,$inout2
1946 aesenc $rndkey1,$inout3
1947 movdqa @tweak[3],`16*3`(%rsp)
1948 pxor $twtmp,@tweak[5]
1949 aesenc $rndkey1,$inout4
1950 movaps @tweak[2],@tweak[3]
1951 movdqa $twres,$twtmp
1952 aesenc $rndkey1,$inout5
1953 $movkey -32($key),$rndkey1
1954
1955 paddd $twres,$twres
1956 aesenc $rndkey0,$inout0
1957 pxor @tweak[5],@tweak[2]
1958 psrad \$31,$twtmp
1959 aesenc $rndkey0,$inout1
1960 paddq @tweak[5],@tweak[5]
1961 pand $twmask,$twtmp
1962 aesenc $rndkey0,$inout2
1963 aesenc $rndkey0,$inout3
1964 aesenc $rndkey0,$inout4
1965 pxor $twtmp,@tweak[5]
1966 movaps @tweak[3],@tweak[4]
1967 aesenc $rndkey0,$inout5
1968
1969 movdqa $twres,$rndkey0
1970 paddd $twres,$twres
1971 aesenc $rndkey1,$inout0
1972 pxor @tweak[5],@tweak[3]
1973 psrad \$31,$rndkey0
1974 aesenc $rndkey1,$inout1
1975 paddq @tweak[5],@tweak[5]
1976 pand $twmask,$rndkey0
1977 aesenc $rndkey1,$inout2
1978 aesenc $rndkey1,$inout3
1979 pxor $rndkey0,@tweak[5]
1980 $movkey ($key_),$rndkey0
1981 aesenc $rndkey1,$inout4
1982 aesenc $rndkey1,$inout5
1983 $movkey 16($key_),$rndkey1
1984
1985 pxor @tweak[5],@tweak[4]
1986 aesenclast `16*0`(%rsp),$inout0
1987 psrad \$31,$twres
1988 paddq @tweak[5],@tweak[5]
1989 aesenclast `16*1`(%rsp),$inout1
1990 aesenclast `16*2`(%rsp),$inout2
1991 pand $twmask,$twres
1992 mov %r10,%rax # restore $rounds
1993 aesenclast `16*3`(%rsp),$inout3
1994 aesenclast `16*4`(%rsp),$inout4
1995 aesenclast `16*5`(%rsp),$inout5
1996 pxor $twres,@tweak[5]
1997
Adam Langleye9ada862015-05-11 17:20:37 -07001998 lea `16*6`($out),$out # $out+=6*16
1999 movups $inout0,`-16*6`($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002000 movups $inout1,`-16*5`($out)
2001 movups $inout2,`-16*4`($out)
2002 movups $inout3,`-16*3`($out)
2003 movups $inout4,`-16*2`($out)
2004 movups $inout5,`-16*1`($out)
2005 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002006 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -08002007
2008 mov \$16+96,$rounds
2009 sub $rnds_,$rounds
2010 mov $key_,$key # restore $key
2011 shr \$4,$rounds # restore original value
2012
2013.Lxts_enc_short:
Adam Langleye9ada862015-05-11 17:20:37 -07002014 # at the point @tweak[0..5] are populated with tweak values
Adam Langleyd9e397b2015-01-22 14:27:53 -08002015 mov $rounds,$rnds_ # backup $rounds
2016 pxor $rndkey0,@tweak[0]
Adam Langleye9ada862015-05-11 17:20:37 -07002017 add \$16*6,$len # restore real remaining $len
2018 jz .Lxts_enc_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002019
2020 pxor $rndkey0,@tweak[1]
2021 cmp \$0x20,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002022 jb .Lxts_enc_one # $len is 1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002023 pxor $rndkey0,@tweak[2]
Adam Langleye9ada862015-05-11 17:20:37 -07002024 je .Lxts_enc_two # $len is 2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002025
2026 pxor $rndkey0,@tweak[3]
2027 cmp \$0x40,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002028 jb .Lxts_enc_three # $len is 3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002029 pxor $rndkey0,@tweak[4]
Adam Langleye9ada862015-05-11 17:20:37 -07002030 je .Lxts_enc_four # $len is 4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002031
Adam Langleye9ada862015-05-11 17:20:37 -07002032 movdqu ($inp),$inout0 # $len is 5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002033 movdqu 16*1($inp),$inout1
2034 movdqu 16*2($inp),$inout2
2035 pxor @tweak[0],$inout0
2036 movdqu 16*3($inp),$inout3
2037 pxor @tweak[1],$inout1
2038 movdqu 16*4($inp),$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07002039 lea 16*5($inp),$inp # $inp+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002040 pxor @tweak[2],$inout2
2041 pxor @tweak[3],$inout3
2042 pxor @tweak[4],$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07002043 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08002044
2045 call _aesni_encrypt6
2046
2047 xorps @tweak[0],$inout0
2048 movdqa @tweak[5],@tweak[0]
2049 xorps @tweak[1],$inout1
2050 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002051 movdqu $inout0,($out) # store 5 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002052 xorps @tweak[3],$inout3
2053 movdqu $inout1,16*1($out)
2054 xorps @tweak[4],$inout4
2055 movdqu $inout2,16*2($out)
2056 movdqu $inout3,16*3($out)
2057 movdqu $inout4,16*4($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002058 lea 16*5($out),$out # $out+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002059 jmp .Lxts_enc_done
2060
2061.align 16
2062.Lxts_enc_one:
2063 movups ($inp),$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07002064 lea 16*1($inp),$inp # inp+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002065 xorps @tweak[0],$inout0
2066___
# Complete the .Lxts_enc_one path and emit the remaining tails:
#  - the aesni_generate1 call emits the block encryption for the single-block
#    case (NOTE(review): presumably operating on $inout0 by default, since no
#    register argument is passed - confirm against its definition);
#  - the heredoc then xors @tweak[0] back in and stores the block, and emits
#    the two-, three- and four-block tails, each calling the matching
#    _aesni_encrypt2/3/4 helper and leaving the next unused tweak in
#    @tweak[0];
#  - .Lxts_enc_done tests $len_ & 15 and returns via .Lxts_enc_ret when the
#    length is a multiple of 16;
#  - otherwise .Lxts_enc_steal runs once per leftover byte, copying an input
#    byte over the previous output block and moving the byte it replaces to
#    the current output position; $out is then rewound, $key/$rounds are
#    restored, and the patched block at -16($out) is loaded and xored with
#    @tweak[0], ready for the encryption emitted by the aesni_generate1 call
#    that follows this heredoc.
2067 &aesni_generate1("enc",$key,$rounds);
2068$code.=<<___;
2069 xorps @tweak[0],$inout0
2070 movdqa @tweak[1],@tweak[0]
Adam Langleye9ada862015-05-11 17:20:37 -07002071 movups $inout0,($out) # store one output block
2072 lea 16*1($out),$out # $out+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002073 jmp .Lxts_enc_done
2074
2075.align 16
2076.Lxts_enc_two:
2077 movups ($inp),$inout0
2078 movups 16($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002079 lea 32($inp),$inp # $inp+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002080 xorps @tweak[0],$inout0
2081 xorps @tweak[1],$inout1
2082
2083 call _aesni_encrypt2
2084
2085 xorps @tweak[0],$inout0
2086 movdqa @tweak[2],@tweak[0]
2087 xorps @tweak[1],$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002088 movups $inout0,($out) # store 2 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002089 movups $inout1,16*1($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002090 lea 16*2($out),$out # $out+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002091 jmp .Lxts_enc_done
2092
2093.align 16
2094.Lxts_enc_three:
2095 movups ($inp),$inout0
2096 movups 16*1($inp),$inout1
2097 movups 16*2($inp),$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002098 lea 16*3($inp),$inp # $inp+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002099 xorps @tweak[0],$inout0
2100 xorps @tweak[1],$inout1
2101 xorps @tweak[2],$inout2
2102
2103 call _aesni_encrypt3
2104
2105 xorps @tweak[0],$inout0
2106 movdqa @tweak[3],@tweak[0]
2107 xorps @tweak[1],$inout1
2108 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002109 movups $inout0,($out) # store 3 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002110 movups $inout1,16*1($out)
2111 movups $inout2,16*2($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002112 lea 16*3($out),$out # $out+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002113 jmp .Lxts_enc_done
2114
2115.align 16
2116.Lxts_enc_four:
2117 movups ($inp),$inout0
2118 movups 16*1($inp),$inout1
2119 movups 16*2($inp),$inout2
2120 xorps @tweak[0],$inout0
2121 movups 16*3($inp),$inout3
Adam Langleye9ada862015-05-11 17:20:37 -07002122 lea 16*4($inp),$inp # $inp+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002123 xorps @tweak[1],$inout1
2124 xorps @tweak[2],$inout2
2125 xorps @tweak[3],$inout3
2126
2127 call _aesni_encrypt4
2128
2129 pxor @tweak[0],$inout0
2130 movdqa @tweak[4],@tweak[0]
2131 pxor @tweak[1],$inout1
2132 pxor @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002133 movdqu $inout0,($out) # store 4 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002134 pxor @tweak[3],$inout3
2135 movdqu $inout1,16*1($out)
2136 movdqu $inout2,16*2($out)
2137 movdqu $inout3,16*3($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002138 lea 16*4($out),$out # $out+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002139 jmp .Lxts_enc_done
2140
2141.align 16
2142.Lxts_enc_done:
Adam Langleye9ada862015-05-11 17:20:37 -07002143 and \$15,$len_ # see if $len%16 is 0
Adam Langleyd9e397b2015-01-22 14:27:53 -08002144 jz .Lxts_enc_ret
2145 mov $len_,$len
2146
2147.Lxts_enc_steal:
2148 movzb ($inp),%eax # borrow $rounds ...
2149 movzb -16($out),%ecx # ... and $key
2150 lea 1($inp),$inp
2151 mov %al,-16($out)
2152 mov %cl,0($out)
2153 lea 1($out),$out
2154 sub \$1,$len
2155 jnz .Lxts_enc_steal
2156
2157 sub $len_,$out # rewind $out
2158 mov $key_,$key # restore $key
2159 mov $rnds_,$rounds # restore $rounds
2160
2161 movups -16($out),$inout0
2162 xorps @tweak[0],$inout0
2163___
# Finish the stolen-block path and start the common exit:
#  - the aesni_generate1 call emits the encryption of the patched block
#    prepared just before it (NOTE(review): presumably on $inout0 - confirm
#    against its definition);
#  - the heredoc xors @tweak[0] back in and stores the result at -16($out);
#  - .Lxts_enc_ret begins wiping the register bank by zeroing %xmm0..%xmm5.
#    %xmm0 stays zero and is used by the stack-clearing code emitted next.
2164 &aesni_generate1("enc",$key,$rounds);
2165$code.=<<___;
2166 xorps @tweak[0],$inout0
2167 movups $inout0,-16($out)
2168
2169.Lxts_enc_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07002170 xorps %xmm0,%xmm0 # clear register bank
2171 pxor %xmm1,%xmm1
2172 pxor %xmm2,%xmm2
2173 pxor %xmm3,%xmm3
2174 pxor %xmm4,%xmm4
2175 pxor %xmm5,%xmm5
2176___
# Non-Win64 builds: zero %xmm6..%xmm15 and overwrite the seven 16-byte frame
# slots at 0x00..0x60(%rsp) with the already-zeroed %xmm0, so that tweak and
# round-key material parked on the stack does not outlive the call. The
# register clears and stack stores are interleaved.
2177$code.=<<___ if (!$win64);
2178 pxor %xmm6,%xmm6
2179 pxor %xmm7,%xmm7
2180 movaps %xmm0,0x00(%rsp) # clear stack
2181 pxor %xmm8,%xmm8
2182 movaps %xmm0,0x10(%rsp)
2183 pxor %xmm9,%xmm9
2184 movaps %xmm0,0x20(%rsp)
2185 pxor %xmm10,%xmm10
2186 movaps %xmm0,0x30(%rsp)
2187 pxor %xmm11,%xmm11
2188 movaps %xmm0,0x40(%rsp)
2189 pxor %xmm12,%xmm12
2190 movaps %xmm0,0x50(%rsp)
2191 pxor %xmm13,%xmm13
2192 movaps %xmm0,0x60(%rsp)
2193 pxor %xmm14,%xmm14
2194 pxor %xmm15,%xmm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002195___
# Win64 builds: reload %xmm6..%xmm15 from the save slots below the frame
# pointer %r11 (-0xa8 .. -0x18), overwriting each slot with the zeroed %xmm0
# right after it is read, then wipe the seven working slots at
# 0x00..0x60(%rsp) as well.
2196$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08002197 movaps -0xa8(%r11),%xmm6
2198 movaps %xmm0,-0xa8(%r11) # clear stack
2199 movaps -0x98(%r11),%xmm7
2200 movaps %xmm0,-0x98(%r11)
2201 movaps -0x88(%r11),%xmm8
2202 movaps %xmm0,-0x88(%r11)
2203 movaps -0x78(%r11),%xmm9
2204 movaps %xmm0,-0x78(%r11)
2205 movaps -0x68(%r11),%xmm10
2206 movaps %xmm0,-0x68(%r11)
2207 movaps -0x58(%r11),%xmm11
2208 movaps %xmm0,-0x58(%r11)
2209 movaps -0x48(%r11),%xmm12
2210 movaps %xmm0,-0x48(%r11)
2211 movaps -0x38(%r11),%xmm13
2212 movaps %xmm0,-0x38(%r11)
2213 movaps -0x28(%r11),%xmm14
2214 movaps %xmm0,-0x28(%r11)
2215 movaps -0x18(%r11),%xmm15
2216 movaps %xmm0,-0x18(%r11)
Adam Langleye9ada862015-05-11 17:20:37 -07002217 movaps %xmm0,0x00(%rsp)
2218 movaps %xmm0,0x10(%rsp)
2219 movaps %xmm0,0x20(%rsp)
2220 movaps %xmm0,0x30(%rsp)
2221 movaps %xmm0,0x40(%rsp)
2222 movaps %xmm0,0x50(%rsp)
2223 movaps %xmm0,0x60(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002224___
# Emit the aesni_xts_encrypt epilogue: reload the saved %rbp from -8(%r11)
# (the slot written by the prologue's push), restore %rsp from the frame
# pointer %r11, and return. The .cfi_* directives mirror the prologue, and
# .size closes the symbol.
# NOTE(review): .Lxts_enc_epilogue is presumably referenced by the Win64
# exception-handling tables elsewhere in the file - confirm.
2225$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08002226 mov -8(%r11),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002227.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002228 lea (%r11),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07002229.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002230.Lxts_enc_epilogue:
2231 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07002232.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002233.size aesni_xts_encrypt,.-aesni_xts_encrypt
2234___
2235
2236$code.=<<___;
2237.globl aesni_xts_decrypt
2238.type aesni_xts_decrypt,\@function,6
2239.align 16
2240aesni_xts_decrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07002241.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08002242 lea (%rsp),%r11 # frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07002243.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08002244 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002245.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002246 sub \$$frame_size,%rsp
2247 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2248___
2249$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08002250 movaps %xmm6,-0xa8(%r11) # offload everything
2251 movaps %xmm7,-0x98(%r11)
2252 movaps %xmm8,-0x88(%r11)
2253 movaps %xmm9,-0x78(%r11)
2254 movaps %xmm10,-0x68(%r11)
2255 movaps %xmm11,-0x58(%r11)
2256 movaps %xmm12,-0x48(%r11)
2257 movaps %xmm13,-0x38(%r11)
2258 movaps %xmm14,-0x28(%r11)
2259 movaps %xmm15,-0x18(%r11)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002260.Lxts_dec_body:
2261___
2262$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08002263 movups ($ivp),$inout0 # load clear-text tweak
2264 mov 240($key2),$rounds # key2->rounds
2265 mov 240($key),$rnds_ # key1->rounds
2266___
2267 # generate the tweak
2268 &aesni_generate1("enc",$key2,$rounds,$inout0);
2269$code.=<<___;
2270 xor %eax,%eax # if ($len%16) len-=16;
2271 test \$15,$len
2272 setnz %al
2273 shl \$4,%rax
2274 sub %rax,$len
2275
2276 $movkey ($key),$rndkey0 # zero round key
2277 mov $key,$key_ # backup $key
2278 mov $rnds_,$rounds # backup $rounds
2279 shl \$4,$rnds_
2280 mov $len,$len_ # backup $len
2281 and \$-16,$len
2282
2283 $movkey 16($key,$rnds_),$rndkey1 # last round key
2284
2285 movdqa .Lxts_magic(%rip),$twmask
2286 movdqa $inout0,@tweak[5]
2287 pshufd \$0x5f,$inout0,$twres
2288 pxor $rndkey0,$rndkey1
2289___
2290 for ($i=0;$i<4;$i++) {
2291 $code.=<<___;
2292 movdqa $twres,$twtmp
2293 paddd $twres,$twres
2294 movdqa @tweak[5],@tweak[$i]
2295 psrad \$31,$twtmp # broadcast upper bits
2296 paddq @tweak[5],@tweak[5]
2297 pand $twmask,$twtmp
2298 pxor $rndkey0,@tweak[$i]
2299 pxor $twtmp,@tweak[5]
2300___
2301 }
2302$code.=<<___;
2303 movdqa @tweak[5],@tweak[4]
2304 psrad \$31,$twres
2305 paddq @tweak[5],@tweak[5]
2306 pand $twmask,$twres
2307 pxor $rndkey0,@tweak[4]
2308 pxor $twres,@tweak[5]
2309 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2310
2311 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002312 jc .Lxts_dec_short # if $len-=6*16 borrowed
Adam Langleyd9e397b2015-01-22 14:27:53 -08002313
2314 mov \$16+96,$rounds
2315 lea 32($key_,$rnds_),$key # end of key schedule
2316 sub %r10,%rax # twisted $rounds
2317 $movkey 16($key_),$rndkey1
2318 mov %rax,%r10 # backup twisted $rounds
2319 lea .Lxts_magic(%rip),%r8
2320 jmp .Lxts_dec_grandloop
2321
2322.align 32
2323.Lxts_dec_grandloop:
2324 movdqu `16*0`($inp),$inout0 # load input
2325 movdqa $rndkey0,$twmask
2326 movdqu `16*1`($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002327 pxor @tweak[0],$inout0 # intput^=tweak^round[0]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002328 movdqu `16*2`($inp),$inout2
2329 pxor @tweak[1],$inout1
2330 aesdec $rndkey1,$inout0
2331 movdqu `16*3`($inp),$inout3
2332 pxor @tweak[2],$inout2
2333 aesdec $rndkey1,$inout1
2334 movdqu `16*4`($inp),$inout4
2335 pxor @tweak[3],$inout3
2336 aesdec $rndkey1,$inout2
2337 movdqu `16*5`($inp),$inout5
2338 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2339 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2340 pxor @tweak[4],$inout4
2341 aesdec $rndkey1,$inout3
2342 $movkey 32($key_),$rndkey0
2343 lea `16*6`($inp),$inp
2344 pxor $twmask,$inout5
2345
Robert Sloanab8b8882018-03-26 11:39:51 -07002346 pxor $twres,@tweak[0] # calculate tweaks^round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002347 aesdec $rndkey1,$inout4
2348 pxor $twres,@tweak[1]
2349 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2350 aesdec $rndkey1,$inout5
2351 $movkey 48($key_),$rndkey1
2352 pxor $twres,@tweak[2]
2353
2354 aesdec $rndkey0,$inout0
2355 pxor $twres,@tweak[3]
2356 movdqa @tweak[1],`16*1`(%rsp)
2357 aesdec $rndkey0,$inout1
2358 pxor $twres,@tweak[4]
2359 movdqa @tweak[2],`16*2`(%rsp)
2360 aesdec $rndkey0,$inout2
2361 aesdec $rndkey0,$inout3
2362 pxor $twres,$twmask
2363 movdqa @tweak[4],`16*4`(%rsp)
2364 aesdec $rndkey0,$inout4
2365 aesdec $rndkey0,$inout5
2366 $movkey 64($key_),$rndkey0
2367 movdqa $twmask,`16*5`(%rsp)
2368 pshufd \$0x5f,@tweak[5],$twres
2369 jmp .Lxts_dec_loop6
2370.align 32
2371.Lxts_dec_loop6:
2372 aesdec $rndkey1,$inout0
2373 aesdec $rndkey1,$inout1
2374 aesdec $rndkey1,$inout2
2375 aesdec $rndkey1,$inout3
2376 aesdec $rndkey1,$inout4
2377 aesdec $rndkey1,$inout5
2378 $movkey -64($key,%rax),$rndkey1
2379 add \$32,%rax
2380
2381 aesdec $rndkey0,$inout0
2382 aesdec $rndkey0,$inout1
2383 aesdec $rndkey0,$inout2
2384 aesdec $rndkey0,$inout3
2385 aesdec $rndkey0,$inout4
2386 aesdec $rndkey0,$inout5
2387 $movkey -80($key,%rax),$rndkey0
2388 jnz .Lxts_dec_loop6
2389
Adam Langleye9ada862015-05-11 17:20:37 -07002390 movdqa (%r8),$twmask # start calculating next tweak
Adam Langleyd9e397b2015-01-22 14:27:53 -08002391 movdqa $twres,$twtmp
2392 paddd $twres,$twres
2393 aesdec $rndkey1,$inout0
2394 paddq @tweak[5],@tweak[5]
2395 psrad \$31,$twtmp
2396 aesdec $rndkey1,$inout1
2397 pand $twmask,$twtmp
2398 $movkey ($key_),@tweak[0] # load round[0]
2399 aesdec $rndkey1,$inout2
2400 aesdec $rndkey1,$inout3
2401 aesdec $rndkey1,$inout4
2402 pxor $twtmp,@tweak[5]
2403 movaps @tweak[0],@tweak[1] # copy round[0]
2404 aesdec $rndkey1,$inout5
2405 $movkey -64($key),$rndkey1
2406
2407 movdqa $twres,$twtmp
2408 aesdec $rndkey0,$inout0
2409 paddd $twres,$twres
2410 pxor @tweak[5],@tweak[0]
2411 aesdec $rndkey0,$inout1
2412 psrad \$31,$twtmp
2413 paddq @tweak[5],@tweak[5]
2414 aesdec $rndkey0,$inout2
2415 aesdec $rndkey0,$inout3
2416 pand $twmask,$twtmp
2417 movaps @tweak[1],@tweak[2]
2418 aesdec $rndkey0,$inout4
2419 pxor $twtmp,@tweak[5]
2420 movdqa $twres,$twtmp
2421 aesdec $rndkey0,$inout5
2422 $movkey -48($key),$rndkey0
2423
2424 paddd $twres,$twres
2425 aesdec $rndkey1,$inout0
2426 pxor @tweak[5],@tweak[1]
2427 psrad \$31,$twtmp
2428 aesdec $rndkey1,$inout1
2429 paddq @tweak[5],@tweak[5]
2430 pand $twmask,$twtmp
2431 aesdec $rndkey1,$inout2
2432 aesdec $rndkey1,$inout3
2433 movdqa @tweak[3],`16*3`(%rsp)
2434 pxor $twtmp,@tweak[5]
2435 aesdec $rndkey1,$inout4
2436 movaps @tweak[2],@tweak[3]
2437 movdqa $twres,$twtmp
2438 aesdec $rndkey1,$inout5
2439 $movkey -32($key),$rndkey1
2440
2441 paddd $twres,$twres
2442 aesdec $rndkey0,$inout0
2443 pxor @tweak[5],@tweak[2]
2444 psrad \$31,$twtmp
2445 aesdec $rndkey0,$inout1
2446 paddq @tweak[5],@tweak[5]
2447 pand $twmask,$twtmp
2448 aesdec $rndkey0,$inout2
2449 aesdec $rndkey0,$inout3
2450 aesdec $rndkey0,$inout4
2451 pxor $twtmp,@tweak[5]
2452 movaps @tweak[3],@tweak[4]
2453 aesdec $rndkey0,$inout5
2454
2455 movdqa $twres,$rndkey0
2456 paddd $twres,$twres
2457 aesdec $rndkey1,$inout0
2458 pxor @tweak[5],@tweak[3]
2459 psrad \$31,$rndkey0
2460 aesdec $rndkey1,$inout1
2461 paddq @tweak[5],@tweak[5]
2462 pand $twmask,$rndkey0
2463 aesdec $rndkey1,$inout2
2464 aesdec $rndkey1,$inout3
2465 pxor $rndkey0,@tweak[5]
2466 $movkey ($key_),$rndkey0
2467 aesdec $rndkey1,$inout4
2468 aesdec $rndkey1,$inout5
2469 $movkey 16($key_),$rndkey1
2470
2471 pxor @tweak[5],@tweak[4]
2472 aesdeclast `16*0`(%rsp),$inout0
2473 psrad \$31,$twres
2474 paddq @tweak[5],@tweak[5]
2475 aesdeclast `16*1`(%rsp),$inout1
2476 aesdeclast `16*2`(%rsp),$inout2
2477 pand $twmask,$twres
2478 mov %r10,%rax # restore $rounds
2479 aesdeclast `16*3`(%rsp),$inout3
2480 aesdeclast `16*4`(%rsp),$inout4
2481 aesdeclast `16*5`(%rsp),$inout5
2482 pxor $twres,@tweak[5]
2483
Adam Langleye9ada862015-05-11 17:20:37 -07002484 lea `16*6`($out),$out # $out+=6*16
2485 movups $inout0,`-16*6`($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002486 movups $inout1,`-16*5`($out)
2487 movups $inout2,`-16*4`($out)
2488 movups $inout3,`-16*3`($out)
2489 movups $inout4,`-16*2`($out)
2490 movups $inout5,`-16*1`($out)
2491 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002492 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -08002493
2494 mov \$16+96,$rounds
2495 sub $rnds_,$rounds
2496 mov $key_,$key # restore $key
2497 shr \$4,$rounds # restore original value
2498
2499.Lxts_dec_short:
Adam Langleye9ada862015-05-11 17:20:37 -07002500 # at the point @tweak[0..5] are populated with tweak values
Adam Langleyd9e397b2015-01-22 14:27:53 -08002501 mov $rounds,$rnds_ # backup $rounds
2502 pxor $rndkey0,@tweak[0]
2503 pxor $rndkey0,@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07002504 add \$16*6,$len # restore real remaining $len
2505 jz .Lxts_dec_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002506
2507 pxor $rndkey0,@tweak[2]
2508 cmp \$0x20,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002509 jb .Lxts_dec_one # $len is 1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002510 pxor $rndkey0,@tweak[3]
Adam Langleye9ada862015-05-11 17:20:37 -07002511 je .Lxts_dec_two # $len is 2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002512
2513 pxor $rndkey0,@tweak[4]
2514 cmp \$0x40,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002515 jb .Lxts_dec_three # $len is 3*16
2516 je .Lxts_dec_four # $len is 4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002517
Adam Langleye9ada862015-05-11 17:20:37 -07002518 movdqu ($inp),$inout0 # $len is 5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002519 movdqu 16*1($inp),$inout1
2520 movdqu 16*2($inp),$inout2
2521 pxor @tweak[0],$inout0
2522 movdqu 16*3($inp),$inout3
2523 pxor @tweak[1],$inout1
2524 movdqu 16*4($inp),$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07002525 lea 16*5($inp),$inp # $inp+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002526 pxor @tweak[2],$inout2
2527 pxor @tweak[3],$inout3
2528 pxor @tweak[4],$inout4
2529
2530 call _aesni_decrypt6
2531
2532 xorps @tweak[0],$inout0
2533 xorps @tweak[1],$inout1
2534 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002535 movdqu $inout0,($out) # store 5 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002536 xorps @tweak[3],$inout3
2537 movdqu $inout1,16*1($out)
2538 xorps @tweak[4],$inout4
2539 movdqu $inout2,16*2($out)
2540 pxor $twtmp,$twtmp
2541 movdqu $inout3,16*3($out)
2542 pcmpgtd @tweak[5],$twtmp
2543 movdqu $inout4,16*4($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002544 lea 16*5($out),$out # $out+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002545 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2546 and \$15,$len_
2547 jz .Lxts_dec_ret
2548
2549 movdqa @tweak[5],@tweak[0]
2550 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2551 pand $twmask,@tweak[1] # isolate carry and residue
2552 pxor @tweak[5],@tweak[1]
2553 jmp .Lxts_dec_done2
2554
2555.align 16
2556.Lxts_dec_one:
2557 movups ($inp),$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07002558 lea 16*1($inp),$inp # $inp+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002559 xorps @tweak[0],$inout0
2560___
2561 &aesni_generate1("dec",$key,$rounds);
2562$code.=<<___;
2563 xorps @tweak[0],$inout0
2564 movdqa @tweak[1],@tweak[0]
Adam Langleye9ada862015-05-11 17:20:37 -07002565 movups $inout0,($out) # store one output block
Adam Langleyd9e397b2015-01-22 14:27:53 -08002566 movdqa @tweak[2],@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07002567 lea 16*1($out),$out # $out+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002568 jmp .Lxts_dec_done
2569
2570.align 16
2571.Lxts_dec_two:
2572 movups ($inp),$inout0
2573 movups 16($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002574 lea 32($inp),$inp # $inp+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002575 xorps @tweak[0],$inout0
2576 xorps @tweak[1],$inout1
2577
2578 call _aesni_decrypt2
2579
2580 xorps @tweak[0],$inout0
2581 movdqa @tweak[2],@tweak[0]
2582 xorps @tweak[1],$inout1
2583 movdqa @tweak[3],@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07002584 movups $inout0,($out) # store 2 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002585 movups $inout1,16*1($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002586 lea 16*2($out),$out # $out+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002587 jmp .Lxts_dec_done
2588
2589.align 16
2590.Lxts_dec_three:
2591 movups ($inp),$inout0
2592 movups 16*1($inp),$inout1
2593 movups 16*2($inp),$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002594 lea 16*3($inp),$inp # $inp+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002595 xorps @tweak[0],$inout0
2596 xorps @tweak[1],$inout1
2597 xorps @tweak[2],$inout2
2598
2599 call _aesni_decrypt3
2600
2601 xorps @tweak[0],$inout0
2602 movdqa @tweak[3],@tweak[0]
2603 xorps @tweak[1],$inout1
2604 movdqa @tweak[4],@tweak[1]
2605 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002606 movups $inout0,($out) # store 3 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002607 movups $inout1,16*1($out)
2608 movups $inout2,16*2($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002609 lea 16*3($out),$out # $out+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002610 jmp .Lxts_dec_done
2611
2612.align 16
2613.Lxts_dec_four:
2614 movups ($inp),$inout0
2615 movups 16*1($inp),$inout1
2616 movups 16*2($inp),$inout2
2617 xorps @tweak[0],$inout0
2618 movups 16*3($inp),$inout3
Adam Langleye9ada862015-05-11 17:20:37 -07002619 lea 16*4($inp),$inp # $inp+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002620 xorps @tweak[1],$inout1
2621 xorps @tweak[2],$inout2
2622 xorps @tweak[3],$inout3
2623
2624 call _aesni_decrypt4
2625
2626 pxor @tweak[0],$inout0
2627 movdqa @tweak[4],@tweak[0]
2628 pxor @tweak[1],$inout1
2629 movdqa @tweak[5],@tweak[1]
2630 pxor @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002631 movdqu $inout0,($out) # store 4 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002632 pxor @tweak[3],$inout3
2633 movdqu $inout1,16*1($out)
2634 movdqu $inout2,16*2($out)
2635 movdqu $inout3,16*3($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002636 lea 16*4($out),$out # $out+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002637 jmp .Lxts_dec_done
2638
2639.align 16
2640.Lxts_dec_done:
Adam Langleye9ada862015-05-11 17:20:37 -07002641 and \$15,$len_ # see if $len%16 is 0
Adam Langleyd9e397b2015-01-22 14:27:53 -08002642 jz .Lxts_dec_ret
2643.Lxts_dec_done2:
2644 mov $len_,$len
2645 mov $key_,$key # restore $key
2646 mov $rnds_,$rounds # restore $rounds
2647
2648 movups ($inp),$inout0
2649 xorps @tweak[1],$inout0
2650___
2651 &aesni_generate1("dec",$key,$rounds);
2652$code.=<<___;
2653 xorps @tweak[1],$inout0
2654 movups $inout0,($out)
2655
2656.Lxts_dec_steal:
2657 movzb 16($inp),%eax # borrow $rounds ...
2658 movzb ($out),%ecx # ... and $key
2659 lea 1($inp),$inp
2660 mov %al,($out)
2661 mov %cl,16($out)
2662 lea 1($out),$out
2663 sub \$1,$len
2664 jnz .Lxts_dec_steal
2665
2666 sub $len_,$out # rewind $out
2667 mov $key_,$key # restore $key
2668 mov $rnds_,$rounds # restore $rounds
2669
2670 movups ($out),$inout0
2671 xorps @tweak[0],$inout0
2672___
2673 &aesni_generate1("dec",$key,$rounds);
2674$code.=<<___;
2675 xorps @tweak[0],$inout0
2676 movups $inout0,($out)
2677
2678.Lxts_dec_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07002679 xorps %xmm0,%xmm0 # clear register bank
2680 pxor %xmm1,%xmm1
2681 pxor %xmm2,%xmm2
2682 pxor %xmm3,%xmm3
2683 pxor %xmm4,%xmm4
2684 pxor %xmm5,%xmm5
2685___
2686$code.=<<___ if (!$win64);
2687 pxor %xmm6,%xmm6
2688 pxor %xmm7,%xmm7
2689 movaps %xmm0,0x00(%rsp) # clear stack
2690 pxor %xmm8,%xmm8
2691 movaps %xmm0,0x10(%rsp)
2692 pxor %xmm9,%xmm9
2693 movaps %xmm0,0x20(%rsp)
2694 pxor %xmm10,%xmm10
2695 movaps %xmm0,0x30(%rsp)
2696 pxor %xmm11,%xmm11
2697 movaps %xmm0,0x40(%rsp)
2698 pxor %xmm12,%xmm12
2699 movaps %xmm0,0x50(%rsp)
2700 pxor %xmm13,%xmm13
2701 movaps %xmm0,0x60(%rsp)
2702 pxor %xmm14,%xmm14
2703 pxor %xmm15,%xmm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002704___
2705$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08002706 movaps -0xa8(%r11),%xmm6
2707 movaps %xmm0,-0xa8(%r11) # clear stack
2708 movaps -0x98(%r11),%xmm7
2709 movaps %xmm0,-0x98(%r11)
2710 movaps -0x88(%r11),%xmm8
2711 movaps %xmm0,-0x88(%r11)
2712 movaps -0x78(%r11),%xmm9
2713 movaps %xmm0,-0x78(%r11)
2714 movaps -0x68(%r11),%xmm10
2715 movaps %xmm0,-0x68(%r11)
2716 movaps -0x58(%r11),%xmm11
2717 movaps %xmm0,-0x58(%r11)
2718 movaps -0x48(%r11),%xmm12
2719 movaps %xmm0,-0x48(%r11)
2720 movaps -0x38(%r11),%xmm13
2721 movaps %xmm0,-0x38(%r11)
2722 movaps -0x28(%r11),%xmm14
2723 movaps %xmm0,-0x28(%r11)
2724 movaps -0x18(%r11),%xmm15
2725 movaps %xmm0,-0x18(%r11)
Adam Langleye9ada862015-05-11 17:20:37 -07002726 movaps %xmm0,0x00(%rsp)
2727 movaps %xmm0,0x10(%rsp)
2728 movaps %xmm0,0x20(%rsp)
2729 movaps %xmm0,0x30(%rsp)
2730 movaps %xmm0,0x40(%rsp)
2731 movaps %xmm0,0x50(%rsp)
2732 movaps %xmm0,0x60(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002733___
2734$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08002735 mov -8(%r11),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002736.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002737 lea (%r11),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07002738.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002739.Lxts_dec_epilogue:
2740 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07002741.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002742.size aesni_xts_decrypt,.-aesni_xts_decrypt
2743___
Robert Sloana94fe052017-02-21 08:49:28 -08002744}
2745
2746######################################################################
2747# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2748# const AES_KEY *key, unsigned int start_block_num,
2749# unsigned char offset_i[16], const unsigned char L_[][16],
2750# unsigned char checksum[16]);
2751#
2752{
2753my @offset=map("%xmm$_",(10..15));
2754my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2755my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
2756my ($L_p,$checksum_p) = ("%rbx","%rbp");
2757my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
2758my $seventh_arg = $win64 ? 56 : 8;
2759my $blocks = $len;
2760
2761$code.=<<___;
# aesni_ocb_encrypt -- OCB bulk-encryption entry point; the C prototype is
# given in the comment block that opens this section.
# Arguments as wired up by the register assignments above this heredoc:
#   start_block_num in %r8, offset_i pointer in %r9 (5th and 6th arguments);
#   the L_ table pointer and the checksum pointer (7th and 8th arguments)
#   are read from the caller's stack into %rbx and %rbp.
# Flow:
#   1. if start_block_num is even, one block is processed on its own so that
#      the bulk code always starts on an odd block number;
#   2. the grand loop handles six blocks per iteration;
#   3. a tail handles the remaining 1..5 blocks.
# On exit the running checksum and the last offset are stored back through
# their pointers, xmm0-xmm5 are cleared, and xmm6-xmm15 are either cleared
# (non-Win64) or restored from the stack save area (Win64).
# NOTE(review): there is no explicit check for blocks == 0 on entry;
# presumably callers never pass zero -- confirm against the C callers.
2762 .globl aesni_ocb_encrypt
2763 .type aesni_ocb_encrypt,\@function,6
2764 .align 32
2765aesni_ocb_encrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07002766.cfi_startproc
# keep the entry stack pointer in %rax; the stack arguments are addressed
# relative to it further down
Robert Sloana94fe052017-02-21 08:49:28 -08002767 lea (%rsp),%rax
2768 push %rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07002769.cfi_push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002770 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002771.cfi_push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002772 push %r12
Robert Sloanab8b8882018-03-26 11:39:51 -07002773.cfi_push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08002774 push %r13
Robert Sloanab8b8882018-03-26 11:39:51 -07002775.cfi_push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08002776 push %r14
Robert Sloanab8b8882018-03-26 11:39:51 -07002777.cfi_push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08002778___
# Win64 only: reserve 0xa0 bytes and save xmm6-xmm15 there
2779$code.=<<___ if ($win64);
2780 lea -0xa0(%rsp),%rsp
2781 movaps %xmm6,0x00(%rsp) # offload everything
2782 movaps %xmm7,0x10(%rsp)
2783 movaps %xmm8,0x20(%rsp)
2784 movaps %xmm9,0x30(%rsp)
2785 movaps %xmm10,0x40(%rsp)
2786 movaps %xmm11,0x50(%rsp)
2787 movaps %xmm12,0x60(%rsp)
2788 movaps %xmm13,0x70(%rsp)
2789 movaps %xmm14,0x80(%rsp)
2790 movaps %xmm15,0x90(%rsp)
2791.Locb_enc_body:
2792___
2793$code.=<<___;
2794 mov $seventh_arg(%rax),$L_p # 7th argument
2795 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2796
# the word at key+240 is scaled by 16 and used as a byte offset to the last
# round key (presumably it is the round count of the key schedule -- confirm)
2797 mov 240($key),$rnds_
2798 mov $key,$key_
2799 shl \$4,$rnds_
2800 $movkey ($key),$rndkey0l # round[0]
2801 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2802
2803 movdqu ($offset_p),@offset[5] # load last offset_i
2804 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2805 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2806
# Build the "twisted" round counter: 16+32 minus the scaled round offset.
# The helpers add it to the key pointer set up here (key base + scaled
# offset + 32) and step it by 32 until it reaches zero.
# %rax stops holding the entry stack pointer from this point on; the
# epilogue recomputes it from %rsp.
2807 mov \$16+32,$rounds
2808 lea 32($key_,$rnds_),$key
2809 $movkey 16($key_),$rndkey1 # round[1]
2810 sub %r10,%rax # twisted $rounds
2811 mov %rax,%r10 # backup twisted $rounds
2812
2813 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2814 movdqu ($checksum_p),$checksum # load checksum
2815
2816 test \$1,$block_num # is first block number odd?
2817 jnz .Locb_enc_odd
2818
# even start: handle one block with the L entry selected by the count of
# trailing zero bits of the block number, then bump the block number so it
# is odd for the bulk code
2819 bsf $block_num,$i1
2820 add \$1,$block_num
2821 shl \$4,$i1
2822 movdqu ($L_p,$i1),$inout5 # borrow
2823 movdqu ($inp),$inout0
2824 lea 16($inp),$inp
2825
2826 call __ocb_encrypt1
2827
2828 movdqa $inout5,@offset[5]
2829 movups $inout0,($out)
2830 lea 16($out),$out
2831 sub \$1,$blocks
2832 jz .Locb_enc_done
2833
2834.Locb_enc_odd:
# precompute L-table byte offsets for the three even-numbered blocks of the
# next group of six, and advance the block number past that group
2835 lea 1($block_num),$i1 # even-numbered blocks
2836 lea 3($block_num),$i3
2837 lea 5($block_num),$i5
2838 lea 6($block_num),$block_num
2839 bsf $i1,$i1 # ntz(block)
2840 bsf $i3,$i3
2841 bsf $i5,$i5
2842 shl \$4,$i1 # ntz(block) -> table offset
2843 shl \$4,$i3
2844 shl \$4,$i5
2845
# the block count is kept biased by -6 while looping; a borrow means fewer
# than six blocks remain
2846 sub \$6,$blocks
2847 jc .Locb_enc_short
2848 jmp .Locb_enc_grandloop
2849
2850.align 32
2851.Locb_enc_grandloop:
2852 movdqu `16*0`($inp),$inout0 # load input
2853 movdqu `16*1`($inp),$inout1
2854 movdqu `16*2`($inp),$inout2
2855 movdqu `16*3`($inp),$inout3
2856 movdqu `16*4`($inp),$inout4
2857 movdqu `16*5`($inp),$inout5
2858 lea `16*6`($inp),$inp
2859
2860 call __ocb_encrypt6
2861
2862 movups $inout0,`16*0`($out) # store output
2863 movups $inout1,`16*1`($out)
2864 movups $inout2,`16*2`($out)
2865 movups $inout3,`16*3`($out)
2866 movups $inout4,`16*4`($out)
2867 movups $inout5,`16*5`($out)
2868 lea `16*6`($out),$out
2869 sub \$6,$blocks
2870 jnc .Locb_enc_grandloop
2871
2872.Locb_enc_short:
# undo the -6 bias; zero means the input was a whole number of groups
2873 add \$6,$blocks
2874 jz .Locb_enc_done
2875
# dispatch on the 1..5 remaining blocks, loading inputs along the way
2876 movdqu `16*0`($inp),$inout0
2877 cmp \$2,$blocks
2878 jb .Locb_enc_one
2879 movdqu `16*1`($inp),$inout1
2880 je .Locb_enc_two
2881
2882 movdqu `16*2`($inp),$inout2
2883 cmp \$4,$blocks
2884 jb .Locb_enc_three
2885 movdqu `16*3`($inp),$inout3
2886 je .Locb_enc_four
2887
# five blocks: the sixth lane is zeroed (a zero block leaves the checksum
# unchanged) and run through the six-way helper; its output is not stored
# and the fifth offset becomes the saved one
2888 movdqu `16*4`($inp),$inout4
2889 pxor $inout5,$inout5
2890
2891 call __ocb_encrypt6
2892
2893 movdqa @offset[4],@offset[5]
2894 movups $inout0,`16*0`($out)
2895 movups $inout1,`16*1`($out)
2896 movups $inout2,`16*2`($out)
2897 movups $inout3,`16*3`($out)
2898 movups $inout4,`16*4`($out)
2899
2900 jmp .Locb_enc_done
2901
2902.align 16
2903.Locb_enc_one:
# one block: the block number is odd here, so L_0 is the table entry used
2904 movdqa @offset[0],$inout5 # borrow
2905
2906 call __ocb_encrypt1
2907
2908 movdqa $inout5,@offset[5]
2909 movups $inout0,`16*0`($out)
2910 jmp .Locb_enc_done
2911
2912.align 16
2913.Locb_enc_two:
# two blocks: unused lanes are zeroed and the four-way helper is used; the
# offset of the last real block is kept as the saved offset
2914 pxor $inout2,$inout2
2915 pxor $inout3,$inout3
2916
2917 call __ocb_encrypt4
2918
2919 movdqa @offset[1],@offset[5]
2920 movups $inout0,`16*0`($out)
2921 movups $inout1,`16*1`($out)
2922
2923 jmp .Locb_enc_done
2924
2925.align 16
2926.Locb_enc_three:
2927 pxor $inout3,$inout3
2928
2929 call __ocb_encrypt4
2930
2931 movdqa @offset[2],@offset[5]
2932 movups $inout0,`16*0`($out)
2933 movups $inout1,`16*1`($out)
2934 movups $inout2,`16*2`($out)
2935
2936 jmp .Locb_enc_done
2937
2938.align 16
2939.Locb_enc_four:
2940 call __ocb_encrypt4
2941
2942 movdqa @offset[3],@offset[5]
2943 movups $inout0,`16*0`($out)
2944 movups $inout1,`16*1`($out)
2945 movups $inout2,`16*2`($out)
2946 movups $inout3,`16*3`($out)
2947
2948.Locb_enc_done:
# the saved offset is still xored with the last round key (see the setup
# above), so strip it before handing the offset back to the caller
2949 pxor $rndkey0,@offset[5] # "remove" round[last]
2950 movdqu $checksum,($checksum_p) # store checksum
2951 movdqu @offset[5],($offset_p) # store last offset_i
2952
2953 xorps %xmm0,%xmm0 # clear register bank
2954 pxor %xmm1,%xmm1
2955 pxor %xmm2,%xmm2
2956 pxor %xmm3,%xmm3
2957 pxor %xmm4,%xmm4
2958 pxor %xmm5,%xmm5
2959___
2960$code.=<<___ if (!$win64);
2961 pxor %xmm6,%xmm6
2962 pxor %xmm7,%xmm7
2963 pxor %xmm8,%xmm8
2964 pxor %xmm9,%xmm9
2965 pxor %xmm10,%xmm10
2966 pxor %xmm11,%xmm11
2967 pxor %xmm12,%xmm12
2968 pxor %xmm13,%xmm13
2969 pxor %xmm14,%xmm14
2970 pxor %xmm15,%xmm15
# 0x28 = the five 8-byte pushes made in the prologue
2971 lea 0x28(%rsp),%rax
Robert Sloanab8b8882018-03-26 11:39:51 -07002972.cfi_def_cfa %rax,8
Robert Sloana94fe052017-02-21 08:49:28 -08002973___
# Win64 only: restore xmm6-xmm15 and overwrite each save slot with zero
2974$code.=<<___ if ($win64);
2975 movaps 0x00(%rsp),%xmm6
2976 movaps %xmm0,0x00(%rsp) # clear stack
2977 movaps 0x10(%rsp),%xmm7
2978 movaps %xmm0,0x10(%rsp)
2979 movaps 0x20(%rsp),%xmm8
2980 movaps %xmm0,0x20(%rsp)
2981 movaps 0x30(%rsp),%xmm9
2982 movaps %xmm0,0x30(%rsp)
2983 movaps 0x40(%rsp),%xmm10
2984 movaps %xmm0,0x40(%rsp)
2985 movaps 0x50(%rsp),%xmm11
2986 movaps %xmm0,0x50(%rsp)
2987 movaps 0x60(%rsp),%xmm12
2988 movaps %xmm0,0x60(%rsp)
2989 movaps 0x70(%rsp),%xmm13
2990 movaps %xmm0,0x70(%rsp)
2991 movaps 0x80(%rsp),%xmm14
2992 movaps %xmm0,0x80(%rsp)
2993 movaps 0x90(%rsp),%xmm15
2994 movaps %xmm0,0x90(%rsp)
# 0xa0 xmm save area plus the 0x28 bytes of pushed registers
2995 lea 0xa0+0x28(%rsp),%rax
2996.Locb_enc_pop:
2997___
2998$code.=<<___;
# reload the callee-saved registers pushed in the prologue, addressed
# downwards from the recomputed entry stack pointer in %rax
2999 mov -40(%rax),%r14
Robert Sloanab8b8882018-03-26 11:39:51 -07003000.cfi_restore %r14
Robert Sloana94fe052017-02-21 08:49:28 -08003001 mov -32(%rax),%r13
Robert Sloanab8b8882018-03-26 11:39:51 -07003002.cfi_restore %r13
Robert Sloana94fe052017-02-21 08:49:28 -08003003 mov -24(%rax),%r12
Robert Sloanab8b8882018-03-26 11:39:51 -07003004.cfi_restore %r12
Robert Sloana94fe052017-02-21 08:49:28 -08003005 mov -16(%rax),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003006.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08003007 mov -8(%rax),%rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07003008.cfi_restore %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08003009 lea (%rax),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07003010.cfi_def_cfa_register %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08003011.Locb_enc_epilogue:
3012 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07003013.cfi_endproc
Robert Sloana94fe052017-02-21 08:49:28 -08003014.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
3015
# __ocb_encrypt6 -- internal helper: OCB-encrypts the six blocks held in
# inout0..inout5 in place.
# Expected on entry (as set up by aesni_ocb_encrypt):
#   offset[5]  previous offset xored with the last round key
#   offset[0]  L_0
#   i1/i3/i5   L-table byte offsets for the three even-numbered blocks
#   rndkey0l   round[0] xored with round[last]; rndkey1 = round[1]
#   %rax       the twisted round counter (a copy is kept in %r10)
# Each input block is xored into the checksum before it is whitened.
# The six offsets form a chain: each one is the previous offset xored with
# L_0 (odd block) or with the table entry selected by i1/i3/i5 (even block).
# The offsets, re-keyed to round[last], are fed to aesenclast as the final
# round key, so every output block comes out already xored with its offset.
# Side work done between the AES rounds: i1/i3/i5 and the block number are
# advanced for the next group, offset[0] is reloaded with L_0, rndkey1 is
# reloaded with round[1] and %rax is restored from %r10.
3016.type __ocb_encrypt6,\@abi-omnipotent
3017.align 32
3018__ocb_encrypt6:
3019 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3020 movdqu ($L_p,$i1),@offset[1]
3021 movdqa @offset[0],@offset[2]
3022 movdqu ($L_p,$i3),@offset[3]
3023 movdqa @offset[0],@offset[4]
3024 pxor @offset[5],@offset[0]
3025 movdqu ($L_p,$i5),@offset[5]
3026 pxor @offset[0],@offset[1]
3027 pxor $inout0,$checksum # accumulate checksum
3028 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3029 pxor @offset[1],@offset[2]
3030 pxor $inout1,$checksum
3031 pxor @offset[1],$inout1
3032 pxor @offset[2],@offset[3]
3033 pxor $inout2,$checksum
3034 pxor @offset[2],$inout2
3035 pxor @offset[3],@offset[4]
3036 pxor $inout3,$checksum
3037 pxor @offset[3],$inout3
3038 pxor @offset[4],@offset[5]
3039 pxor $inout4,$checksum
3040 pxor @offset[4],$inout4
3041 pxor $inout5,$checksum
3042 pxor @offset[5],$inout5
3043 $movkey 32($key_),$rndkey0
3044
# start computing the table indices for the NEXT group of six; the shifts
# that finish this are spread out below (presumably for instruction
# scheduling -- the shift for i5 only happens after the round loop)
3045 lea 1($block_num),$i1 # even-numbered blocks
3046 lea 3($block_num),$i3
3047 lea 5($block_num),$i5
3048 add \$6,$block_num
3049 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3050 bsf $i1,$i1 # ntz(block)
3051 bsf $i3,$i3
3052 bsf $i5,$i5
3053
# round 1 on all lanes, interleaved with switching the remaining offsets
# from the round[0] form to the round[last] form
3054 aesenc $rndkey1,$inout0
3055 aesenc $rndkey1,$inout1
3056 aesenc $rndkey1,$inout2
3057 aesenc $rndkey1,$inout3
3058 pxor $rndkey0l,@offset[1]
3059 pxor $rndkey0l,@offset[2]
3060 aesenc $rndkey1,$inout4
3061 pxor $rndkey0l,@offset[3]
3062 pxor $rndkey0l,@offset[4]
3063 aesenc $rndkey1,$inout5
3064 $movkey 48($key_),$rndkey1
3065 pxor $rndkey0l,@offset[5]
3066
3067 aesenc $rndkey0,$inout0
3068 aesenc $rndkey0,$inout1
3069 aesenc $rndkey0,$inout2
3070 aesenc $rndkey0,$inout3
3071 aesenc $rndkey0,$inout4
3072 aesenc $rndkey0,$inout5
3073 $movkey 64($key_),$rndkey0
3074 shl \$4,$i1 # ntz(block) -> table offset
3075 shl \$4,$i3
3076 jmp .Locb_enc_loop6
3077
3078.align 32
3079.Locb_enc_loop6:
# two rounds per iteration; %rax steps by 32 and the loop ends when the add
# leaves it at zero (the key loads in between do not touch the flags)
3080 aesenc $rndkey1,$inout0
3081 aesenc $rndkey1,$inout1
3082 aesenc $rndkey1,$inout2
3083 aesenc $rndkey1,$inout3
3084 aesenc $rndkey1,$inout4
3085 aesenc $rndkey1,$inout5
3086 $movkey ($key,%rax),$rndkey1
3087 add \$32,%rax
3088
3089 aesenc $rndkey0,$inout0
3090 aesenc $rndkey0,$inout1
3091 aesenc $rndkey0,$inout2
3092 aesenc $rndkey0,$inout3
3093 aesenc $rndkey0,$inout4
3094 aesenc $rndkey0,$inout5
3095 $movkey -16($key,%rax),$rndkey0
3096 jnz .Locb_enc_loop6
3097
3098 aesenc $rndkey1,$inout0
3099 aesenc $rndkey1,$inout1
3100 aesenc $rndkey1,$inout2
3101 aesenc $rndkey1,$inout3
3102 aesenc $rndkey1,$inout4
3103 aesenc $rndkey1,$inout5
# reload round[1] so the next call can start without extra setup
3104 $movkey 16($key_),$rndkey1
3105 shl \$4,$i5
3106
# final round: the per-block offsets act as the last round key
3107 aesenclast @offset[0],$inout0
3108 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3109 mov %r10,%rax # restore twisted rounds
3110 aesenclast @offset[1],$inout1
3111 aesenclast @offset[2],$inout2
3112 aesenclast @offset[3],$inout3
3113 aesenclast @offset[4],$inout4
3114 aesenclast @offset[5],$inout5
3115 ret
3116.size __ocb_encrypt6,.-__ocb_encrypt6
3117
# __ocb_encrypt4 -- internal helper: four-lane counterpart of the six-lane
# routine, called from the 2-, 3- and 4-block tail paths of
# aesni_ocb_encrypt (unused lanes are zeroed by the caller).
# Inputs are inout0..inout3, with the same entry state as the six-lane
# helper: offset[5] = previous offset xored with round[last], offset[0] =
# L_0, i1/i3 = L-table byte offsets, rndkey1 = round[1], %rax = twisted
# round counter.
# Unlike the six-lane helper it does not advance the block number or the
# table indices and does not reload offset[0]; offset[0..3] are left in
# their round[last] form for the caller to pick the saved offset from.
3118.type __ocb_encrypt4,\@abi-omnipotent
3119.align 32
3120__ocb_encrypt4:
3121 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3122 movdqu ($L_p,$i1),@offset[1]
3123 movdqa @offset[0],@offset[2]
3124 movdqu ($L_p,$i3),@offset[3]
3125 pxor @offset[5],@offset[0]
3126 pxor @offset[0],@offset[1]
3127 pxor $inout0,$checksum # accumulate checksum
3128 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3129 pxor @offset[1],@offset[2]
3130 pxor $inout1,$checksum
3131 pxor @offset[1],$inout1
3132 pxor @offset[2],@offset[3]
3133 pxor $inout2,$checksum
3134 pxor @offset[2],$inout2
3135 pxor $inout3,$checksum
3136 pxor @offset[3],$inout3
3137 $movkey 32($key_),$rndkey0
3138
3139 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3140 pxor $rndkey0l,@offset[1]
3141 pxor $rndkey0l,@offset[2]
3142 pxor $rndkey0l,@offset[3]
3143
3144 aesenc $rndkey1,$inout0
3145 aesenc $rndkey1,$inout1
3146 aesenc $rndkey1,$inout2
3147 aesenc $rndkey1,$inout3
3148 $movkey 48($key_),$rndkey1
3149
3150 aesenc $rndkey0,$inout0
3151 aesenc $rndkey0,$inout1
3152 aesenc $rndkey0,$inout2
3153 aesenc $rndkey0,$inout3
3154 $movkey 64($key_),$rndkey0
3155 jmp .Locb_enc_loop4
3156
3157.align 32
3158.Locb_enc_loop4:
# two rounds per iteration, ending when the add leaves %rax at zero
3159 aesenc $rndkey1,$inout0
3160 aesenc $rndkey1,$inout1
3161 aesenc $rndkey1,$inout2
3162 aesenc $rndkey1,$inout3
3163 $movkey ($key,%rax),$rndkey1
3164 add \$32,%rax
3165
3166 aesenc $rndkey0,$inout0
3167 aesenc $rndkey0,$inout1
3168 aesenc $rndkey0,$inout2
3169 aesenc $rndkey0,$inout3
3170 $movkey -16($key,%rax),$rndkey0
3171 jnz .Locb_enc_loop4
3172
3173 aesenc $rndkey1,$inout0
3174 aesenc $rndkey1,$inout1
3175 aesenc $rndkey1,$inout2
3176 aesenc $rndkey1,$inout3
3177 $movkey 16($key_),$rndkey1
3178 mov %r10,%rax # restore twisted rounds
3179
# final round: the per-block offsets act as the last round key
3180 aesenclast @offset[0],$inout0
3181 aesenclast @offset[1],$inout1
3182 aesenclast @offset[2],$inout2
3183 aesenclast @offset[3],$inout3
3184 ret
3185.size __ocb_encrypt4,.-__ocb_encrypt4
3186
# __ocb_encrypt1 -- internal helper: OCB-encrypts the single block in
# inout0.
# The caller passes the L-table entry for this block in inout5 (the
# "borrow" noted at the call sites); offset[5] holds the previous offset
# xored with round[last].
# On return inout0 holds the output block and inout5 holds the new offset,
# again xored with round[last]; the callers copy it back into offset[5].
# The input block is xored into the checksum before it is whitened.
3187.type __ocb_encrypt1,\@abi-omnipotent
3188.align 32
3189__ocb_encrypt1:
3190 pxor @offset[5],$inout5 # offset_i
3191 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3192 pxor $inout0,$checksum # accumulate checksum
3193 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3194 $movkey 32($key_),$rndkey0
3195
3196 aesenc $rndkey1,$inout0
3197 $movkey 48($key_),$rndkey1
3198 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3199
3200 aesenc $rndkey0,$inout0
3201 $movkey 64($key_),$rndkey0
3202 jmp .Locb_enc_loop1
3203
3204.align 32
3205.Locb_enc_loop1:
# two rounds per iteration, ending when the add leaves %rax at zero
3206 aesenc $rndkey1,$inout0
3207 $movkey ($key,%rax),$rndkey1
3208 add \$32,%rax
3209
3210 aesenc $rndkey0,$inout0
3211 $movkey -16($key,%rax),$rndkey0
3212 jnz .Locb_enc_loop1
3213
3214 aesenc $rndkey1,$inout0
3215 $movkey 16($key_),$rndkey1 # redundant in tail
3216 mov %r10,%rax # restore twisted rounds
3217
# final round: the new offset acts as the last round key
3218 aesenclast $inout5,$inout0
3219 ret
3220.size __ocb_encrypt1,.-__ocb_encrypt1
3221
# aesni_ocb_decrypt -- OCB bulk-decryption entry point; same prototype and
# argument wiring as aesni_ocb_encrypt (start_block_num in %r8, offset_i
# pointer in %r9, L_ table and checksum pointers read from the caller's
# stack into %rbx and %rbp).
# Flow mirrors the encrypt side: an optional single leading block when
# start_block_num is even, a six-blocks-per-iteration grand loop, then a
# 1..5 block tail.
# Difference from encryption: the checksum is accumulated here, in the
# entry point, over the OUTPUT blocks after each helper call returns; the
# decrypt helpers called here are not passed the checksum to update.
# On exit the checksum and last offset are stored back, xmm0-xmm5 are
# cleared, and xmm6-xmm15 are cleared (non-Win64) or restored (Win64).
# NOTE(review): there is no explicit check for blocks == 0 on entry;
# presumably callers never pass zero -- confirm against the C callers.
3222.globl aesni_ocb_decrypt
3223.type aesni_ocb_decrypt,\@function,6
3224.align 32
3225aesni_ocb_decrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07003226.cfi_startproc
# keep the entry stack pointer in %rax; the stack arguments are addressed
# relative to it further down
Robert Sloana94fe052017-02-21 08:49:28 -08003227 lea (%rsp),%rax
3228 push %rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07003229.cfi_push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08003230 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003231.cfi_push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08003232 push %r12
Robert Sloanab8b8882018-03-26 11:39:51 -07003233.cfi_push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08003234 push %r13
Robert Sloanab8b8882018-03-26 11:39:51 -07003235.cfi_push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08003236 push %r14
Robert Sloanab8b8882018-03-26 11:39:51 -07003237.cfi_push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08003238___
# Win64 only: reserve 0xa0 bytes and save xmm6-xmm15 there
3239$code.=<<___ if ($win64);
3240 lea -0xa0(%rsp),%rsp
3241 movaps %xmm6,0x00(%rsp) # offload everything
3242 movaps %xmm7,0x10(%rsp)
3243 movaps %xmm8,0x20(%rsp)
3244 movaps %xmm9,0x30(%rsp)
3245 movaps %xmm10,0x40(%rsp)
3246 movaps %xmm11,0x50(%rsp)
3247 movaps %xmm12,0x60(%rsp)
3248 movaps %xmm13,0x70(%rsp)
3249 movaps %xmm14,0x80(%rsp)
3250 movaps %xmm15,0x90(%rsp)
3251.Locb_dec_body:
3252___
3253$code.=<<___;
3254 mov $seventh_arg(%rax),$L_p # 7th argument
3255 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3256
# the word at key+240 is scaled by 16 and used as a byte offset to the last
# round key (presumably it is the round count of the key schedule -- confirm)
3257 mov 240($key),$rnds_
3258 mov $key,$key_
3259 shl \$4,$rnds_
3260 $movkey ($key),$rndkey0l # round[0]
3261 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3262
3263 movdqu ($offset_p),@offset[5] # load last offset_i
3264 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3265 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3266
# Build the "twisted" round counter: 16+32 minus the scaled round offset.
# The helpers add it to the key pointer set up here and step it by 32 until
# it reaches zero.  %rax stops holding the entry stack pointer from this
# point on; the epilogue recomputes it from %rsp.
3267 mov \$16+32,$rounds
3268 lea 32($key_,$rnds_),$key
3269 $movkey 16($key_),$rndkey1 # round[1]
3270 sub %r10,%rax # twisted $rounds
3271 mov %rax,%r10 # backup twisted $rounds
3272
3273 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3274 movdqu ($checksum_p),$checksum # load checksum
3275
3276 test \$1,$block_num # is first block number odd?
3277 jnz .Locb_dec_odd
3278
# even start: handle one block with the L entry selected by the count of
# trailing zero bits of the block number, then bump the block number so it
# is odd for the bulk code
3279 bsf $block_num,$i1
3280 add \$1,$block_num
3281 shl \$4,$i1
3282 movdqu ($L_p,$i1),$inout5 # borrow
3283 movdqu ($inp),$inout0
3284 lea 16($inp),$inp
3285
3286 call __ocb_decrypt1
3287
3288 movdqa $inout5,@offset[5]
3289 movups $inout0,($out)
3290 xorps $inout0,$checksum # accumulate checksum
3291 lea 16($out),$out
3292 sub \$1,$blocks
3293 jz .Locb_dec_done
3294
3295.Locb_dec_odd:
# precompute L-table byte offsets for the three even-numbered blocks of the
# next group of six, and advance the block number past that group
3296 lea 1($block_num),$i1 # even-numbered blocks
3297 lea 3($block_num),$i3
3298 lea 5($block_num),$i5
3299 lea 6($block_num),$block_num
3300 bsf $i1,$i1 # ntz(block)
3301 bsf $i3,$i3
3302 bsf $i5,$i5
3303 shl \$4,$i1 # ntz(block) -> table offset
3304 shl \$4,$i3
3305 shl \$4,$i5
3306
# the block count is kept biased by -6 while looping; a borrow means fewer
# than six blocks remain
3307 sub \$6,$blocks
3308 jc .Locb_dec_short
3309 jmp .Locb_dec_grandloop
3310
3311.align 32
3312.Locb_dec_grandloop:
3313 movdqu `16*0`($inp),$inout0 # load input
3314 movdqu `16*1`($inp),$inout1
3315 movdqu `16*2`($inp),$inout2
3316 movdqu `16*3`($inp),$inout3
3317 movdqu `16*4`($inp),$inout4
3318 movdqu `16*5`($inp),$inout5
3319 lea `16*6`($inp),$inp
3320
3321 call __ocb_decrypt6
3322
3323 movups $inout0,`16*0`($out) # store output
3324 pxor $inout0,$checksum # accumulate checksum
3325 movups $inout1,`16*1`($out)
3326 pxor $inout1,$checksum
3327 movups $inout2,`16*2`($out)
3328 pxor $inout2,$checksum
3329 movups $inout3,`16*3`($out)
3330 pxor $inout3,$checksum
3331 movups $inout4,`16*4`($out)
3332 pxor $inout4,$checksum
3333 movups $inout5,`16*5`($out)
3334 pxor $inout5,$checksum
3335 lea `16*6`($out),$out
3336 sub \$6,$blocks
3337 jnc .Locb_dec_grandloop
3338
3339.Locb_dec_short:
# undo the -6 bias; zero means the input was a whole number of groups
3340 add \$6,$blocks
3341 jz .Locb_dec_done
3342
# dispatch on the 1..5 remaining blocks, loading inputs along the way
3343 movdqu `16*0`($inp),$inout0
3344 cmp \$2,$blocks
3345 jb .Locb_dec_one
3346 movdqu `16*1`($inp),$inout1
3347 je .Locb_dec_two
3348
3349 movdqu `16*2`($inp),$inout2
3350 cmp \$4,$blocks
3351 jb .Locb_dec_three
3352 movdqu `16*3`($inp),$inout3
3353 je .Locb_dec_four
3354
# five blocks: the sixth lane is zeroed and run through the six-way helper,
# but its output is neither stored nor added to the checksum
3355 movdqu `16*4`($inp),$inout4
3356 pxor $inout5,$inout5
3357
3358 call __ocb_decrypt6
3359
3360 movdqa @offset[4],@offset[5]
3361 movups $inout0,`16*0`($out) # store output
3362 pxor $inout0,$checksum # accumulate checksum
3363 movups $inout1,`16*1`($out)
3364 pxor $inout1,$checksum
3365 movups $inout2,`16*2`($out)
3366 pxor $inout2,$checksum
3367 movups $inout3,`16*3`($out)
3368 pxor $inout3,$checksum
3369 movups $inout4,`16*4`($out)
3370 pxor $inout4,$checksum
3371
3372 jmp .Locb_dec_done
3373
3374.align 16
3375.Locb_dec_one:
# one block: the block number is odd here, so L_0 is the table entry used
3376 movdqa @offset[0],$inout5 # borrow
3377
3378 call __ocb_decrypt1
3379
3380 movdqa $inout5,@offset[5]
3381 movups $inout0,`16*0`($out) # store output
3382 xorps $inout0,$checksum # accumulate checksum
3383 jmp .Locb_dec_done
3384
3385.align 16
3386.Locb_dec_two:
# two blocks: unused lanes are zeroed and the four-way helper is used; only
# the real output lanes are stored and added to the checksum
3387 pxor $inout2,$inout2
3388 pxor $inout3,$inout3
3389
3390 call __ocb_decrypt4
3391
3392 movdqa @offset[1],@offset[5]
3393 movups $inout0,`16*0`($out) # store output
3394 xorps $inout0,$checksum # accumulate checksum
3395 movups $inout1,`16*1`($out)
3396 xorps $inout1,$checksum
3397
3398 jmp .Locb_dec_done
3399
3400.align 16
3401.Locb_dec_three:
3402 pxor $inout3,$inout3
3403
3404 call __ocb_decrypt4
3405
3406 movdqa @offset[2],@offset[5]
3407 movups $inout0,`16*0`($out) # store output
3408 xorps $inout0,$checksum # accumulate checksum
3409 movups $inout1,`16*1`($out)
3410 xorps $inout1,$checksum
3411 movups $inout2,`16*2`($out)
3412 xorps $inout2,$checksum
3413
3414 jmp .Locb_dec_done
3415
3416.align 16
3417.Locb_dec_four:
3418 call __ocb_decrypt4
3419
3420 movdqa @offset[3],@offset[5]
3421 movups $inout0,`16*0`($out) # store output
3422 pxor $inout0,$checksum # accumulate checksum
3423 movups $inout1,`16*1`($out)
3424 pxor $inout1,$checksum
3425 movups $inout2,`16*2`($out)
3426 pxor $inout2,$checksum
3427 movups $inout3,`16*3`($out)
3428 pxor $inout3,$checksum
3429
3430.Locb_dec_done:
# the saved offset is still xored with the last round key (see the setup
# above), so strip it before handing the offset back to the caller
3431 pxor $rndkey0,@offset[5] # "remove" round[last]
3432 movdqu $checksum,($checksum_p) # store checksum
3433 movdqu @offset[5],($offset_p) # store last offset_i
3434
3435 xorps %xmm0,%xmm0 # clear register bank
3436 pxor %xmm1,%xmm1
3437 pxor %xmm2,%xmm2
3438 pxor %xmm3,%xmm3
3439 pxor %xmm4,%xmm4
3440 pxor %xmm5,%xmm5
3441___
3442$code.=<<___ if (!$win64);
3443 pxor %xmm6,%xmm6
3444 pxor %xmm7,%xmm7
3445 pxor %xmm8,%xmm8
3446 pxor %xmm9,%xmm9
3447 pxor %xmm10,%xmm10
3448 pxor %xmm11,%xmm11
3449 pxor %xmm12,%xmm12
3450 pxor %xmm13,%xmm13
3451 pxor %xmm14,%xmm14
3452 pxor %xmm15,%xmm15
# 0x28 = the five 8-byte pushes made in the prologue
3453 lea 0x28(%rsp),%rax
Robert Sloanab8b8882018-03-26 11:39:51 -07003454.cfi_def_cfa %rax,8
Robert Sloana94fe052017-02-21 08:49:28 -08003455___
# Win64 only: restore xmm6-xmm15 and overwrite each save slot with zero
3456$code.=<<___ if ($win64);
3457 movaps 0x00(%rsp),%xmm6
3458 movaps %xmm0,0x00(%rsp) # clear stack
3459 movaps 0x10(%rsp),%xmm7
3460 movaps %xmm0,0x10(%rsp)
3461 movaps 0x20(%rsp),%xmm8
3462 movaps %xmm0,0x20(%rsp)
3463 movaps 0x30(%rsp),%xmm9
3464 movaps %xmm0,0x30(%rsp)
3465 movaps 0x40(%rsp),%xmm10
3466 movaps %xmm0,0x40(%rsp)
3467 movaps 0x50(%rsp),%xmm11
3468 movaps %xmm0,0x50(%rsp)
3469 movaps 0x60(%rsp),%xmm12
3470 movaps %xmm0,0x60(%rsp)
3471 movaps 0x70(%rsp),%xmm13
3472 movaps %xmm0,0x70(%rsp)
3473 movaps 0x80(%rsp),%xmm14
3474 movaps %xmm0,0x80(%rsp)
3475 movaps 0x90(%rsp),%xmm15
3476 movaps %xmm0,0x90(%rsp)
# 0xa0 xmm save area plus the 0x28 bytes of pushed registers
3477 lea 0xa0+0x28(%rsp),%rax
3478.Locb_dec_pop:
3479___
3480$code.=<<___;
# reload the callee-saved registers pushed in the prologue, addressed
# downwards from the recomputed entry stack pointer in %rax
3481 mov -40(%rax),%r14
Robert Sloanab8b8882018-03-26 11:39:51 -07003482.cfi_restore %r14
Robert Sloana94fe052017-02-21 08:49:28 -08003483 mov -32(%rax),%r13
Robert Sloanab8b8882018-03-26 11:39:51 -07003484.cfi_restore %r13
Robert Sloana94fe052017-02-21 08:49:28 -08003485 mov -24(%rax),%r12
Robert Sloanab8b8882018-03-26 11:39:51 -07003486.cfi_restore %r12
Robert Sloana94fe052017-02-21 08:49:28 -08003487 mov -16(%rax),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003488.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08003489 mov -8(%rax),%rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07003490.cfi_restore %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08003491 lea (%rax),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07003492.cfi_def_cfa_register %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08003493.Locb_dec_epilogue:
3494 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07003495.cfi_endproc
Robert Sloana94fe052017-02-21 08:49:28 -08003496.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
3497
# __ocb_decrypt6 -- internal helper: OCB-decrypts the six blocks held in
# inout0..inout5 in place.
# Expected on entry (as set up by aesni_ocb_decrypt):
#   offset[5]  previous offset xored with the last round key
#   offset[0]  L_0
#   i1/i3/i5   L-table byte offsets for the three even-numbered blocks
#   rndkey0l   round[0] xored with round[last]; rndkey1 = round[1]
#   %rax       the twisted round counter (a copy is kept in %r10)
# The six offsets form a chain: each one is the previous offset xored with
# L_0 (odd block) or with the table entry selected by i1/i3/i5 (even block).
# The offsets, re-keyed to round[last], are fed to aesdeclast as the final
# round key, so every output block comes out already xored with its offset.
# This routine does not touch the checksum; the caller accumulates it over
# the outputs after the call.
# Side work done between the AES rounds: i1/i3/i5 and the block number are
# advanced for the next group, offset[0] is reloaded with L_0, rndkey1 is
# reloaded with round[1] and %rax is restored from %r10.
3498.type __ocb_decrypt6,\@abi-omnipotent
3499.align 32
3500__ocb_decrypt6:
3501 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3502 movdqu ($L_p,$i1),@offset[1]
3503 movdqa @offset[0],@offset[2]
3504 movdqu ($L_p,$i3),@offset[3]
3505 movdqa @offset[0],@offset[4]
3506 pxor @offset[5],@offset[0]
3507 movdqu ($L_p,$i5),@offset[5]
3508 pxor @offset[0],@offset[1]
3509 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3510 pxor @offset[1],@offset[2]
3511 pxor @offset[1],$inout1
3512 pxor @offset[2],@offset[3]
3513 pxor @offset[2],$inout2
3514 pxor @offset[3],@offset[4]
3515 pxor @offset[3],$inout3
3516 pxor @offset[4],@offset[5]
3517 pxor @offset[4],$inout4
3518 pxor @offset[5],$inout5
3519 $movkey 32($key_),$rndkey0
3520
# start computing the table indices for the NEXT group of six; the shifts
# that finish this are spread out below (presumably for instruction
# scheduling -- the shift for i5 only happens after the round loop)
3521 lea 1($block_num),$i1 # even-numbered blocks
3522 lea 3($block_num),$i3
3523 lea 5($block_num),$i5
3524 add \$6,$block_num
3525 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3526 bsf $i1,$i1 # ntz(block)
3527 bsf $i3,$i3
3528 bsf $i5,$i5
3529
# round 1 on all lanes, interleaved with switching the remaining offsets
# from the round[0] form to the round[last] form
3530 aesdec $rndkey1,$inout0
3531 aesdec $rndkey1,$inout1
3532 aesdec $rndkey1,$inout2
3533 aesdec $rndkey1,$inout3
3534 pxor $rndkey0l,@offset[1]
3535 pxor $rndkey0l,@offset[2]
3536 aesdec $rndkey1,$inout4
3537 pxor $rndkey0l,@offset[3]
3538 pxor $rndkey0l,@offset[4]
3539 aesdec $rndkey1,$inout5
3540 $movkey 48($key_),$rndkey1
3541 pxor $rndkey0l,@offset[5]
3542
3543 aesdec $rndkey0,$inout0
3544 aesdec $rndkey0,$inout1
3545 aesdec $rndkey0,$inout2
3546 aesdec $rndkey0,$inout3
3547 aesdec $rndkey0,$inout4
3548 aesdec $rndkey0,$inout5
3549 $movkey 64($key_),$rndkey0
3550 shl \$4,$i1 # ntz(block) -> table offset
3551 shl \$4,$i3
3552 jmp .Locb_dec_loop6
3553
3554.align 32
3555.Locb_dec_loop6:
# two rounds per iteration; %rax steps by 32 and the loop ends when the add
# leaves it at zero (the key loads in between do not touch the flags)
3556 aesdec $rndkey1,$inout0
3557 aesdec $rndkey1,$inout1
3558 aesdec $rndkey1,$inout2
3559 aesdec $rndkey1,$inout3
3560 aesdec $rndkey1,$inout4
3561 aesdec $rndkey1,$inout5
3562 $movkey ($key,%rax),$rndkey1
3563 add \$32,%rax
3564
3565 aesdec $rndkey0,$inout0
3566 aesdec $rndkey0,$inout1
3567 aesdec $rndkey0,$inout2
3568 aesdec $rndkey0,$inout3
3569 aesdec $rndkey0,$inout4
3570 aesdec $rndkey0,$inout5
3571 $movkey -16($key,%rax),$rndkey0
3572 jnz .Locb_dec_loop6
3573
3574 aesdec $rndkey1,$inout0
3575 aesdec $rndkey1,$inout1
3576 aesdec $rndkey1,$inout2
3577 aesdec $rndkey1,$inout3
3578 aesdec $rndkey1,$inout4
3579 aesdec $rndkey1,$inout5
# reload round[1] so the next call can start without extra setup
3580 $movkey 16($key_),$rndkey1
3581 shl \$4,$i5
3582
# final round: the per-block offsets act as the last round key
3583 aesdeclast @offset[0],$inout0
3584 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3585 mov %r10,%rax # restore twisted rounds
3586 aesdeclast @offset[1],$inout1
3587 aesdeclast @offset[2],$inout2
3588 aesdeclast @offset[3],$inout3
3589 aesdeclast @offset[4],$inout4
3590 aesdeclast @offset[5],$inout5
3591 ret
3592.size __ocb_decrypt6,.-__ocb_decrypt6
3593
# __ocb_decrypt4: four-block variant of the helper above (declared
# abi-omnipotent). Works on inout0..inout3 with offset[0]..offset[3]; only
# the table entries selected by i1 and i3 are fetched from L_p.
#
# As in the six-block version, each block is xored with its offset before
# the first aesdec, and the offsets are xored with rndkey0l a second time so
# that they can be used as the aesdeclast key operand. Unlike the six-block
# version, nothing here touches block_num or recomputes i1/i3, and offset[0]
# is not reloaded on exit. rax is reloaded from r10 and rndkey1 from
# 16(key_) before returning.
3594.type __ocb_decrypt4,\@abi-omnipotent
3595.align 32
3596__ocb_decrypt4:
3597 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3598 movdqu ($L_p,$i1),@offset[1]
3599 movdqa @offset[0],@offset[2]
3600 movdqu ($L_p,$i3),@offset[3]
3601 pxor @offset[5],@offset[0]
3602 pxor @offset[0],@offset[1]
3603 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3604 pxor @offset[1],@offset[2]
3605 pxor @offset[1],$inout1
3606 pxor @offset[2],@offset[3]
3607 pxor @offset[2],$inout2
3608 pxor @offset[3],$inout3
3609 $movkey 32($key_),$rndkey0
3610
3611 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3612 pxor $rndkey0l,@offset[1]
3613 pxor $rndkey0l,@offset[2]
3614 pxor $rndkey0l,@offset[3]
3615
3616 aesdec $rndkey1,$inout0
3617 aesdec $rndkey1,$inout1
3618 aesdec $rndkey1,$inout2
3619 aesdec $rndkey1,$inout3
3620 $movkey 48($key_),$rndkey1
3621
3622 aesdec $rndkey0,$inout0
3623 aesdec $rndkey0,$inout1
3624 aesdec $rndkey0,$inout2
3625 aesdec $rndkey0,$inout3
3626 $movkey 64($key_),$rndkey0
3627 jmp .Locb_dec_loop4
3628
3629.align 32
3630.Locb_dec_loop4:
3631 aesdec $rndkey1,$inout0
3632 aesdec $rndkey1,$inout1
3633 aesdec $rndkey1,$inout2
3634 aesdec $rndkey1,$inout3
3635 $movkey ($key,%rax),$rndkey1
3636 add \$32,%rax
3637
3638 aesdec $rndkey0,$inout0
3639 aesdec $rndkey0,$inout1
3640 aesdec $rndkey0,$inout2
3641 aesdec $rndkey0,$inout3
3642 $movkey -16($key,%rax),$rndkey0
3643 jnz .Locb_dec_loop4 # tests the add to rax above; presumably the key load in between leaves flags alone
3644
3645 aesdec $rndkey1,$inout0
3646 aesdec $rndkey1,$inout1
3647 aesdec $rndkey1,$inout2
3648 aesdec $rndkey1,$inout3
3649 $movkey 16($key_),$rndkey1
3650 mov %r10,%rax # restore twisted rounds
3651
3652 aesdeclast @offset[0],$inout0
3653 aesdeclast @offset[1],$inout1
3654 aesdeclast @offset[2],$inout2
3655 aesdeclast @offset[3],$inout3
3656 ret
3657.size __ocb_decrypt4,.-__ocb_decrypt4
3658
# __ocb_decrypt1: single-block variant (declared abi-omnipotent). The block
# is in inout0; inout5 serves as the offset register. inout5 is first xored
# with offset[5], so whatever the caller left in inout5 is part of the
# offset (the existing comment names the result offset_i). It is then xored
# with rndkey0l twice: once before being applied to the input, and once more
# so that it can be used as the aesdeclast key operand.
# rax is reloaded from r10 and rndkey1 from 16(key_) before returning.
3659.type __ocb_decrypt1,\@abi-omnipotent
3660.align 32
3661__ocb_decrypt1:
3662 pxor @offset[5],$inout5 # offset_i
3663 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3664 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3665 $movkey 32($key_),$rndkey0
3666
3667 aesdec $rndkey1,$inout0
3668 $movkey 48($key_),$rndkey1
3669 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3670
3671 aesdec $rndkey0,$inout0
3672 $movkey 64($key_),$rndkey0
3673 jmp .Locb_dec_loop1
3674
3675.align 32
3676.Locb_dec_loop1:
3677 aesdec $rndkey1,$inout0
3678 $movkey ($key,%rax),$rndkey1
3679 add \$32,%rax
3680
3681 aesdec $rndkey0,$inout0
3682 $movkey -16($key,%rax),$rndkey0
3683 jnz .Locb_dec_loop1 # tests the add to rax above; presumably the key load in between leaves flags alone
3684
3685 aesdec $rndkey1,$inout0
3686 $movkey 16($key_),$rndkey1 # redundant in tail
3687 mov %r10,%rax # restore twisted rounds
3688
3689 aesdeclast $inout5,$inout0
3690 ret
3691.size __ocb_decrypt1,.-__ocb_decrypt1
3692___
Adam Langleyd9e397b2015-01-22 14:27:53 -08003693} }}
3694
3695########################################################################
3696# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3697# size_t length, const AES_KEY *key,
3698# unsigned char *ivp,const int enc);
3699{
# Stack frame used by the bulk-decrypt path only: 0x10 bytes of scratch at
# (%rsp), plus 0xa0 bytes on Win64 where %xmm6..%xmm15 are spilled to
# 0x10(%rsp)..0xa0(%rsp).
# $iv and $in0..$in4 are mapped to %xmm10..%xmm15; the decrypt code keeps
# the running IV and copies of ciphertext blocks in them.
3700my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3701my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
Adam Langleyd9e397b2015-01-22 14:27:53 -08003702
# Entry point and first half of the CBC-encrypt loop.
#   - zero length returns immediately through .Lcbc_ret;
#   - the round count is read from offset 240 of the key structure and the
#     key pointer is backed up in $key_;
#   - %r9d (the sixth argument, enc) selects the direction: zero branches to
#     .Lcbc_decrypt, anything else falls through into the encrypt path;
#   - fewer than 16 bytes go straight to .Lcbc_enc_tail, otherwise $len is
#     biased by -16 before entering .Lcbc_enc_loop.
3703$code.=<<___;
3704.globl ${PREFIX}_cbc_encrypt
3705.type ${PREFIX}_cbc_encrypt,\@function,6
3706.align 16
3707${PREFIX}_cbc_encrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07003708.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08003709 test $len,$len # check length
3710 jz .Lcbc_ret
3711
3712 mov 240($key),$rnds_ # key->rounds
3713 mov $key,$key_ # backup $key
3714 test %r9d,%r9d # 6th argument
3715 jz .Lcbc_decrypt
3716#--------------------------- CBC ENCRYPT ------------------------------#
3717 movups ($ivp),$inout0 # load iv as initial state
3718 mov $rnds_,$rounds
3719 cmp \$16,$len
3720 jb .Lcbc_enc_tail
3721 sub \$16,$len
3722 jmp .Lcbc_enc_loop
3723.align 16
3724.Lcbc_enc_loop:
3725 movups ($inp),$inout1 # load input
3726 lea 16($inp),$inp
3727 #xorps $inout1,$inout0
3728___
# NOTE(review): the xor of the input block into the running state is
# commented out just above; presumably aesni_generate1 performs it when it is
# handed the extra $inout0,$inout1 arguments -- confirm against its
# definition, which is outside this part of the file.
3729 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
# Second half of the CBC-encrypt loop, the partial-block tail, and the start
# of the decrypt path.
#   - After each block the loop restores $rounds/$key from their backups,
#     stores the ciphertext and subtracts 16 from the (already -16 biased)
#     length; the loop ends on borrow.
#   - If nothing is left, the last ciphertext block is written back through
#     $ivp and the working registers are wiped.
#   - .Lcbc_enc_tail copies the remaining bytes to the output buffer, pads
#     them with zeros up to 16 bytes there, and then runs that block through
#     .Lcbc_enc_loop once more with input and output pointing at the same
#     place.
#   - .Lcbc_decrypt special-cases a length of exactly 16 so that a lone block
#     is decrypted without setting up a stack frame; every other length goes
#     to .Lcbc_decrypt_bulk.
3730$code.=<<___;
3731 mov $rnds_,$rounds # restore $rounds
3732 mov $key_,$key # restore $key
3733 movups $inout0,0($out) # store output
3734 lea 16($out),$out
3735 sub \$16,$len
3736 jnc .Lcbc_enc_loop
3737 add \$16,$len
3738 jnz .Lcbc_enc_tail
Adam Langleye9ada862015-05-11 17:20:37 -07003739 pxor $rndkey0,$rndkey0 # clear register bank
3740 pxor $rndkey1,$rndkey1
Adam Langleyd9e397b2015-01-22 14:27:53 -08003741 movups $inout0,($ivp)
Adam Langleye9ada862015-05-11 17:20:37 -07003742 pxor $inout0,$inout0
3743 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -08003744 jmp .Lcbc_ret
3745
3746.Lcbc_enc_tail:
3747 mov $len,%rcx # zaps $key
3748 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3749 .long 0x9066A4F3 # rep movsb
3750 mov \$16,%ecx # zero tail
3751 sub $len,%rcx
3752 xor %eax,%eax
3753 .long 0x9066AAF3 # rep stosb
3754 lea -16(%rdi),%rdi # rewind $out by 1 block
3755 mov $rnds_,$rounds # restore $rounds
3756 mov %rdi,%rsi # $inp and $out are the same
3757 mov $key_,$key # restore $key
3758 xor $len,$len # len becomes 0, which under the loop's -16 bias means exactly one block (16 bytes) left
3759 jmp .Lcbc_enc_loop # one more spin
3760 #--------------------------- CBC DECRYPT ------------------------------#
3761.align 16
3762.Lcbc_decrypt:
Adam Langleye9ada862015-05-11 17:20:37 -07003763 cmp \$16,$len
3764 jne .Lcbc_decrypt_bulk
3765
3766 # handle single block without allocating stack frame,
3767 # useful in ciphertext stealing mode
3768 movdqu ($inp),$inout0 # load input
3769 movdqu ($ivp),$inout1 # load iv
3770 movdqa $inout0,$inout2 # future iv
3771___
3772 &aesni_generate1("dec",$key,$rnds_);
# Finish of the single-block decrypt, followed by the bulk-decrypt prologue.
#   - Single block: the saved ciphertext ($inout2) is stored as the new IV,
#     the old IV is xored into the decrypted block, the result is stored and
#     all working registers are wiped before leaving through .Lcbc_ret.
#   - .Lcbc_decrypt_bulk: %r11 keeps the entry value of %rsp and acts as the
#     CFA register, %rbp is pushed, $frame_size bytes are reserved and %rsp
#     is aligned down to 16 bytes. The epilogue restores %rbp from -8(%r11)
#     and %rsp from %r11.
3773$code.=<<___;
3774 pxor $rndkey0,$rndkey0 # clear register bank
3775 pxor $rndkey1,$rndkey1
3776 movdqu $inout2,($ivp) # store iv
3777 xorps $inout1,$inout0 # ^=iv
3778 pxor $inout1,$inout1
3779 movups $inout0,($out) # store output
3780 pxor $inout0,$inout0
3781 jmp .Lcbc_ret
3782.align 16
3783.Lcbc_decrypt_bulk:
Robert Sloana94fe052017-02-21 08:49:28 -08003784 lea (%rsp),%r11 # frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07003785.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08003786 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003787.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08003788 sub \$$frame_size,%rsp
3789 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3790___
# Win64 only: spill %xmm6..%xmm15 into the frame at 0x10(%rsp)..0xa0(%rsp).
# The matching reloads are in the epilogue after .Lcbc_dec_ret.
3791$code.=<<___ if ($win64);
3792 movaps %xmm6,0x10(%rsp)
3793 movaps %xmm7,0x20(%rsp)
3794 movaps %xmm8,0x30(%rsp)
3795 movaps %xmm9,0x40(%rsp)
3796 movaps %xmm10,0x50(%rsp)
3797 movaps %xmm11,0x60(%rsp)
3798 movaps %xmm12,0x70(%rsp)
3799 movaps %xmm13,0x80(%rsp)
3800 movaps %xmm14,0x90(%rsp)
3801 movaps %xmm15,0xa0(%rsp)
3802.Lcbc_decrypt_body:
3803___
Robert Sloana94fe052017-02-21 08:49:28 -08003804
# From here on both $key_ (key backup) and $inp_ (look-ahead input pointer of
# the 8x loop) name the same register, %rbp, which was saved by the push
# above.
3805my $inp_=$key_="%rbp"; # reassign $key_
3806
# Bulk-decrypt setup and head of the 8x loop.
#   - Lengths of 0x50 bytes or less go straight to .Lcbc_dec_tail.
#   - Otherwise six input blocks are preloaded into $inout0..$inout5, with
#     copies of the first five kept in $in0..$in4 for the CBC xor later on.
#   - Lengths of 0x70 bytes or less go to .Lcbc_dec_six_or_seven.
#   - Otherwise the dword at offset 4 of OPENSSL_ia32cap_P picks the loop:
#     MOVBE set with XSAVE clear selects the 6x loop, anything else the 8x
#     loop. $key is advanced by 0x70 for the 8x loop, which is why all round
#     key displacements there carry a -0x70 term.
# This heredoc ends in the middle of .Lcbc_dec_loop8_enter; the remaining
# rounds are emitted by the for-loop that follows.
Adam Langleyd9e397b2015-01-22 14:27:53 -08003807$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08003808 mov $key,$key_ # [re-]backup $key [after reassignment]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003809 movups ($ivp),$iv
3810 mov $rnds_,$rounds
3811 cmp \$0x50,$len
3812 jbe .Lcbc_dec_tail
3813
3814 $movkey ($key),$rndkey0
3815 movdqu 0x00($inp),$inout0 # load input
3816 movdqu 0x10($inp),$inout1
3817 movdqa $inout0,$in0
3818 movdqu 0x20($inp),$inout2
3819 movdqa $inout1,$in1
3820 movdqu 0x30($inp),$inout3
3821 movdqa $inout2,$in2
3822 movdqu 0x40($inp),$inout4
3823 movdqa $inout3,$in3
3824 movdqu 0x50($inp),$inout5
3825 movdqa $inout4,$in4
Robert Sloan2424d842017-05-01 07:46:28 -07003826 leaq OPENSSL_ia32cap_P(%rip),%r9
Robert Sloan572a4e22017-04-17 10:52:19 -07003827 mov 4(%r9),%r9d
Adam Langleyd9e397b2015-01-22 14:27:53 -08003828 cmp \$0x70,$len
3829 jbe .Lcbc_dec_six_or_seven
3830
Adam Langleye9ada862015-05-11 17:20:37 -07003831 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3832 sub \$0x50,$len # $len is biased by -5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08003833 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
Adam Langleye9ada862015-05-11 17:20:37 -07003834 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3835 sub \$0x20,$len # $len is biased by -7*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08003836 lea 0x70($key),$key # size optimization
3837 jmp .Lcbc_dec_loop8_enter
3838.align 16
3839.Lcbc_dec_loop8:
3840 movups $inout7,($out)
3841 lea 0x10($out),$out
3842.Lcbc_dec_loop8_enter:
3843 movdqu 0x60($inp),$inout6
3844 pxor $rndkey0,$inout0
3845 movdqu 0x70($inp),$inout7
3846 pxor $rndkey0,$inout1
3847 $movkey 0x10-0x70($key),$rndkey1
3848 pxor $rndkey0,$inout2
# The mov/cmp here and the adc/and/add further down compute the look-ahead
# pointer without a branch: the cmp sets carry when the biased length is
# below 0x70, adc turns the -1 into 0 in that case (it stays -1 otherwise),
# the and leaves 0 or 128, and the add yields either the current input
# pointer or the input pointer plus 128.
Robert Sloana94fe052017-02-21 08:49:28 -08003849 mov \$-1,$inp_
Adam Langleyd9e397b2015-01-22 14:27:53 -08003850 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3851 pxor $rndkey0,$inout3
3852 pxor $rndkey0,$inout4
3853 pxor $rndkey0,$inout5
3854 pxor $rndkey0,$inout6
3855
3856 aesdec $rndkey1,$inout0
3857 pxor $rndkey0,$inout7
3858 $movkey 0x20-0x70($key),$rndkey0
3859 aesdec $rndkey1,$inout1
3860 aesdec $rndkey1,$inout2
3861 aesdec $rndkey1,$inout3
3862 aesdec $rndkey1,$inout4
3863 aesdec $rndkey1,$inout5
3864 aesdec $rndkey1,$inout6
Robert Sloana94fe052017-02-21 08:49:28 -08003865 adc \$0,$inp_
3866 and \$128,$inp_
Adam Langleyd9e397b2015-01-22 14:27:53 -08003867 aesdec $rndkey1,$inout7
3868 add $inp,$inp_
3869 $movkey 0x30-0x70($key),$rndkey1
3870___
# Emit the remaining 8-way aesdec rounds. Odd $i uses $rndkey0, even $i uses
# $rndkey1, and each round is followed by a load of a later round key into
# the register just consumed. $rounds is compared with 11 once (emitted
# before the $i==7 round); the resulting flags steer three exits to
# .Lcbc_dec_done: jb after the $i==7 round, je after $i==9 and an
# unconditional jmp after $i==11. Judging by the 9 and 11 stored by the
# key-setup code, these presumably correspond to 128-, 192- and 256-bit
# keys. The nop lines are emitted for $i<6 and for even $i>7.
3871for($i=1;$i<12;$i++) {
3872my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3873$code.=<<___ if ($i==7);
3874 cmp \$11,$rounds
3875___
3876$code.=<<___;
3877 aesdec $rndkeyx,$inout0
3878 aesdec $rndkeyx,$inout1
3879 aesdec $rndkeyx,$inout2
3880 aesdec $rndkeyx,$inout3
3881 aesdec $rndkeyx,$inout4
3882 aesdec $rndkeyx,$inout5
3883 aesdec $rndkeyx,$inout6
3884 aesdec $rndkeyx,$inout7
3885 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3886___
3887$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3888 nop
3889___
3890$code.=<<___ if ($i==7);
3891 jb .Lcbc_dec_done
3892___
3893$code.=<<___ if ($i==9);
3894 je .Lcbc_dec_done
3895___
3896$code.=<<___ if ($i==11);
3897 jmp .Lcbc_dec_done
3898___
3899}
# Back half of the 8x loop and the multi-block tails.
#   .Lcbc_dec_done          last aesdec round plus aesdeclast for the eight
#                           blocks. The aesdeclast key operands are the IV
#                           and the preceding ciphertext blocks xored with
#                           $rndkey0, so the CBC xor is folded into the final
#                           round. The next six blocks are fetched through
#                           $inp_ while this happens, and the last
#                           ciphertext block becomes the new $iv.
#   .Lcbc_dec_six_or_seven  exactly six or seven blocks left; the six-block
#                           case is handled inline, seven goes to
#                           .Lcbc_dec_seven (uses _aesni_decrypt8 with a
#                           zeroed eighth lane).
#   .Lcbc_dec_loop6         6x loop built around _aesni_decrypt6; $key and
#                           $rounds are restored from their backups after
#                           every call.
#   .Lcbc_dec_tail          one to five blocks left: dispatches to
#                           .Lcbc_dec_one/two/three/four, or handles five
#                           inline.
# Every tail leaves its last plaintext block in $inout0 and jumps to
# .Lcbc_dec_tail_collected (or the _clear_ variant), which stores it.
# This heredoc stops just after .Lcbc_dec_one saves the ciphertext; the
# single-block decryption itself is emitted by aesni_generate1.
3900$code.=<<___;
3901.align 16
3902.Lcbc_dec_done:
3903 aesdec $rndkey1,$inout0
3904 aesdec $rndkey1,$inout1
3905 pxor $rndkey0,$iv
3906 pxor $rndkey0,$in0
3907 aesdec $rndkey1,$inout2
3908 aesdec $rndkey1,$inout3
3909 pxor $rndkey0,$in1
3910 pxor $rndkey0,$in2
3911 aesdec $rndkey1,$inout4
3912 aesdec $rndkey1,$inout5
3913 pxor $rndkey0,$in3
3914 pxor $rndkey0,$in4
3915 aesdec $rndkey1,$inout6
3916 aesdec $rndkey1,$inout7
3917 movdqu 0x50($inp),$rndkey1
3918
3919 aesdeclast $iv,$inout0
3920 movdqu 0x60($inp),$iv # borrow $iv
3921 pxor $rndkey0,$rndkey1
3922 aesdeclast $in0,$inout1
3923 pxor $rndkey0,$iv
3924 movdqu 0x70($inp),$rndkey0 # next IV
3925 aesdeclast $in1,$inout2
3926 lea 0x80($inp),$inp
3927 movdqu 0x00($inp_),$in0
3928 aesdeclast $in2,$inout3
3929 aesdeclast $in3,$inout4
3930 movdqu 0x10($inp_),$in1
3931 movdqu 0x20($inp_),$in2
3932 aesdeclast $in4,$inout5
3933 aesdeclast $rndkey1,$inout6
3934 movdqu 0x30($inp_),$in3
3935 movdqu 0x40($inp_),$in4
3936 aesdeclast $iv,$inout7
3937 movdqa $rndkey0,$iv # return $iv
3938 movdqu 0x50($inp_),$rndkey1
3939 $movkey -0x70($key),$rndkey0
3940
3941 movups $inout0,($out) # store output
3942 movdqa $in0,$inout0
3943 movups $inout1,0x10($out)
3944 movdqa $in1,$inout1
3945 movups $inout2,0x20($out)
3946 movdqa $in2,$inout2
3947 movups $inout3,0x30($out)
3948 movdqa $in3,$inout3
3949 movups $inout4,0x40($out)
3950 movdqa $in4,$inout4
3951 movups $inout5,0x50($out)
3952 movdqa $rndkey1,$inout5
3953 movups $inout6,0x60($out)
3954 lea 0x70($out),$out
3955
3956 sub \$0x80,$len
3957 ja .Lcbc_dec_loop8
3958
3959 movaps $inout7,$inout0
3960 lea -0x70($key),$key
3961 add \$0x70,$len
Adam Langleye9ada862015-05-11 17:20:37 -07003962 jle .Lcbc_dec_clear_tail_collected
Adam Langleyd9e397b2015-01-22 14:27:53 -08003963 movups $inout7,($out)
3964 lea 0x10($out),$out
3965 cmp \$0x50,$len
3966 jbe .Lcbc_dec_tail
3967
3968 movaps $in0,$inout0
3969.Lcbc_dec_six_or_seven:
3970 cmp \$0x60,$len
3971 ja .Lcbc_dec_seven
3972
3973 movaps $inout5,$inout6
3974 call _aesni_decrypt6
3975 pxor $iv,$inout0 # ^= IV
3976 movaps $inout6,$iv
3977 pxor $in0,$inout1
3978 movdqu $inout0,($out)
3979 pxor $in1,$inout2
3980 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07003981 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08003982 pxor $in2,$inout3
3983 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07003984 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08003985 pxor $in3,$inout4
3986 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07003987 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08003988 pxor $in4,$inout5
3989 movdqu $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -07003990 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -08003991 lea 0x50($out),$out
3992 movdqa $inout5,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07003993 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08003994 jmp .Lcbc_dec_tail_collected
3995
3996.align 16
3997.Lcbc_dec_seven:
3998 movups 0x60($inp),$inout6
3999 xorps $inout7,$inout7
4000 call _aesni_decrypt8
4001 movups 0x50($inp),$inout7
4002 pxor $iv,$inout0 # ^= IV
4003 movups 0x60($inp),$iv
4004 pxor $in0,$inout1
4005 movdqu $inout0,($out)
4006 pxor $in1,$inout2
4007 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004008 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004009 pxor $in2,$inout3
4010 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004011 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004012 pxor $in3,$inout4
4013 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004014 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08004015 pxor $in4,$inout5
4016 movdqu $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004017 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -08004018 pxor $inout7,$inout6
4019 movdqu $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004020 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08004021 lea 0x60($out),$out
4022 movdqa $inout6,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004023 pxor $inout6,$inout6
4024 pxor $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -08004025 jmp .Lcbc_dec_tail_collected
4026
4027.align 16
4028.Lcbc_dec_loop6:
4029 movups $inout5,($out)
4030 lea 0x10($out),$out
4031 movdqu 0x00($inp),$inout0 # load input
4032 movdqu 0x10($inp),$inout1
4033 movdqa $inout0,$in0
4034 movdqu 0x20($inp),$inout2
4035 movdqa $inout1,$in1
4036 movdqu 0x30($inp),$inout3
4037 movdqa $inout2,$in2
4038 movdqu 0x40($inp),$inout4
4039 movdqa $inout3,$in3
4040 movdqu 0x50($inp),$inout5
4041 movdqa $inout4,$in4
4042.Lcbc_dec_loop6_enter:
4043 lea 0x60($inp),$inp
4044 movdqa $inout5,$inout6
4045
4046 call _aesni_decrypt6
4047
4048 pxor $iv,$inout0 # ^= IV
4049 movdqa $inout6,$iv
4050 pxor $in0,$inout1
4051 movdqu $inout0,($out)
4052 pxor $in1,$inout2
4053 movdqu $inout1,0x10($out)
4054 pxor $in2,$inout3
4055 movdqu $inout2,0x20($out)
4056 pxor $in3,$inout4
4057 mov $key_,$key
4058 movdqu $inout3,0x30($out)
4059 pxor $in4,$inout5
4060 mov $rnds_,$rounds
4061 movdqu $inout4,0x40($out)
4062 lea 0x50($out),$out
4063 sub \$0x60,$len
4064 ja .Lcbc_dec_loop6
4065
4066 movdqa $inout5,$inout0
4067 add \$0x50,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004068 jle .Lcbc_dec_clear_tail_collected
Adam Langleyd9e397b2015-01-22 14:27:53 -08004069 movups $inout5,($out)
4070 lea 0x10($out),$out
4071
4072.Lcbc_dec_tail:
4073 movups ($inp),$inout0
4074 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004075 jbe .Lcbc_dec_one # $len is 1*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004076
4077 movups 0x10($inp),$inout1
4078 movaps $inout0,$in0
4079 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004080 jbe .Lcbc_dec_two # $len is 2*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004081
4082 movups 0x20($inp),$inout2
4083 movaps $inout1,$in1
4084 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004085 jbe .Lcbc_dec_three # $len is 3*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004086
4087 movups 0x30($inp),$inout3
4088 movaps $inout2,$in2
4089 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004090 jbe .Lcbc_dec_four # $len is 4*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004091
Adam Langleye9ada862015-05-11 17:20:37 -07004092 movups 0x40($inp),$inout4 # $len is 5*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004093 movaps $inout3,$in3
4094 movaps $inout4,$in4
4095 xorps $inout5,$inout5
4096 call _aesni_decrypt6
4097 pxor $iv,$inout0
4098 movaps $in4,$iv
4099 pxor $in0,$inout1
4100 movdqu $inout0,($out)
4101 pxor $in1,$inout2
4102 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004103 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004104 pxor $in2,$inout3
4105 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004106 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004107 pxor $in3,$inout4
4108 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004109 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08004110 lea 0x40($out),$out
4111 movdqa $inout4,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004112 pxor $inout4,$inout4
4113 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08004114 sub \$0x10,$len
4115 jmp .Lcbc_dec_tail_collected
4116
4117.align 16
4118.Lcbc_dec_one:
4119 movaps $inout0,$in0
4120___
4121 &aesni_generate1("dec",$key,$rounds);
# Short tails, result collection and epilogue of the bulk-decrypt path.
#   .Lcbc_dec_one .. .Lcbc_dec_four   xor the IV / preceding ciphertext into
#       the decrypted blocks, store all but the last one, leave the last one
#       in $inout0 and make the last ciphertext block the new $iv.
#   .Lcbc_dec_clear_tail_collected    entry used by the 6x/8x loops; wipes
#       the $inout registers first (the upper four only when not on Win64,
#       where they are reloaded from the frame anyway).
#   .Lcbc_dec_tail_collected          writes $iv back through $ivp, then
#       stores the block in $inout0 either whole or, if the low four bits of
#       $len are non-zero, via .Lcbc_dec_tail_partial.
#   .Lcbc_dec_ret                     wipes $rndkey0/$rndkey1, restores the
#       Win64 xmm registers (zeroing their stack slots), restores %rbp from
#       -8(%r11) and %rsp from %r11.
4122$code.=<<___;
4123 xorps $iv,$inout0
4124 movaps $in0,$iv
4125 jmp .Lcbc_dec_tail_collected
4126.align 16
4127.Lcbc_dec_two:
4128 movaps $inout1,$in1
4129 call _aesni_decrypt2
4130 pxor $iv,$inout0
4131 movaps $in1,$iv
4132 pxor $in0,$inout1
4133 movdqu $inout0,($out)
4134 movdqa $inout1,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004135 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004136 lea 0x10($out),$out
4137 jmp .Lcbc_dec_tail_collected
4138.align 16
4139.Lcbc_dec_three:
4140 movaps $inout2,$in2
4141 call _aesni_decrypt3
4142 pxor $iv,$inout0
4143 movaps $in2,$iv
4144 pxor $in0,$inout1
4145 movdqu $inout0,($out)
4146 pxor $in1,$inout2
4147 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004148 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004149 movdqa $inout2,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004150 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004151 lea 0x20($out),$out
4152 jmp .Lcbc_dec_tail_collected
4153.align 16
4154.Lcbc_dec_four:
4155 movaps $inout3,$in3
4156 call _aesni_decrypt4
4157 pxor $iv,$inout0
4158 movaps $in3,$iv
4159 pxor $in0,$inout1
4160 movdqu $inout0,($out)
4161 pxor $in1,$inout2
4162 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004163 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004164 pxor $in2,$inout3
4165 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004166 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004167 movdqa $inout3,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004168 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08004169 lea 0x30($out),$out
4170 jmp .Lcbc_dec_tail_collected
4171
4172.align 16
Adam Langleye9ada862015-05-11 17:20:37 -07004173.Lcbc_dec_clear_tail_collected:
4174 pxor $inout1,$inout1 # clear register bank
4175 pxor $inout2,$inout2
4176 pxor $inout3,$inout3
4177___
4178$code.=<<___ if (!$win64);
4179 pxor $inout4,$inout4 # %xmm6..9
4180 pxor $inout5,$inout5
4181 pxor $inout6,$inout6
4182 pxor $inout7,$inout7
4183___
4184$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08004185.Lcbc_dec_tail_collected:
4186 movups $iv,($ivp)
4187 and \$15,$len
4188 jnz .Lcbc_dec_tail_partial
4189 movups $inout0,($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004190 pxor $inout0,$inout0
Adam Langleyd9e397b2015-01-22 14:27:53 -08004191 jmp .Lcbc_dec_ret
4192.align 16
4193.Lcbc_dec_tail_partial:
# The last block is parked in the scratch slot at the bottom of the frame,
# copied out bytewise, and the slot is then overwritten with zeros.
# NOTE(review): the byte count placed in rcx below is 16 minus (len & 15);
# confirm that this is the intended length for the partial store.
4194 movaps $inout0,(%rsp)
Adam Langleye9ada862015-05-11 17:20:37 -07004195 pxor $inout0,$inout0
Adam Langleyd9e397b2015-01-22 14:27:53 -08004196 mov \$16,%rcx
4197 mov $out,%rdi
4198 sub $len,%rcx
4199 lea (%rsp),%rsi
Adam Langleye9ada862015-05-11 17:20:37 -07004200 .long 0x9066A4F3 # rep movsb
4201 movdqa $inout0,(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004202
4203.Lcbc_dec_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07004204 xorps $rndkey0,$rndkey0 # %xmm0
4205 pxor $rndkey1,$rndkey1
Adam Langleyd9e397b2015-01-22 14:27:53 -08004206___
# Win64 only: reload %xmm6..%xmm15 from the frame and overwrite each slot
# with the zeroed %xmm0 so that no register contents remain on the stack.
4207$code.=<<___ if ($win64);
4208 movaps 0x10(%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -07004209 movaps %xmm0,0x10(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -08004210 movaps 0x20(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -07004211 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004212 movaps 0x30(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -07004213 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004214 movaps 0x40(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -07004215 movaps %xmm0,0x40(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004216 movaps 0x50(%rsp),%xmm10
Adam Langleye9ada862015-05-11 17:20:37 -07004217 movaps %xmm0,0x50(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004218 movaps 0x60(%rsp),%xmm11
Adam Langleye9ada862015-05-11 17:20:37 -07004219 movaps %xmm0,0x60(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004220 movaps 0x70(%rsp),%xmm12
Adam Langleye9ada862015-05-11 17:20:37 -07004221 movaps %xmm0,0x70(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004222 movaps 0x80(%rsp),%xmm13
Adam Langleye9ada862015-05-11 17:20:37 -07004223 movaps %xmm0,0x80(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004224 movaps 0x90(%rsp),%xmm14
Adam Langleye9ada862015-05-11 17:20:37 -07004225 movaps %xmm0,0x90(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004226 movaps 0xa0(%rsp),%xmm15
Adam Langleye9ada862015-05-11 17:20:37 -07004227 movaps %xmm0,0xa0(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004228___
4229$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08004230 mov -8(%r11),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07004231.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08004232 lea (%r11),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07004233.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08004234.Lcbc_ret:
4235 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07004236.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004237.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4238___
4239}
Adam Langleye9ada862015-05-11 17:20:37 -07004240# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
Adam Langleyd9e397b2015-01-22 14:27:53 -08004241# int bits, AES_KEY *key)
Adam Langleye9ada862015-05-11 17:20:37 -07004242#
4243# input: $inp user-supplied key
4244# $bits $inp length in bits
4245# $key pointer to key schedule
4246# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4247# *$key key schedule
4248#
Adam Langleyd9e397b2015-01-22 14:27:53 -08004249{ my ($inp,$bits,$key) = @_4args;
4250 $bits =~ s/%r/%e/;
4251
# ${PREFIX}_set_decrypt_key: builds a decryption key schedule in place.
#   1. Calls __aesni_set_encrypt_key to produce the encryption schedule; a
#      non-zero %eax from that call is returned unchanged.
#   2. Swaps the first and the last round keys as they are.
#   3. Walks inward from both ends, swapping each pair of round keys and
#      passing both through aesimc.
#   4. Passes the middle round key through aesimc in place.
#   5. Wipes %xmm0 and %xmm1 before returning.
# The 8-byte stack adjustment around the call is emitted as raw bytes;
# presumably it keeps the stack 16-byte aligned at the call site.
4252$code.=<<___;
4253.globl ${PREFIX}_set_decrypt_key
4254.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4255.align 16
4256${PREFIX}_set_decrypt_key:
Robert Sloanab8b8882018-03-26 11:39:51 -07004257.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004258 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
Robert Sloanab8b8882018-03-26 11:39:51 -07004259.cfi_adjust_cfa_offset 8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004260 call __aesni_set_encrypt_key
4261 shl \$4,$bits # rounds-1 after __aesni_set_encrypt_key
4262 test %eax,%eax
4263 jnz .Ldec_key_ret
4264 lea 16($key,$bits),$inp # points at the end of key schedule
4265
4266 $movkey ($key),%xmm0 # just swap
4267 $movkey ($inp),%xmm1
4268 $movkey %xmm0,($inp)
4269 $movkey %xmm1,($key)
4270 lea 16($key),$key
4271 lea -16($inp),$inp
4272
4273.Ldec_key_inverse:
4274 $movkey ($key),%xmm0 # swap and inverse
4275 $movkey ($inp),%xmm1
4276 aesimc %xmm0,%xmm0
4277 aesimc %xmm1,%xmm1
4278 lea 16($key),$key
4279 lea -16($inp),$inp
4280 $movkey %xmm0,16($inp)
4281 $movkey %xmm1,-16($key)
4282 cmp $key,$inp
4283 ja .Ldec_key_inverse
4284
4285 $movkey ($key),%xmm0 # inverse middle
4286 aesimc %xmm0,%xmm0
Adam Langleye9ada862015-05-11 17:20:37 -07004287 pxor %xmm1,%xmm1
Adam Langleyd9e397b2015-01-22 14:27:53 -08004288 $movkey %xmm0,($inp)
Adam Langleye9ada862015-05-11 17:20:37 -07004289 pxor %xmm0,%xmm0
Adam Langleyd9e397b2015-01-22 14:27:53 -08004290.Ldec_key_ret:
4291 add \$8,%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07004292.cfi_adjust_cfa_offset -8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004293 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07004294.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004295.LSEH_end_set_decrypt_key:
4296.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4297___
4298
Robert Sloanab8b8882018-03-26 11:39:51 -07004299# This is based on submission from Intel by
4300# Huang Ying
4301# Vinodh Gopal
Adam Langleyd9e397b2015-01-22 14:27:53 -08004302# Kahraman Akdemir
4303#
Robert Sloana94fe052017-02-21 08:49:28 -08004304# Aggressively optimized with respect to aeskeygenassist's critical path;
Adam Langleyd9e397b2015-01-22 14:27:53 -08004305# the code is confined to %xmm0-5 to meet the Win64 ABI requirement.
4306#
Adam Langleye9ada862015-05-11 17:20:37 -07004307# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4308# int bits, AES_KEY * const key);
4309#
4310# input: $inp user-supplied key
4311# $bits $inp length in bits
4312# $key pointer to key schedule
4313# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4314# $bits rounds-1 (used in aesni_set_decrypt_key)
4315# *$key key schedule
4316# $key pointer to key schedule (used in
4317# aesni_set_decrypt_key)
4318#
4319# Subroutine is frame-less, which means that only volatile registers
4320# are used. Note that it's declared "abi-omnipotent", which means that
4321# amount of volatile registers is smaller on Windows.
4322#
Adam Langleyd9e397b2015-01-22 14:27:53 -08004323$code.=<<___;
4324.globl ${PREFIX}_set_encrypt_key
4325.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4326.align 16
4327${PREFIX}_set_encrypt_key:
4328__aesni_set_encrypt_key:
Robert Sloanab8b8882018-03-26 11:39:51 -07004329.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004330 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
Robert Sloanab8b8882018-03-26 11:39:51 -07004331.cfi_adjust_cfa_offset 8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004332 mov \$-1,%rax
4333 test $inp,$inp
4334 jz .Lenc_key_ret
4335 test $key,$key
4336 jz .Lenc_key_ret
4337
4338 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4339 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
Robert Sloan2424d842017-05-01 07:46:28 -07004340 leaq OPENSSL_ia32cap_P(%rip),%r10
Robert Sloan572a4e22017-04-17 10:52:19 -07004341 movl 4(%r10),%r10d
4342 and \$`1<<28|1<<11`,%r10d # AVX and XOP bits
Adam Langleye9ada862015-05-11 17:20:37 -07004343 lea 16($key),%rax # %rax is used as modifiable copy of $key
Adam Langleyd9e397b2015-01-22 14:27:53 -08004344 cmp \$256,$bits
4345 je .L14rounds
4346 cmp \$192,$bits
4347 je .L12rounds
4348 cmp \$128,$bits
4349 jne .Lbad_keybits
4350
4351.L10rounds:
4352 mov \$9,$bits # 10 rounds for 128-bit key
Adam Langleye9ada862015-05-11 17:20:37 -07004353 cmp \$`1<<28`,%r10d # AVX, but no XOP
4354 je .L10rounds_alt
4355
Adam Langleyd9e397b2015-01-22 14:27:53 -08004356 $movkey %xmm0,($key) # round 0
4357 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4358 call .Lkey_expansion_128_cold
4359 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4360 call .Lkey_expansion_128
4361 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4362 call .Lkey_expansion_128
4363 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4364 call .Lkey_expansion_128
4365 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4366 call .Lkey_expansion_128
4367 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4368 call .Lkey_expansion_128
4369 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4370 call .Lkey_expansion_128
4371 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4372 call .Lkey_expansion_128
4373 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4374 call .Lkey_expansion_128
4375 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4376 call .Lkey_expansion_128
4377 $movkey %xmm0,(%rax)
4378 mov $bits,80(%rax) # 240(%rdx)
4379 xor %eax,%eax
4380 jmp .Lenc_key_ret
4381
4382.align 16
Adam Langleye9ada862015-05-11 17:20:37 -07004383.L10rounds_alt:
4384 movdqa .Lkey_rotate(%rip),%xmm5
4385 mov \$8,%r10d
4386 movdqa .Lkey_rcon1(%rip),%xmm4
4387 movdqa %xmm0,%xmm2
4388 movdqu %xmm0,($key)
4389 jmp .Loop_key128
4390
4391.align 16
4392.Loop_key128:
4393 pshufb %xmm5,%xmm0
4394 aesenclast %xmm4,%xmm0
4395 pslld \$1,%xmm4
4396 lea 16(%rax),%rax
4397
4398 movdqa %xmm2,%xmm3
4399 pslldq \$4,%xmm2
4400 pxor %xmm2,%xmm3
4401 pslldq \$4,%xmm2
4402 pxor %xmm2,%xmm3
4403 pslldq \$4,%xmm2
4404 pxor %xmm3,%xmm2
4405
4406 pxor %xmm2,%xmm0
4407 movdqu %xmm0,-16(%rax)
4408 movdqa %xmm0,%xmm2
4409
4410 dec %r10d
4411 jnz .Loop_key128
4412
4413 movdqa .Lkey_rcon1b(%rip),%xmm4
4414
4415 pshufb %xmm5,%xmm0
4416 aesenclast %xmm4,%xmm0
4417 pslld \$1,%xmm4
4418
4419 movdqa %xmm2,%xmm3
4420 pslldq \$4,%xmm2
4421 pxor %xmm2,%xmm3
4422 pslldq \$4,%xmm2
4423 pxor %xmm2,%xmm3
4424 pslldq \$4,%xmm2
4425 pxor %xmm3,%xmm2
4426
4427 pxor %xmm2,%xmm0
4428 movdqu %xmm0,(%rax)
4429
4430 movdqa %xmm0,%xmm2
4431 pshufb %xmm5,%xmm0
4432 aesenclast %xmm4,%xmm0
4433
4434 movdqa %xmm2,%xmm3
4435 pslldq \$4,%xmm2
4436 pxor %xmm2,%xmm3
4437 pslldq \$4,%xmm2
4438 pxor %xmm2,%xmm3
4439 pslldq \$4,%xmm2
4440 pxor %xmm3,%xmm2
4441
4442 pxor %xmm2,%xmm0
4443 movdqu %xmm0,16(%rax)
4444
4445 mov $bits,96(%rax) # 240($key)
4446 xor %eax,%eax
4447 jmp .Lenc_key_ret
4448
4449.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08004450.L12rounds:
4451 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4452 mov \$11,$bits # 12 rounds for 192
Adam Langleye9ada862015-05-11 17:20:37 -07004453 cmp \$`1<<28`,%r10d # AVX, but no XOP
4454 je .L12rounds_alt
4455
Adam Langleyd9e397b2015-01-22 14:27:53 -08004456 $movkey %xmm0,($key) # round 0
4457 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4458 call .Lkey_expansion_192a_cold
4459 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4460 call .Lkey_expansion_192b
4461 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4462 call .Lkey_expansion_192a
4463 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4464 call .Lkey_expansion_192b
4465 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4466 call .Lkey_expansion_192a
4467 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4468 call .Lkey_expansion_192b
4469 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4470 call .Lkey_expansion_192a
4471 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4472 call .Lkey_expansion_192b
4473 $movkey %xmm0,(%rax)
4474 mov $bits,48(%rax) # 240(%rdx)
4475 xor %rax, %rax
4476 jmp .Lenc_key_ret
4477
4478.align 16
Adam Langleye9ada862015-05-11 17:20:37 -07004479.L12rounds_alt:
4480 movdqa .Lkey_rotate192(%rip),%xmm5
4481 movdqa .Lkey_rcon1(%rip),%xmm4
4482 mov \$8,%r10d
4483 movdqu %xmm0,($key)
4484 jmp .Loop_key192
4485
4486.align 16
4487.Loop_key192:
4488 movq %xmm2,0(%rax)
4489 movdqa %xmm2,%xmm1
4490 pshufb %xmm5,%xmm2
4491 aesenclast %xmm4,%xmm2
4492 pslld \$1, %xmm4
4493 lea 24(%rax),%rax
4494
4495 movdqa %xmm0,%xmm3
4496 pslldq \$4,%xmm0
4497 pxor %xmm0,%xmm3
4498 pslldq \$4,%xmm0
4499 pxor %xmm0,%xmm3
4500 pslldq \$4,%xmm0
4501 pxor %xmm3,%xmm0
4502
4503 pshufd \$0xff,%xmm0,%xmm3
4504 pxor %xmm1,%xmm3
4505 pslldq \$4,%xmm1
4506 pxor %xmm1,%xmm3
4507
4508 pxor %xmm2,%xmm0
4509 pxor %xmm3,%xmm2
4510 movdqu %xmm0,-16(%rax)
4511
4512 dec %r10d
4513 jnz .Loop_key192
4514
4515 mov $bits,32(%rax) # 240($key)
4516 xor %eax,%eax
4517 jmp .Lenc_key_ret
4518
4519.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08004520.L14rounds:
Robert Sloanab8b8882018-03-26 11:39:51 -07004521 movups 16($inp),%xmm2 # remaining half of *userKey
Adam Langleyd9e397b2015-01-22 14:27:53 -08004522 mov \$13,$bits # 14 rounds for 256
4523 lea 16(%rax),%rax
Adam Langleye9ada862015-05-11 17:20:37 -07004524 cmp \$`1<<28`,%r10d # AVX, but no XOP
4525 je .L14rounds_alt
4526
Adam Langleyd9e397b2015-01-22 14:27:53 -08004527 $movkey %xmm0,($key) # round 0
4528 $movkey %xmm2,16($key) # round 1
4529 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4530 call .Lkey_expansion_256a_cold
4531 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4532 call .Lkey_expansion_256b
4533 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4534 call .Lkey_expansion_256a
4535 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4536 call .Lkey_expansion_256b
4537 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4538 call .Lkey_expansion_256a
4539 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4540 call .Lkey_expansion_256b
4541 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4542 call .Lkey_expansion_256a
4543 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4544 call .Lkey_expansion_256b
4545 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4546 call .Lkey_expansion_256a
4547 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4548 call .Lkey_expansion_256b
4549 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4550 call .Lkey_expansion_256a
4551 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4552 call .Lkey_expansion_256b
4553 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4554 call .Lkey_expansion_256a
4555 $movkey %xmm0,(%rax)
4556 mov $bits,16(%rax) # 240(%rdx)
4557 xor %rax,%rax
4558 jmp .Lenc_key_ret
4559
4560.align 16
Adam Langleye9ada862015-05-11 17:20:37 -07004561.L14rounds_alt:
4562 movdqa .Lkey_rotate(%rip),%xmm5
4563 movdqa .Lkey_rcon1(%rip),%xmm4
4564 mov \$7,%r10d
4565 movdqu %xmm0,0($key)
4566 movdqa %xmm2,%xmm1
4567 movdqu %xmm2,16($key)
4568 jmp .Loop_key256
4569
4570.align 16
4571.Loop_key256:
4572 pshufb %xmm5,%xmm2
4573 aesenclast %xmm4,%xmm2
4574
4575 movdqa %xmm0,%xmm3
4576 pslldq \$4,%xmm0
4577 pxor %xmm0,%xmm3
4578 pslldq \$4,%xmm0
4579 pxor %xmm0,%xmm3
4580 pslldq \$4,%xmm0
4581 pxor %xmm3,%xmm0
4582 pslld \$1,%xmm4
4583
4584 pxor %xmm2,%xmm0
4585 movdqu %xmm0,(%rax)
4586
4587 dec %r10d
4588 jz .Ldone_key256
4589
4590 pshufd \$0xff,%xmm0,%xmm2
4591 pxor %xmm3,%xmm3
4592 aesenclast %xmm3,%xmm2
4593
4594 movdqa %xmm1,%xmm3
4595 pslldq \$4,%xmm1
4596 pxor %xmm1,%xmm3
4597 pslldq \$4,%xmm1
4598 pxor %xmm1,%xmm3
4599 pslldq \$4,%xmm1
4600 pxor %xmm3,%xmm1
4601
4602 pxor %xmm1,%xmm2
4603 movdqu %xmm2,16(%rax)
4604 lea 32(%rax),%rax
4605 movdqa %xmm2,%xmm1
4606
4607 jmp .Loop_key256
4608
4609.Ldone_key256:
4610 mov $bits,16(%rax) # 240($key)
4611 xor %eax,%eax
4612 jmp .Lenc_key_ret
4613
4614.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08004615.Lbad_keybits:
4616 mov \$-2,%rax
4617.Lenc_key_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07004618 pxor %xmm0,%xmm0
4619 pxor %xmm1,%xmm1
4620 pxor %xmm2,%xmm2
4621 pxor %xmm3,%xmm3
4622 pxor %xmm4,%xmm4
4623 pxor %xmm5,%xmm5
Adam Langleyd9e397b2015-01-22 14:27:53 -08004624 add \$8,%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07004625.cfi_adjust_cfa_offset -8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004626 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07004627.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004628.LSEH_end_set_encrypt_key:
4629
4630.align 16
4631.Lkey_expansion_128:
4632 $movkey %xmm0,(%rax)
4633 lea 16(%rax),%rax
4634.Lkey_expansion_128_cold:
4635 shufps \$0b00010000,%xmm0,%xmm4
4636 xorps %xmm4, %xmm0
4637 shufps \$0b10001100,%xmm0,%xmm4
4638 xorps %xmm4, %xmm0
4639 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4640 xorps %xmm1,%xmm0
4641 ret
4642
4643.align 16
4644.Lkey_expansion_192a:
4645 $movkey %xmm0,(%rax)
4646 lea 16(%rax),%rax
4647.Lkey_expansion_192a_cold:
4648 movaps %xmm2, %xmm5
4649.Lkey_expansion_192b_warm:
4650 shufps \$0b00010000,%xmm0,%xmm4
4651 movdqa %xmm2,%xmm3
4652 xorps %xmm4,%xmm0
4653 shufps \$0b10001100,%xmm0,%xmm4
4654 pslldq \$4,%xmm3
4655 xorps %xmm4,%xmm0
4656 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4657 pxor %xmm3,%xmm2
4658 pxor %xmm1,%xmm0
4659 pshufd \$0b11111111,%xmm0,%xmm3
4660 pxor %xmm3,%xmm2
4661 ret
4662
4663.align 16
4664.Lkey_expansion_192b:
4665 movaps %xmm0,%xmm3
4666 shufps \$0b01000100,%xmm0,%xmm5
4667 $movkey %xmm5,(%rax)
4668 shufps \$0b01001110,%xmm2,%xmm3
4669 $movkey %xmm3,16(%rax)
4670 lea 32(%rax),%rax
4671 jmp .Lkey_expansion_192b_warm
4672
4673.align 16
4674.Lkey_expansion_256a:
4675 $movkey %xmm2,(%rax)
4676 lea 16(%rax),%rax
4677.Lkey_expansion_256a_cold:
4678 shufps \$0b00010000,%xmm0,%xmm4
4679 xorps %xmm4,%xmm0
4680 shufps \$0b10001100,%xmm0,%xmm4
4681 xorps %xmm4,%xmm0
4682 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4683 xorps %xmm1,%xmm0
4684 ret
4685
4686.align 16
4687.Lkey_expansion_256b:
4688 $movkey %xmm0,(%rax)
4689 lea 16(%rax),%rax
4690
4691 shufps \$0b00010000,%xmm2,%xmm4
4692 xorps %xmm4,%xmm2
4693 shufps \$0b10001100,%xmm2,%xmm4
4694 xorps %xmm4,%xmm2
4695 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4696 xorps %xmm1,%xmm2
4697 ret
4698.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4699.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4700___
4701}
4702
# Read-only constant table appended to the generated assembly, followed by
# the module's identification string.
#
# Within the visible key-schedule code, .Lkey_rotate and .Lkey_rotate192 are
# loaded as the pshufb mask, and .Lkey_rcon1 / .Lkey_rcon1b as the aesenclast
# operand, of the "_alt" key-expansion loops (.Loop_key128, .Loop_key192,
# .Loop_key256).  .Lkey_rcon1 is doubled each iteration with pslld \$1;
# .Lkey_rcon1b (0x1b) is loaded after the eight .Loop_key128 iterations for
# the final two 128-bit rounds.
# NOTE(review): .Lbswap_mask, .Lincrement32, .Lincrement64, .Lxts_magic and
# .Lincrement1 are referenced from parts of the file outside this chunk --
# presumably the CTR/XTS/CCM code paths; confirm before changing them.
4703$code.=<<___;
4704.align 64
4705.Lbswap_mask:
4706 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4707.Lincrement32:
4708 .long 6,6,6,0
4709.Lincrement64:
4710 .long 1,0,0,0
4711.Lxts_magic:
4712 .long 0x87,0,1,0
4713.Lincrement1:
4714 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Adam Langleye9ada862015-05-11 17:20:37 -07004715.Lkey_rotate:
4716 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4717.Lkey_rotate192:
4718 .long 0x04070605,0x04070605,0x04070605,0x04070605
4719.Lkey_rcon1:
4720 .long 1,1,1,1
4721.Lkey_rcon1b:
4722 .long 0x1b,0x1b,0x1b,0x1b
Adam Langleyd9e397b2015-01-22 14:27:53 -08004723
4724.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4725.align 64
4726___
4727
4728# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4729# CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 only: append the structured-exception (SEH) language handlers and the
# .pdata/.xdata unwind tables to $code.
#
# $rec, $frame, $context and $disp name the registers that carry the four
# handler arguments (EXCEPTION_RECORD *rec, ULONG64 frame, CONTEXT *context,
# DISPATCHER_CONTEXT *disp).  Only $context and $disp are interpolated into
# the handlers below.
#
# Every handler follows the same outline: save the callee-saved registers and
# flags, read context->Rip, compare it with the function's prologue/epilogue
# labels, copy any %xmm registers the function spilled back into
# context.Xmm6.., and then fall or jump into .Lcommon_seh_tail.  The tail
# stores the recovered Rsp/Rsi/Rdi in the context, copies the context to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch (1).
4730if ($win64) {
4731$rec="%rcx";
4732$frame="%rdx";
4733$context="%r8";
4734$disp="%r9";
4735
# Import slot used by the `call *__imp_RtlVirtualUnwind(%rip)` in the tail.
4736$code.=<<___;
4737.extern __imp_RtlVirtualUnwind
4738___
# Handlers emitted only when $PREFIX is "aesni": ecb_ccm64_se_handler,
# ctr_xts_se_handler and ocb_se_handler.  Each reads its prologue/epilogue
# label RVAs from disp->HandlerData (HandlerData[0], HandlerData[1]);
# ocb_se_handler additionally reads HandlerData[2] (the "pop" label) to decide
# whether the %xmm save area is still on the stack.  All three finish by
# jumping to .Lcommon_seh_tail, which is defined inside cbc_se_handler below.
4739$code.=<<___ if ($PREFIX eq "aesni");
4740.type ecb_ccm64_se_handler,\@abi-omnipotent
4741.align 16
4742ecb_ccm64_se_handler:
4743 push %rsi
4744 push %rdi
4745 push %rbx
4746 push %rbp
4747 push %r12
4748 push %r13
4749 push %r14
4750 push %r15
4751 pushfq
4752 sub \$64,%rsp
4753
4754 mov 120($context),%rax # pull context->Rax
4755 mov 248($context),%rbx # pull context->Rip
4756
4757 mov 8($disp),%rsi # disp->ImageBase
4758 mov 56($disp),%r11 # disp->HandlerData
4759
4760 mov 0(%r11),%r10d # HandlerData[0]
4761 lea (%rsi,%r10),%r10 # prologue label
4762 cmp %r10,%rbx # context->Rip<prologue label
4763 jb .Lcommon_seh_tail
4764
4765 mov 152($context),%rax # pull context->Rsp
4766
4767 mov 4(%r11),%r10d # HandlerData[1]
4768 lea (%rsi,%r10),%r10 # epilogue label
4769 cmp %r10,%rbx # context->Rip>=epilogue label
4770 jae .Lcommon_seh_tail
4771
4772 lea 0(%rax),%rsi # %xmm save area
4773 lea 512($context),%rdi # &context.Xmm6
4774 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4775 .long 0xa548f3fc # cld; rep movsq
4776 lea 0x58(%rax),%rax # adjust stack pointer
4777
4778 jmp .Lcommon_seh_tail
4779.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4780
4781.type ctr_xts_se_handler,\@abi-omnipotent
4782.align 16
4783ctr_xts_se_handler:
4784 push %rsi
4785 push %rdi
4786 push %rbx
4787 push %rbp
4788 push %r12
4789 push %r13
4790 push %r14
4791 push %r15
4792 pushfq
4793 sub \$64,%rsp
4794
4795 mov 120($context),%rax # pull context->Rax
4796 mov 248($context),%rbx # pull context->Rip
4797
4798 mov 8($disp),%rsi # disp->ImageBase
4799 mov 56($disp),%r11 # disp->HandlerData
4800
4801 mov 0(%r11),%r10d # HandlerData[0]
4802 lea (%rsi,%r10),%r10 # prologue lable
4803 cmp %r10,%rbx # context->Rip<prologue label
4804 jb .Lcommon_seh_tail
4805
4806 mov 152($context),%rax # pull context->Rsp
4807
4808 mov 4(%r11),%r10d # HandlerData[1]
4809 lea (%rsi,%r10),%r10 # epilogue label
4810 cmp %r10,%rbx # context->Rip>=epilogue label
4811 jae .Lcommon_seh_tail
4812
Robert Sloana94fe052017-02-21 08:49:28 -08004813 mov 208($context),%rax # pull context->R11
4814
4815 lea -0xa8(%rax),%rsi # %xmm save area
Adam Langleyd9e397b2015-01-22 14:27:53 -08004816 lea 512($context),%rdi # & context.Xmm6
4817 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4818 .long 0xa548f3fc # cld; rep movsq
4819
Robert Sloana94fe052017-02-21 08:49:28 -08004820 mov -8(%rax),%rbp # restore saved %rbp
4821 mov %rbp,160($context) # restore context->Rbp
4822 jmp .Lcommon_seh_tail
Adam Langleyd9e397b2015-01-22 14:27:53 -08004823.size ctr_xts_se_handler,.-ctr_xts_se_handler
Robert Sloana94fe052017-02-21 08:49:28 -08004824
4825.type ocb_se_handler,\@abi-omnipotent
4826.align 16
4827ocb_se_handler:
4828 push %rsi
4829 push %rdi
4830 push %rbx
4831 push %rbp
4832 push %r12
4833 push %r13
4834 push %r14
4835 push %r15
4836 pushfq
4837 sub \$64,%rsp
4838
4839 mov 120($context),%rax # pull context->Rax
4840 mov 248($context),%rbx # pull context->Rip
4841
4842 mov 8($disp),%rsi # disp->ImageBase
4843 mov 56($disp),%r11 # disp->HandlerData
4844
4845 mov 0(%r11),%r10d # HandlerData[0]
4846 lea (%rsi,%r10),%r10 # prologue lable
4847 cmp %r10,%rbx # context->Rip<prologue label
4848 jb .Lcommon_seh_tail
4849
4850 mov 4(%r11),%r10d # HandlerData[1]
4851 lea (%rsi,%r10),%r10 # epilogue label
4852 cmp %r10,%rbx # context->Rip>=epilogue label
4853 jae .Lcommon_seh_tail
4854
4855 mov 8(%r11),%r10d # HandlerData[2]
4856 lea (%rsi,%r10),%r10
4857 cmp %r10,%rbx # context->Rip>=pop label
4858 jae .Locb_no_xmm
4859
4860 mov 152($context),%rax # pull context->Rsp
4861
4862 lea (%rax),%rsi # %xmm save area
4863 lea 512($context),%rdi # & context.Xmm6
4864 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4865 .long 0xa548f3fc # cld; rep movsq
4866 lea 0xa0+0x28(%rax),%rax
4867
4868.Locb_no_xmm:
4869 mov -8(%rax),%rbx
4870 mov -16(%rax),%rbp
4871 mov -24(%rax),%r12
4872 mov -32(%rax),%r13
4873 mov -40(%rax),%r14
4874
4875 mov %rbx,144($context) # restore context->Rbx
4876 mov %rbp,160($context) # restore context->Rbp
4877 mov %r12,216($context) # restore context->R12
4878 mov %r13,224($context) # restore context->R13
4879 mov %r14,232($context) # restore context->R14
4880
4881 jmp .Lcommon_seh_tail
4882.size ocb_se_handler,.-ocb_se_handler
Adam Langleyd9e397b2015-01-22 14:27:53 -08004883___
# Emitted for every $PREFIX: cbc_se_handler, which compares context->Rip with
# the fixed labels .Lcbc_decrypt_bulk, .Lcbc_decrypt_body and .Lcbc_ret
# instead of HandlerData[], and which contains the shared .Lcommon_seh_tail.
# After the handler the output switches to the .pdata section.
4884$code.=<<___;
4885.type cbc_se_handler,\@abi-omnipotent
4886.align 16
4887cbc_se_handler:
4888 push %rsi
4889 push %rdi
4890 push %rbx
4891 push %rbp
4892 push %r12
4893 push %r13
4894 push %r14
4895 push %r15
4896 pushfq
4897 sub \$64,%rsp
4898
4899 mov 152($context),%rax # pull context->Rsp
4900 mov 248($context),%rbx # pull context->Rip
4901
Adam Langleye9ada862015-05-11 17:20:37 -07004902 lea .Lcbc_decrypt_bulk(%rip),%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08004903 cmp %r10,%rbx # context->Rip<"prologue" label
4904 jb .Lcommon_seh_tail
4905
Robert Sloana94fe052017-02-21 08:49:28 -08004906 mov 120($context),%rax # pull context->Rax
4907
Adam Langleyd9e397b2015-01-22 14:27:53 -08004908 lea .Lcbc_decrypt_body(%rip),%r10
4909 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
Robert Sloana94fe052017-02-21 08:49:28 -08004910 jb .Lcommon_seh_tail
4911
4912 mov 152($context),%rax # pull context->Rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08004913
4914 lea .Lcbc_ret(%rip),%r10
4915 cmp %r10,%rbx # context->Rip>="epilogue" label
4916 jae .Lcommon_seh_tail
4917
4918 lea 16(%rax),%rsi # %xmm save area
4919 lea 512($context),%rdi # &context.Xmm6
4920 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4921 .long 0xa548f3fc # cld; rep movsq
4922
Robert Sloana94fe052017-02-21 08:49:28 -08004923 mov 208($context),%rax # pull context->R11
Adam Langleyd9e397b2015-01-22 14:27:53 -08004924
Robert Sloana94fe052017-02-21 08:49:28 -08004925 mov -8(%rax),%rbp # restore saved %rbp
4926 mov %rbp,160($context) # restore context->Rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08004927
4928.Lcommon_seh_tail:
4929 mov 8(%rax),%rdi
4930 mov 16(%rax),%rsi
4931 mov %rax,152($context) # restore context->Rsp
4932 mov %rsi,168($context) # restore context->Rsi
4933 mov %rdi,176($context) # restore context->Rdi
4934
4935 mov 40($disp),%rdi # disp->ContextRecord
4936 mov $context,%rsi # context
4937 mov \$154,%ecx # sizeof(CONTEXT)
4938 .long 0xa548f3fc # cld; rep movsq
4939
4940 mov $disp,%rsi
4941 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4942 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4943 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4944 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4945 mov 40(%rsi),%r10 # disp->ContextRecord
4946 lea 56(%rsi),%r11 # &disp->HandlerData
4947 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4948 mov %r10,32(%rsp) # arg5
4949 mov %r11,40(%rsp) # arg6
4950 mov %r12,48(%rsp) # arg7
4951 mov %rcx,56(%rsp) # arg8, (NULL)
4952 call *__imp_RtlVirtualUnwind(%rip)
4953
4954 mov \$1,%eax # ExceptionContinueSearch
4955 add \$64,%rsp
4956 popfq
4957 pop %r15
4958 pop %r14
4959 pop %r13
4960 pop %r12
4961 pop %rbp
4962 pop %rbx
4963 pop %rdi
4964 pop %rsi
4965 ret
4966.size cbc_se_handler,.-cbc_se_handler
4967
4968.section .pdata
4969.align 4
4970___
# .pdata entries for the "aesni"-only functions.  Each entry is three RVAs:
# function begin label, function end label, and the .xdata unwind-info record.
4971$code.=<<___ if ($PREFIX eq "aesni");
4972 .rva .LSEH_begin_aesni_ecb_encrypt
4973 .rva .LSEH_end_aesni_ecb_encrypt
4974 .rva .LSEH_info_ecb
4975
4976 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
4977 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
4978 .rva .LSEH_info_ccm64_enc
4979
4980 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
4981 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
4982 .rva .LSEH_info_ccm64_dec
4983
4984 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
4985 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
4986 .rva .LSEH_info_ctr32
4987
4988 .rva .LSEH_begin_aesni_xts_encrypt
4989 .rva .LSEH_end_aesni_xts_encrypt
4990 .rva .LSEH_info_xts_enc
4991
4992 .rva .LSEH_begin_aesni_xts_decrypt
4993 .rva .LSEH_end_aesni_xts_decrypt
4994 .rva .LSEH_info_xts_dec
Robert Sloana94fe052017-02-21 08:49:28 -08004995
4996 .rva .LSEH_begin_aesni_ocb_encrypt
4997 .rva .LSEH_end_aesni_ocb_encrypt
4998 .rva .LSEH_info_ocb_enc
4999
5000 .rva .LSEH_begin_aesni_ocb_decrypt
5001 .rva .LSEH_end_aesni_ocb_decrypt
5002 .rva .LSEH_info_ocb_dec
Adam Langleyd9e397b2015-01-22 14:27:53 -08005003___
# .pdata entries emitted for every $PREFIX: CBC plus the two key-setup
# routines (both key-setup entries share .LSEH_info_key).  The output then
# switches to the .xdata section.
5004$code.=<<___;
5005 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5006 .rva .LSEH_end_${PREFIX}_cbc_encrypt
5007 .rva .LSEH_info_cbc
5008
5009 .rva ${PREFIX}_set_decrypt_key
5010 .rva .LSEH_end_set_decrypt_key
5011 .rva .LSEH_info_key
5012
5013 .rva ${PREFIX}_set_encrypt_key
5014 .rva .LSEH_end_set_encrypt_key
5015 .rva .LSEH_info_key
5016.section .xdata
5017.align 8
5018___
# .xdata unwind-info records for the "aesni"-only functions.  Each record
# names its handler and then lists the label RVAs that the handler reads back
# as HandlerData[] (body/prologue label first, return/epilogue label second;
# the OCB records add the "pop" label and a zero word).
5019$code.=<<___ if ($PREFIX eq "aesni");
5020.LSEH_info_ecb:
5021 .byte 9,0,0,0
5022 .rva ecb_ccm64_se_handler
5023 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
5024.LSEH_info_ccm64_enc:
5025 .byte 9,0,0,0
5026 .rva ecb_ccm64_se_handler
5027 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5028.LSEH_info_ccm64_dec:
5029 .byte 9,0,0,0
5030 .rva ecb_ccm64_se_handler
5031 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
5032.LSEH_info_ctr32:
5033 .byte 9,0,0,0
5034 .rva ctr_xts_se_handler
5035 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
5036.LSEH_info_xts_enc:
5037 .byte 9,0,0,0
5038 .rva ctr_xts_se_handler
5039 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5040.LSEH_info_xts_dec:
5041 .byte 9,0,0,0
5042 .rva ctr_xts_se_handler
5043 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
Robert Sloana94fe052017-02-21 08:49:28 -08005044.LSEH_info_ocb_enc:
5045 .byte 9,0,0,0
5046 .rva ocb_se_handler
5047 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5048 .rva .Locb_enc_pop
5049 .long 0
5050.LSEH_info_ocb_dec:
5051 .byte 9,0,0,0
5052 .rva ocb_se_handler
5053 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
5054 .rva .Locb_dec_pop
5055 .long 0
Adam Langleyd9e397b2015-01-22 14:27:53 -08005056___
# .xdata records emitted for every $PREFIX.  .LSEH_info_cbc carries no
# HandlerData[] because cbc_se_handler uses fixed labels.  .LSEH_info_key has
# no handler; it holds a single unwind code annotated "sub rsp,8".
5057$code.=<<___;
5058.LSEH_info_cbc:
5059 .byte 9,0,0,0
5060 .rva cbc_se_handler
5061.LSEH_info_key:
5062 .byte 0x01,0x04,0x01,0x00
5063 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
5064___
5065}
5066
# rex(\@opcode, $dst, $src) -- append a REX prefix byte when one is needed.
#
#   \@opcode  reference to the byte list being assembled (modified in place)
#   $dst      register number that goes into the ModR/M reg field
#   $src      register number that goes into the ModR/M r/m field
#
# Registers 8..15 need an extension bit: 0x04 (REX.R) for $dst, 0x01 (REX.B)
# for $src.  If neither register is >= 8 nothing is pushed.
#
# The array is reached through an ordinary reference rather than by aliasing
# a package typeglob (`local *opcode=shift`), so the sub no longer touches
# the global symbol table and would also compile under `use strict`.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);
    $rex |= 0x01 if ($src >= 8);
    push @$opcode, $rex | 0x40 if ($rex);
}
5076
# aesni($line) -- hand-encode one AES-NI instruction as a ".byte" directive
# (presumably so the output also assembles with tools that do not know the
# AES-NI mnemonics).
#
# $line is the instruction text handed over by the s///gem driver near the
# end of the file.  Three operand forms are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD     (8-bit displacement)
#
# Returns ".byte\t" followed by the comma-separated (decimal) encoding for a
# recognised instruction, and $line unchanged for everything else.
#
# An unrecognised mnemonic used to return undef.  The s///e driver replaces
# the matched text with the return value, so undef silently deleted the
# instruction from the generated assembly.  Such lines are now passed through
# verbatim so the assembler either encodes them itself or reports them.
sub aesni {
    my $line = shift;
    my @opcode = (0x66);    # prefix byte common to every encoding below

    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures first: the /^0/ test below runs another match.
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # oct() handles both "0x.." and leading-zero octal immediates.
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Unknown mnemonic: leave the line for the assembler.
        return $line if (!defined($opcodelet{$mnemonic}));

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Unknown mnemonic (this table has no aesimc): leave the line as is.
        return $line if (!defined($opcodelet{$mnemonic}));

        push @opcode, 0x44 if ($dst >= 8);    # REX prefix for %xmm8..15
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;    # ModR/M, SIB
        push @opcode, ($off =~ /^0/ ? oct($off) : $off) & 0xff;
        return ".byte\t" . join(',', @opcode);
    }
    return $line;
}
5116
# movbe($disp) -- byte-encode the one movbe form the driver below rewrites,
# "movbe %eax,$disp(%rsp)".  $disp is appended as-is as the final byte of
# the directive.
sub movbe {
    my ($disp) = @_;
    return ".byte 0x0f,0x38,0xf1,0x44,0x24," . $disp;
}
5120
# Post-process the accumulated assembly text and write it out.

# Replace every `...` span with the value of the enclosed Perl expression
# (e.g. the `1<<28` immediates in the key-setup code).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand the text from an "aes..." mnemonic up to its last %xmm operand to
# aesni(); the rest of the line (trailing comment) is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# Byte-encode "movbe %eax,N(%rsp)" via movbe().
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors only surface when the handle is closed, so a failed
# close must abort; otherwise a truncated assembly file would be left behind
# with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";