#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
Adam Langleyd9e397b2015-01-22 14:27:53 -08009#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
28# asymptotic limit it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
#         16-byte     64-byte     256-byte    1-KB        8-KB
# ECB     4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR     5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC     4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM     5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB     5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB     5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
Robert Sloanab8b8882018-03-26 11:39:51 -070063# "straightforward" CCM implementation with CTR and CBC-MAC performed
Adam Langleyd9e397b2015-01-22 14:27:53 -080064# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
Robert Sloana94fe052017-02-21 08:49:28 -0800121# instructions' interleave factor. Westmere can execute at most 3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800122# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
# instruction interleave factor         3x      6x      8x
# theoretical asymptotic limit          1.67    0.83    0.625
# measured performance for 8KB block    1.05    0.86    0.84
#
# "as if" interleave factor             4.7x    5.8x    6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt                           1.16    0.93    0.74
# CTR                                   1.14    0.91    0.74
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
Robert Sloanab8b8882018-03-26 11:39:51 -0700146# additional instructions with AES ones, but even AES instructions
Adam Langleyd9e397b2015-01-22 14:27:53 -0800147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
Robert Sloanab8b8882018-03-26 11:39:51 -0700153# utilizes 6x interleave because of limited register bank capacity.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
Robert Sloana94fe052017-02-21 08:49:28 -0800167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
Adam Langleyd9e397b2015-01-22 14:27:53 -0800172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
#               CBC en-/decrypt CTR     XTS     ECB     OCB
# Westmere      3.77/1.25       1.25    1.25    1.26
# * Bridge      5.07/0.74       0.75    0.90    0.85    0.98
# Haswell       4.44/0.63       0.63    0.73    0.63    0.70
# Skylake       2.62/0.63       0.63    0.63    0.63
# Silvermont    5.75/3.54       3.56    4.12    3.87(*) 4.11
# Knights L     2.54/0.77       0.78    0.85    -       1.50
# Goldmont      3.82/1.26       1.26    1.29    1.29    1.50
# Bulldozer     5.77/0.70       0.72    0.90    0.70    0.95
# Ryzen         2.71/0.35       0.35    0.44    0.38    0.49
Adam Langleyd9e397b2015-01-22 14:27:53 -0800186#
Adam Langleye9ada862015-05-11 17:20:37 -0700187# (*) Atom Silvermont ECB result is suboptimal because of penalties
188# incurred by operations on %xmm8-15. As ECB is not considered
Adam Langleyd9e397b2015-01-22 14:27:53 -0800189# critical, nothing was done to mitigate the problem.
190
# ---------------------------------------------------------------------
# Script setup.
#
# Command line:  <this script> [flavour] output
#
# The generated perlasm text is accumulated in $code and is piped
# through x86_64-xlate.pl (located next to this script or under
# ../../../perlasm/), which is started with the same perl binary ($^X).
# ---------------------------------------------------------------------

$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
# A single argument containing a dot is taken to be the output file
# name, in which case no flavour was given.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected either by flavour or by a
# ".asm" output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's own path (including the trailing
# separator), used to find the translator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on goes to the translator.
# Fail loudly if the pipe cannot be started instead of silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys; both arms of the conditional
# currently select the same mnemonic.
$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern OPENSSL_ia32cap_P\n";
# ---------------------------------------------------------------------
# Register assignments shared by the code generators below.
#
# NOTE: the interleaved _aesni_[en|de]cryptN generators hard-code %rax
# when negating/advancing the round counter, so $rounds is expected to
# remain %eax.
# ---------------------------------------------------------------------
$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# These alias $inout4..$inout7 (same physical registers).
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";
236
# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p, $key, $rounds [, $inout [, $ivec]])
#
#   $p      - instruction suffix, "enc" or "dec"; emits aes${p} and
#             aes${p}last.
#   $key    - register pointing at the key schedule; the emitted code
#             advances it (it is not preserved).
#   $rounds - register holding the loop count; the emitted code
#             decrements it until zero (it is not preserved).
#   $inout  - register holding the block to process in place; defaults
#             to $inout0 when omitted.
#   $ivec   - optional register. When given, the first round key is
#             xor-ed into $ivec and $ivec is then xor-ed into $inout,
#             so $ivec is modified by the emitted code.
#
# The emitted code also overwrites $rndkey0 and $rndkey1. Every call
# bumps the private counter $sn so that each inlined copy gets its own
# loop label.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the single-block ${PREFIX}_encrypt and ${PREFIX}_decrypt entry
# points. Both are declared \@abi-omnipotent and take their three
# arguments from the first three registers of @_4args (Win64 or Unix
# order, as selected at start-up). The round count is read from offset
# 240 of the key structure, the block is processed with the inlined
# aesni_generate1 sequence, and the round-key and data registers are
# zeroed before returning.
#
# NOTE(review): only the encrypt entry point carries the
# BORINGSSL_function_hit marker (guarded by NDEBUG/BORINGSSL_FIPS);
# the decrypt entry point has none - presumably intentional, confirm
# against the C-side test hooks.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
.cfi_startproc
#ifndef NDEBUG
#ifndef BORINGSSL_FIPS
.extern	BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+1(%rip)
#endif
#endif
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	 pxor	$rndkey0,$rndkey0	# clear register bank
	 pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	 pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
.cfi_startproc
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	 pxor	$rndkey0,$rndkey0	# clear register bank
	 pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	 pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}
316
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals the corresponding instruction's latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...
Adam Langleyd9e397b2015-01-22 14:27:53 -0800329
# aesni_generate2($dir)
#
# Appends the private 2-block routine _aesni_${dir}rypt2 to $code.
# $dir is "enc" or "dec" and selects aes${dir}/aes${dir}last.
#
# The emitted routine works on $inout0..$inout1 in place and uses the
# file-level $key/$rounds registers, neither of which is preserved;
# $rndkey0/$rndkey1 are overwritten as well.
#
# Loop structure of the emitted code: $rounds is scaled by 16, $key is
# moved forward by 32 plus that amount, and %rax (the 64-bit view of
# $rounds) becomes a negative offset that is stepped up by 32 per
# iteration. Each iteration applies two round keys, and the loop falls
# through when the "add" brings %rax to zero.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# aesni_generate3($dir)
#
# Appends the private 3-block routine _aesni_${dir}rypt3 to $code.
# $dir is "enc" or "dec" and selects aes${dir}/aes${dir}last.
#
# Same structure as the 2-block routine, with a third data register:
# the emitted code works on $inout0..$inout2 in place, does not
# preserve $key or $rounds, and overwrites $rndkey0/$rndkey1. %rax is
# used as the 64-bit view of $rounds for the negative key offset.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir)
#
# Appends the private 4-block routine _aesni_${dir}rypt4 to $code.
# $dir is "enc" or "dec" and selects aes${dir}/aes${dir}last. The
# emitted code works on $inout0..$inout3 in place, does not preserve
# $key or $rounds, and overwrites $rndkey0/$rndkey1. Unlike the 2x/3x
# routines it emits a raw 3-byte sequence (.byte 0x0f,0x1f,0x00)
# ahead of the loop - presumably a multi-byte NOP used as padding.
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.cfi_endproc
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# aesni_generate6($dir)
#
# Appends the private 6-block routine _aesni_${dir}rypt6 to $code.
# $dir is "enc" or "dec" and selects aes${dir}/aes${dir}last. The
# emitted code works on $inout0..$inout5 in place, does not preserve
# $key or $rounds, and overwrites $rndkey0/$rndkey1.
#
# Unlike the 2x-4x routines, the prologue interleaves the initial
# key xor with the first aes${dir} round for $inout0..$inout2 and then
# jumps into the middle of the loop (.L${dir}_loop6_enter), where the
# same round is applied to $inout3..$inout5.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout5
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.cfi_endproc
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# aesni_generate8($dir)
#
# Appends the private 8-block routine _aesni_${dir}rypt8 to $code.
# $dir is "enc" or "dec" and selects aes${dir}/aes${dir}last. The
# emitted code works on $inout0..$inout7 in place, does not preserve
# $key or $rounds, and overwrites $rndkey0/$rndkey1.
#
# The prologue interleaves the initial key xor with the first
# aes${dir} round for $inout0..$inout1 and then jumps into the loop at
# .L${dir}_loop8_inner, where that round is applied to the remaining
# six registers. The .L${dir}_loop8_enter label is emitted but is not
# referenced from within this routine.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor	$rndkey0,$inout5
	pxor	$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout7
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.cfi_endproc
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Emit the private interleaved helpers. The decrypt variants are always
# generated; the encrypt variants only when $PREFIX is "aes_hw".
&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aes_hw");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aes_hw");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aes_hw");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aes_hw");
&aesni_generate8("dec");
610
Adam Vartanianbfcf3a72018-08-10 14:55:24 +0100611if ($PREFIX eq "aes_hw") {
Adam Langleyd9e397b2015-01-22 14:27:53 -0800612########################################################################
613# void aesni_ecb_encrypt (const void *in, void *out,
614# size_t length, const AES_KEY *key,
615# int enc);
616$code.=<<___;
Adam Vartanianbfcf3a72018-08-10 14:55:24 +0100617.globl ${PREFIX}_ecb_encrypt
618.type ${PREFIX}_ecb_encrypt,\@function,5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800619.align 16
Adam Vartanianbfcf3a72018-08-10 14:55:24 +0100620${PREFIX}_ecb_encrypt:
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800621.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800622___
623$code.=<<___ if ($win64);
624 lea -0x58(%rsp),%rsp
Adam Langleye9ada862015-05-11 17:20:37 -0700625 movaps %xmm6,(%rsp) # offload $inout4..7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800626 movaps %xmm7,0x10(%rsp)
627 movaps %xmm8,0x20(%rsp)
628 movaps %xmm9,0x30(%rsp)
629.Lecb_enc_body:
630___
631$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -0700632 and \$-16,$len # if ($len<16)
633 jz .Lecb_ret # return
Adam Langleyd9e397b2015-01-22 14:27:53 -0800634
635 mov 240($key),$rounds # key->rounds
636 $movkey ($key),$rndkey0
637 mov $key,$key_ # backup $key
638 mov $rounds,$rnds_ # backup $rounds
639 test %r8d,%r8d # 5th argument
640 jz .Lecb_decrypt
641#--------------------------- ECB ENCRYPT ------------------------------#
Adam Langleye9ada862015-05-11 17:20:37 -0700642 cmp \$0x80,$len # if ($len<8*16)
643 jb .Lecb_enc_tail # short input
Adam Langleyd9e397b2015-01-22 14:27:53 -0800644
Adam Langleye9ada862015-05-11 17:20:37 -0700645 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800646 movdqu 0x10($inp),$inout1
647 movdqu 0x20($inp),$inout2
648 movdqu 0x30($inp),$inout3
649 movdqu 0x40($inp),$inout4
650 movdqu 0x50($inp),$inout5
651 movdqu 0x60($inp),$inout6
652 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700653 lea 0x80($inp),$inp # $inp+=8*16
654 sub \$0x80,$len # $len-=8*16 (can be zero)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800655 jmp .Lecb_enc_loop8_enter
656.align 16
657.Lecb_enc_loop8:
Adam Langleye9ada862015-05-11 17:20:37 -0700658 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800659 mov $key_,$key # restore $key
Adam Langleye9ada862015-05-11 17:20:37 -0700660 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800661 mov $rnds_,$rounds # restore $rounds
662 movups $inout1,0x10($out)
663 movdqu 0x10($inp),$inout1
664 movups $inout2,0x20($out)
665 movdqu 0x20($inp),$inout2
666 movups $inout3,0x30($out)
667 movdqu 0x30($inp),$inout3
668 movups $inout4,0x40($out)
669 movdqu 0x40($inp),$inout4
670 movups $inout5,0x50($out)
671 movdqu 0x50($inp),$inout5
672 movups $inout6,0x60($out)
673 movdqu 0x60($inp),$inout6
674 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700675 lea 0x80($out),$out # $out+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800676 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700677 lea 0x80($inp),$inp # $inp+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800678.Lecb_enc_loop8_enter:
679
680 call _aesni_encrypt8
681
682 sub \$0x80,$len
Adam Langleye9ada862015-05-11 17:20:37 -0700683 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -0800684
Adam Langleye9ada862015-05-11 17:20:37 -0700685 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800686 mov $key_,$key # restore $key
687 movups $inout1,0x10($out)
688 mov $rnds_,$rounds # restore $rounds
689 movups $inout2,0x20($out)
690 movups $inout3,0x30($out)
691 movups $inout4,0x40($out)
692 movups $inout5,0x50($out)
693 movups $inout6,0x60($out)
694 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700695 lea 0x80($out),$out # $out+=8*16
696 add \$0x80,$len # restore real remaining $len
697 jz .Lecb_ret # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800698
Adam Langleye9ada862015-05-11 17:20:37 -0700699.Lecb_enc_tail: # $len is less than 8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800700 movups ($inp),$inout0
701 cmp \$0x20,$len
702 jb .Lecb_enc_one
703 movups 0x10($inp),$inout1
704 je .Lecb_enc_two
705 movups 0x20($inp),$inout2
706 cmp \$0x40,$len
707 jb .Lecb_enc_three
708 movups 0x30($inp),$inout3
709 je .Lecb_enc_four
710 movups 0x40($inp),$inout4
711 cmp \$0x60,$len
712 jb .Lecb_enc_five
713 movups 0x50($inp),$inout5
714 je .Lecb_enc_six
715 movdqu 0x60($inp),$inout6
Adam Langleye9ada862015-05-11 17:20:37 -0700716 xorps $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800717 call _aesni_encrypt8
Adam Langleye9ada862015-05-11 17:20:37 -0700718 movups $inout0,($out) # store 7 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800719 movups $inout1,0x10($out)
720 movups $inout2,0x20($out)
721 movups $inout3,0x30($out)
722 movups $inout4,0x40($out)
723 movups $inout5,0x50($out)
724 movups $inout6,0x60($out)
725 jmp .Lecb_ret
726.align 16
727.Lecb_enc_one:
728___
729 &aesni_generate1("enc",$key,$rounds);
730$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -0700731 movups $inout0,($out) # store one output block
Adam Langleyd9e397b2015-01-22 14:27:53 -0800732 jmp .Lecb_ret
733.align 16
734.Lecb_enc_two:
735 call _aesni_encrypt2
Adam Langleye9ada862015-05-11 17:20:37 -0700736 movups $inout0,($out) # store 2 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800737 movups $inout1,0x10($out)
738 jmp .Lecb_ret
739.align 16
740.Lecb_enc_three:
741 call _aesni_encrypt3
Adam Langleye9ada862015-05-11 17:20:37 -0700742 movups $inout0,($out) # store 3 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800743 movups $inout1,0x10($out)
744 movups $inout2,0x20($out)
745 jmp .Lecb_ret
746.align 16
747.Lecb_enc_four:
748 call _aesni_encrypt4
Adam Langleye9ada862015-05-11 17:20:37 -0700749 movups $inout0,($out) # store 4 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800750 movups $inout1,0x10($out)
751 movups $inout2,0x20($out)
752 movups $inout3,0x30($out)
753 jmp .Lecb_ret
754.align 16
755.Lecb_enc_five:
756 xorps $inout5,$inout5
757 call _aesni_encrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700758 movups $inout0,($out) # store 5 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800759 movups $inout1,0x10($out)
760 movups $inout2,0x20($out)
761 movups $inout3,0x30($out)
762 movups $inout4,0x40($out)
763 jmp .Lecb_ret
764.align 16
765.Lecb_enc_six:
766 call _aesni_encrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700767 movups $inout0,($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800768 movups $inout1,0x10($out)
769 movups $inout2,0x20($out)
770 movups $inout3,0x30($out)
771 movups $inout4,0x40($out)
772 movups $inout5,0x50($out)
773 jmp .Lecb_ret
774 #--------------------------- ECB DECRYPT ------------------------------#
775.align 16
776.Lecb_decrypt:
Adam Langleye9ada862015-05-11 17:20:37 -0700777 cmp \$0x80,$len # if ($len<8*16)
778 jb .Lecb_dec_tail # short input
Adam Langleyd9e397b2015-01-22 14:27:53 -0800779
Adam Langleye9ada862015-05-11 17:20:37 -0700780 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800781 movdqu 0x10($inp),$inout1
782 movdqu 0x20($inp),$inout2
783 movdqu 0x30($inp),$inout3
784 movdqu 0x40($inp),$inout4
785 movdqu 0x50($inp),$inout5
786 movdqu 0x60($inp),$inout6
787 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700788 lea 0x80($inp),$inp # $inp+=8*16
789 sub \$0x80,$len # $len-=8*16 (can be zero)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800790 jmp .Lecb_dec_loop8_enter
791.align 16
792.Lecb_dec_loop8:
Adam Langleye9ada862015-05-11 17:20:37 -0700793 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800794 mov $key_,$key # restore $key
Adam Langleye9ada862015-05-11 17:20:37 -0700795 movdqu ($inp),$inout0 # load 8 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -0800796 mov $rnds_,$rounds # restore $rounds
797 movups $inout1,0x10($out)
798 movdqu 0x10($inp),$inout1
799 movups $inout2,0x20($out)
800 movdqu 0x20($inp),$inout2
801 movups $inout3,0x30($out)
802 movdqu 0x30($inp),$inout3
803 movups $inout4,0x40($out)
804 movdqu 0x40($inp),$inout4
805 movups $inout5,0x50($out)
806 movdqu 0x50($inp),$inout5
807 movups $inout6,0x60($out)
808 movdqu 0x60($inp),$inout6
809 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700810 lea 0x80($out),$out # $out+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800811 movdqu 0x70($inp),$inout7
Adam Langleye9ada862015-05-11 17:20:37 -0700812 lea 0x80($inp),$inp # $inp+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -0800813.Lecb_dec_loop8_enter:
814
815 call _aesni_decrypt8
816
817 $movkey ($key_),$rndkey0
818 sub \$0x80,$len
Adam Langleye9ada862015-05-11 17:20:37 -0700819 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -0800820
Adam Langleye9ada862015-05-11 17:20:37 -0700821 movups $inout0,($out) # store 8 output blocks
822 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800823 mov $key_,$key # restore $key
824 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700825 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800826 mov $rnds_,$rounds # restore $rounds
827 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700828 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800829 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700830 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800831 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700832 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -0800833 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700834 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800835 movups $inout6,0x60($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700836 pxor $inout6,$inout6
Adam Langleyd9e397b2015-01-22 14:27:53 -0800837 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700838 pxor $inout7,$inout7
839 lea 0x80($out),$out # $out+=8*16
840 add \$0x80,$len # restore real remaining $len
841 jz .Lecb_ret # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800842
843.Lecb_dec_tail:
844 movups ($inp),$inout0
845 cmp \$0x20,$len
846 jb .Lecb_dec_one
847 movups 0x10($inp),$inout1
848 je .Lecb_dec_two
849 movups 0x20($inp),$inout2
850 cmp \$0x40,$len
851 jb .Lecb_dec_three
852 movups 0x30($inp),$inout3
853 je .Lecb_dec_four
854 movups 0x40($inp),$inout4
855 cmp \$0x60,$len
856 jb .Lecb_dec_five
857 movups 0x50($inp),$inout5
858 je .Lecb_dec_six
859 movups 0x60($inp),$inout6
860 $movkey ($key),$rndkey0
Adam Langleye9ada862015-05-11 17:20:37 -0700861 xorps $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800862 call _aesni_decrypt8
Adam Langleye9ada862015-05-11 17:20:37 -0700863 movups $inout0,($out) # store 7 output blocks
864 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800865 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700866 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800867 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700868 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800869 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700870 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800871 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700872 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -0800873 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700874 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800875 movups $inout6,0x60($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700876 pxor $inout6,$inout6
877 pxor $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -0800878 jmp .Lecb_ret
879.align 16
880.Lecb_dec_one:
881___
882 &aesni_generate1("dec",$key,$rounds);
883$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -0700884 movups $inout0,($out) # store one output block
885 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800886 jmp .Lecb_ret
887.align 16
888.Lecb_dec_two:
889 call _aesni_decrypt2
Adam Langleye9ada862015-05-11 17:20:37 -0700890 movups $inout0,($out) # store 2 output blocks
891 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800892 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700893 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800894 jmp .Lecb_ret
895.align 16
896.Lecb_dec_three:
897 call _aesni_decrypt3
Adam Langleye9ada862015-05-11 17:20:37 -0700898 movups $inout0,($out) # store 3 output blocks
899 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800900 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700901 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800902 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700903 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800904 jmp .Lecb_ret
905.align 16
906.Lecb_dec_four:
907 call _aesni_decrypt4
Adam Langleye9ada862015-05-11 17:20:37 -0700908 movups $inout0,($out) # store 4 output blocks
909 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800910 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700911 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800912 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700913 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800914 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700915 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800916 jmp .Lecb_ret
917.align 16
918.Lecb_dec_five:
919 xorps $inout5,$inout5
920 call _aesni_decrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700921 movups $inout0,($out) # store 5 output blocks
922 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800923 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700924 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800925 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700926 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800927 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700928 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800929 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700930 pxor $inout4,$inout4
931 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800932 jmp .Lecb_ret
933.align 16
934.Lecb_dec_six:
935 call _aesni_decrypt6
Adam Langleye9ada862015-05-11 17:20:37 -0700936 movups $inout0,($out) # store 6 output blocks
937 pxor $inout0,$inout0 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -0800938 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700939 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800940 movups $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700941 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800942 movups $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700943 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800944 movups $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700945 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -0800946 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -0700947 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -0800948
949.Lecb_ret:
Adam Langleye9ada862015-05-11 17:20:37 -0700950 xorps $rndkey0,$rndkey0 # %xmm0
951 pxor $rndkey1,$rndkey1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800952___
953$code.=<<___ if ($win64);
954 movaps (%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -0700955 movaps %xmm0,(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -0800956 movaps 0x10(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -0700957 movaps %xmm0,0x10(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800958 movaps 0x20(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -0700959 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800960 movaps 0x30(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -0700961 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800962 lea 0x58(%rsp),%rsp
963.Lecb_enc_ret:
964___
965$code.=<<___;
966 ret
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800967.cfi_endproc
Adam Vartanianbfcf3a72018-08-10 14:55:24 +0100968.size ${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt
Adam Langleyd9e397b2015-01-22 14:27:53 -0800969___
970
971{
972######################################################################
973# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
974# size_t blocks, const AES_KEY *key,
975# const char *ivec,char *cmac);
976#
977# Handles only complete blocks, operates on 64-bit counter and
978# does not update *ivec! Nor does it finalize CMAC value
979# (see engine/eng_aesni.c for details)
980#
Robert Sloan4c22c5f2019-03-01 15:53:37 -0800981if (0) { # Omit these functions in BoringSSL
# Everything up to the matching closing brace is guarded by `if (0)`, so none
# of the here-docs in this section is ever appended to $code.  The text is
# kept so the file stays close to the OpenSSL original.
#
# Register aliases used only by the CCM64 routines:
#   $cmac       - %r9, the 6th argument (pointer the MAC is loaded from and
#                 stored back to)
#   $increment  - %xmm9, loaded from .Lincrement64 and added to $iv per block
#   $iv         - %xmm6, the counter block
#   $bswap_mask - %xmm7, loaded from .Lbswap_mask and used with pshufb
# %xmm6-%xmm9 are saved and restored by the Win64-only prologue/epilogue below.
Adam Langleyd9e397b2015-01-22 14:27:53 -0800982my $cmac="%r9"; # 6th argument
983
984my $increment="%xmm9";
985my $iv="%xmm6";
986my $bswap_mask="%xmm7";
987
# --- ${PREFIX}_ccm64_encrypt_blocks: symbol declaration and entry label ---
988$code.=<<___;
Adam Vartanianbfcf3a72018-08-10 14:55:24 +0100989.globl ${PREFIX}_ccm64_encrypt_blocks
990.type ${PREFIX}_ccm64_encrypt_blocks,\@function,6
Adam Langleyd9e397b2015-01-22 14:27:53 -0800991.align 16
Adam Vartanianbfcf3a72018-08-10 14:55:24 +0100992${PREFIX}_ccm64_encrypt_blocks:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800993___
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9 into it.
994$code.=<<___ if ($win64);
995 lea -0x58(%rsp),%rsp
Adam Langleye9ada862015-05-11 17:20:37 -0700996 movaps %xmm6,(%rsp) # $iv
997 movaps %xmm7,0x10(%rsp) # $bswap_mask
998 movaps %xmm8,0x20(%rsp) # $in0
999 movaps %xmm9,0x30(%rsp) # $increment
Adam Langleyd9e397b2015-01-22 14:27:53 -08001000.Lccm64_enc_body:
1001___
# Encrypt body.  Setup loads key->rounds, the counter block, the MAC and the
# two constants, and points $key at the end of the key schedule.
# Each pass of .Lccm64_enc_outer handles one 16-byte block:
#   * $inout0 (counter) and $inout1 (MAC, after cmac^=inp) go through the
#     same round keys side by side in .Lccm64_enc2_loop;
#   * the encrypted counter is XORed into the input and stored to ($out);
#   * $iv is advanced by $increment and $len (in blocks) is decremented;
#     the jnz at the bottom of the outer loop is commented as testing $len!=0.
# After the loop the MAC is written back to ($cmac) and the working registers
# are zeroed.
1002$code.=<<___;
1003 mov 240($key),$rounds # key->rounds
1004 movdqu ($ivp),$iv
1005 movdqa .Lincrement64(%rip),$increment
1006 movdqa .Lbswap_mask(%rip),$bswap_mask
1007
1008 shl \$4,$rounds
1009 mov \$16,$rnds_
1010 lea 0($key),$key_
1011 movdqu ($cmac),$inout1
1012 movdqa $iv,$inout0
1013 lea 32($key,$rounds),$key # end of key schedule
1014 pshufb $bswap_mask,$iv
1015 sub %rax,%r10 # twisted $rounds
1016 jmp .Lccm64_enc_outer
1017.align 16
1018.Lccm64_enc_outer:
1019 $movkey ($key_),$rndkey0
1020 mov %r10,%rax
1021 movups ($inp),$in0 # load inp
1022
1023 xorps $rndkey0,$inout0 # counter
1024 $movkey 16($key_),$rndkey1
1025 xorps $in0,$rndkey0
1026 xorps $rndkey0,$inout1 # cmac^=inp
1027 $movkey 32($key_),$rndkey0
1028
1029.Lccm64_enc2_loop:
1030 aesenc $rndkey1,$inout0
1031 aesenc $rndkey1,$inout1
1032 $movkey ($key,%rax),$rndkey1
1033 add \$32,%rax
1034 aesenc $rndkey0,$inout0
1035 aesenc $rndkey0,$inout1
1036 $movkey -16($key,%rax),$rndkey0
1037 jnz .Lccm64_enc2_loop
1038 aesenc $rndkey1,$inout0
1039 aesenc $rndkey1,$inout1
1040 paddq $increment,$iv
Adam Langleye9ada862015-05-11 17:20:37 -07001041 dec $len # $len-- ($len is in blocks)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001042 aesenclast $rndkey0,$inout0
1043 aesenclast $rndkey0,$inout1
1044
1045 lea 16($inp),$inp
1046 xorps $inout0,$in0 # inp ^= E(iv)
1047 movdqa $iv,$inout0
1048 movups $in0,($out) # save output
1049 pshufb $bswap_mask,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07001050 lea 16($out),$out # $out+=16
1051 jnz .Lccm64_enc_outer # loop if ($len!=0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001052
Adam Langleye9ada862015-05-11 17:20:37 -07001053 pxor $rndkey0,$rndkey0 # clear register bank
1054 pxor $rndkey1,$rndkey1
1055 pxor $inout0,$inout0
1056 movups $inout1,($cmac) # store resulting mac
1057 pxor $inout1,$inout1
1058 pxor $in0,$in0
1059 pxor $iv,$iv
Adam Langleyd9e397b2015-01-22 14:27:53 -08001060___
# Win64 only: reload %xmm6-%xmm9 from the save area, overwrite each slot with
# %xmm0 (annotated "clear stack"; $rndkey0 was zeroed just above and is
# annotated as %xmm0 in the ECB epilogue), then release the 0x58-byte frame.
1061$code.=<<___ if ($win64);
1062 movaps (%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -07001063 movaps %xmm0,(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -08001064 movaps 0x10(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -07001065 movaps %xmm0,0x10(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001066 movaps 0x20(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -07001067 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001068 movaps 0x30(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -07001069 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001070 lea 0x58(%rsp),%rsp
1071.Lccm64_enc_ret:
1072___
# Return and close the encrypt symbol.
1073$code.=<<___;
1074 ret
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001075.size ${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001076___
1077######################################################################
# --- ${PREFIX}_ccm64_decrypt_blocks: symbol declaration and entry label ---
1078$code.=<<___;
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001079.globl ${PREFIX}_ccm64_decrypt_blocks
1080.type ${PREFIX}_ccm64_decrypt_blocks,\@function,6
Adam Langleyd9e397b2015-01-22 14:27:53 -08001081.align 16
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001082${PREFIX}_ccm64_decrypt_blocks:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001083___
# Win64 only: same 0x58-byte frame and %xmm6-%xmm9 spill as the encrypt side.
# NOTE(review): the "$in8" annotation on the %xmm8 slot differs from the
# encrypt prologue, which labels the same slot "$in0" - confirm which is meant.
1084$code.=<<___ if ($win64);
1085 lea -0x58(%rsp),%rsp
Adam Langleye9ada862015-05-11 17:20:37 -07001086 movaps %xmm6,(%rsp) # $iv
1087 movaps %xmm7,0x10(%rsp) # $bswap_mask
1088 movaps %xmm8,0x20(%rsp) # $in8
1089 movaps %xmm9,0x30(%rsp) # $increment
Adam Langleyd9e397b2015-01-22 14:27:53 -08001090.Lccm64_dec_body:
1091___
# Decrypt setup: load key->rounds, the counter block, the MAC and the two
# constants; copy $iv into $inout0, remember $rounds/$key in $rnds_/$key_,
# and byte-swap $iv with $bswap_mask.
1092$code.=<<___;
1093 mov 240($key),$rounds # key->rounds
1094 movups ($ivp),$iv
1095 movdqu ($cmac),$inout1
1096 movdqa .Lincrement64(%rip),$increment
1097 movdqa .Lbswap_mask(%rip),$bswap_mask
1098
1099 movaps $iv,$inout0
1100 mov $rounds,$rnds_
1101 mov $key,$key_
1102 pshufb $bswap_mask,$iv
1103___
# aesni_generate1 is defined outside this section; presumably it emits a
# single-block encryption of $inout0 (the first counter block) - confirm.
1104 &aesni_generate1("enc",$key,$rounds);
# Decrypt loop.  Each pass of .Lccm64_dec_outer first produces the output
# block (inp ^= E(iv)) and stores it, then decrements $len (in blocks).
# If blocks remain, the output is folded into the MAC (cmac^=out) and the next
# counter ($inout0) and the MAC ($inout1) go through .Lccm64_dec2_loop side by
# side while the next input block is loaded.  On the last block control jumps
# to .Lccm64_dec_break, which reloads key->rounds from $key_.
1105$code.=<<___;
1106 shl \$4,$rnds_
1107 mov \$16,$rounds
1108 movups ($inp),$in0 # load inp
1109 paddq $increment,$iv
Adam Langleye9ada862015-05-11 17:20:37 -07001110 lea 16($inp),$inp # $inp+=16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001111 sub %r10,%rax # twisted $rounds
1112 lea 32($key_,$rnds_),$key # end of key schedule
1113 mov %rax,%r10
1114 jmp .Lccm64_dec_outer
1115.align 16
1116.Lccm64_dec_outer:
1117 xorps $inout0,$in0 # inp ^= E(iv)
1118 movdqa $iv,$inout0
1119 movups $in0,($out) # save output
Adam Langleye9ada862015-05-11 17:20:37 -07001120 lea 16($out),$out # $out+=16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001121 pshufb $bswap_mask,$inout0
1122
Adam Langleye9ada862015-05-11 17:20:37 -07001123 sub \$1,$len # $len-- ($len is in blocks)
1124 jz .Lccm64_dec_break # if ($len==0) break
Adam Langleyd9e397b2015-01-22 14:27:53 -08001125
1126 $movkey ($key_),$rndkey0
1127 mov %r10,%rax
1128 $movkey 16($key_),$rndkey1
1129 xorps $rndkey0,$in0
1130 xorps $rndkey0,$inout0
1131 xorps $in0,$inout1 # cmac^=out
1132 $movkey 32($key_),$rndkey0
1133 jmp .Lccm64_dec2_loop
1134.align 16
1135.Lccm64_dec2_loop:
1136 aesenc $rndkey1,$inout0
1137 aesenc $rndkey1,$inout1
1138 $movkey ($key,%rax),$rndkey1
1139 add \$32,%rax
1140 aesenc $rndkey0,$inout0
1141 aesenc $rndkey0,$inout1
1142 $movkey -16($key,%rax),$rndkey0
1143 jnz .Lccm64_dec2_loop
Adam Langleye9ada862015-05-11 17:20:37 -07001144 movups ($inp),$in0 # load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001145 paddq $increment,$iv
1146 aesenc $rndkey1,$inout0
1147 aesenc $rndkey1,$inout1
1148 aesenclast $rndkey0,$inout0
1149 aesenclast $rndkey0,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001150 lea 16($inp),$inp # $inp+=16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001151 jmp .Lccm64_dec_outer
1152
1153.align 16
1154.Lccm64_dec_break:
1155 #xorps $in0,$inout1 # cmac^=out
1156 mov 240($key_),$rounds
1157___
# Final MAC update for the last block.  NOTE(review): with the explicit
# "cmac^=out" xorps commented out above, the extra ($inout1,$in0) arguments
# presumably make aesni_generate1 fold $in0 into $inout1 as part of the
# single-block encryption - confirm against its definition.
1158 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Store the resulting MAC to ($cmac) and zero the working registers.
1159$code.=<<___;
Adam Langleye9ada862015-05-11 17:20:37 -07001160 pxor $rndkey0,$rndkey0 # clear register bank
1161 pxor $rndkey1,$rndkey1
1162 pxor $inout0,$inout0
1163 movups $inout1,($cmac) # store resulting mac
1164 pxor $inout1,$inout1
1165 pxor $in0,$in0
1166 pxor $iv,$iv
Adam Langleyd9e397b2015-01-22 14:27:53 -08001167___
# Win64 only: restore %xmm6-%xmm9, overwrite the save slots with %xmm0
# (annotated "clear stack"), and release the 0x58-byte frame.
1168$code.=<<___ if ($win64);
1169 movaps (%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -07001170 movaps %xmm0,(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -08001171 movaps 0x10(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -07001172 movaps %xmm0,0x10(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001173 movaps 0x20(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -07001174 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001175 movaps 0x30(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -07001176 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001177 lea 0x58(%rsp),%rsp
1178.Lccm64_dec_ret:
1179___
# Return and close the decrypt symbol.
1180$code.=<<___;
1181 ret
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001182.size ${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001183___
1184}
1185######################################################################
1186# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1187# size_t blocks, const AES_KEY *key,
1188# const char *ivec);
1189#
1190# Handles only complete blocks, operates on 32-bit counter and
1191# does not update *ivec! (see crypto/modes/ctr128.c for details)
1192#
1193# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1194# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1195# Keywords are full unroll and modulo-schedule counter calculations
1196# with zero-round key xor.
1197{
1198my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
Robert Sloana94fe052017-02-21 08:49:28 -08001199my ($key0,$ctr)=("%ebp","${ivp}d");
Adam Langleyd9e397b2015-01-22 14:27:53 -08001200my $frame_size = 0x80 + ($win64?160:0);
1201
1202$code.=<<___;
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001203.globl ${PREFIX}_ctr32_encrypt_blocks
1204.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5
Adam Langleyd9e397b2015-01-22 14:27:53 -08001205.align 16
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001206${PREFIX}_ctr32_encrypt_blocks:
Robert Sloanab8b8882018-03-26 11:39:51 -07001207.cfi_startproc
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001208#ifndef NDEBUG
1209#ifndef BORINGSSL_FIPS
1210 movb \$1,BORINGSSL_function_hit(%rip)
1211#endif
1212#endif
Adam Langleye9ada862015-05-11 17:20:37 -07001213 cmp \$1,$len
1214 jne .Lctr32_bulk
1215
1216 # handle single block without allocating stack frame,
1217 # useful when handling edges
1218 movups ($ivp),$inout0
1219 movups ($inp),$inout1
1220 mov 240($key),%edx # key->rounds
1221___
1222 &aesni_generate1("enc",$key,"%edx");
1223$code.=<<___;
1224 pxor $rndkey0,$rndkey0 # clear register bank
1225 pxor $rndkey1,$rndkey1
1226 xorps $inout1,$inout0
1227 pxor $inout1,$inout1
1228 movups $inout0,($out)
1229 xorps $inout0,$inout0
1230 jmp .Lctr32_epilogue
1231
1232.align 16
1233.Lctr32_bulk:
Robert Sloana94fe052017-02-21 08:49:28 -08001234 lea (%rsp),$key_ # use $key_ as frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07001235.cfi_def_cfa_register $key_
Adam Langleyd9e397b2015-01-22 14:27:53 -08001236 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07001237.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001238 sub \$$frame_size,%rsp
1239 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1240___
1241$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001242 movaps %xmm6,-0xa8($key_) # offload everything
1243 movaps %xmm7,-0x98($key_)
1244 movaps %xmm8,-0x88($key_)
1245 movaps %xmm9,-0x78($key_)
1246 movaps %xmm10,-0x68($key_)
1247 movaps %xmm11,-0x58($key_)
1248 movaps %xmm12,-0x48($key_)
1249 movaps %xmm13,-0x38($key_)
1250 movaps %xmm14,-0x28($key_)
1251 movaps %xmm15,-0x18($key_)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001252.Lctr32_body:
1253___
1254$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08001255
Adam Langleye9ada862015-05-11 17:20:37 -07001256 # 8 16-byte words on top of stack are counter values
1257 # xor-ed with zero-round key
Adam Langleyd9e397b2015-01-22 14:27:53 -08001258
1259 movdqu ($ivp),$inout0
1260 movdqu ($key),$rndkey0
1261 mov 12($ivp),$ctr # counter LSB
1262 pxor $rndkey0,$inout0
1263 mov 12($key),$key0 # 0-round key LSB
1264 movdqa $inout0,0x00(%rsp) # populate counter block
1265 bswap $ctr
1266 movdqa $inout0,$inout1
1267 movdqa $inout0,$inout2
1268 movdqa $inout0,$inout3
1269 movdqa $inout0,0x40(%rsp)
1270 movdqa $inout0,0x50(%rsp)
1271 movdqa $inout0,0x60(%rsp)
Adam Langleye9ada862015-05-11 17:20:37 -07001272 mov %rdx,%r10 # about to borrow %rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001273 movdqa $inout0,0x70(%rsp)
1274
1275 lea 1($ctr),%rax
1276 lea 2($ctr),%rdx
1277 bswap %eax
1278 bswap %edx
1279 xor $key0,%eax
1280 xor $key0,%edx
1281 pinsrd \$3,%eax,$inout1
1282 lea 3($ctr),%rax
1283 movdqa $inout1,0x10(%rsp)
1284 pinsrd \$3,%edx,$inout2
1285 bswap %eax
1286 mov %r10,%rdx # restore %rdx
1287 lea 4($ctr),%r10
1288 movdqa $inout2,0x20(%rsp)
1289 xor $key0,%eax
1290 bswap %r10d
1291 pinsrd \$3,%eax,$inout3
1292 xor $key0,%r10d
1293 movdqa $inout3,0x30(%rsp)
1294 lea 5($ctr),%r9
1295 mov %r10d,0x40+12(%rsp)
1296 bswap %r9d
1297 lea 6($ctr),%r10
1298 mov 240($key),$rounds # key->rounds
1299 xor $key0,%r9d
1300 bswap %r10d
1301 mov %r9d,0x50+12(%rsp)
1302 xor $key0,%r10d
1303 lea 7($ctr),%r9
1304 mov %r10d,0x60+12(%rsp)
1305 bswap %r9d
Robert Sloan2424d842017-05-01 07:46:28 -07001306 leaq OPENSSL_ia32cap_P(%rip),%r10
Robert Sloan572a4e22017-04-17 10:52:19 -07001307 mov 4(%r10),%r10d
Adam Langleyd9e397b2015-01-22 14:27:53 -08001308 xor $key0,%r9d
1309 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1310 mov %r9d,0x70+12(%rsp)
1311
1312 $movkey 0x10($key),$rndkey1
1313
1314 movdqa 0x40(%rsp),$inout4
1315 movdqa 0x50(%rsp),$inout5
1316
Adam Langleye9ada862015-05-11 17:20:37 -07001317 cmp \$8,$len # $len is in blocks
1318 jb .Lctr32_tail # short input if ($len<8)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001319
Adam Langleye9ada862015-05-11 17:20:37 -07001320 sub \$6,$len # $len is biased by -6
Adam Langleyd9e397b2015-01-22 14:27:53 -08001321 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
Adam Langleye9ada862015-05-11 17:20:37 -07001322 je .Lctr32_6x # [which denotes Atom Silvermont]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001323
1324 lea 0x80($key),$key # size optimization
Adam Langleye9ada862015-05-11 17:20:37 -07001325 sub \$2,$len # $len is biased by -8
Adam Langleyd9e397b2015-01-22 14:27:53 -08001326 jmp .Lctr32_loop8
1327
1328.align 16
1329.Lctr32_6x:
1330 shl \$4,$rounds
1331 mov \$48,$rnds_
1332 bswap $key0
1333 lea 32($key,$rounds),$key # end of key schedule
1334 sub %rax,%r10 # twisted $rounds
1335 jmp .Lctr32_loop6
1336
1337.align 16
1338.Lctr32_loop6:
Adam Langleye9ada862015-05-11 17:20:37 -07001339 add \$6,$ctr # next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001340 $movkey -48($key,$rnds_),$rndkey0
1341 aesenc $rndkey1,$inout0
1342 mov $ctr,%eax
1343 xor $key0,%eax
1344 aesenc $rndkey1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001345 movbe %eax,`0x00+12`(%rsp) # store next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001346 lea 1($ctr),%eax
1347 aesenc $rndkey1,$inout2
1348 xor $key0,%eax
1349 movbe %eax,`0x10+12`(%rsp)
1350 aesenc $rndkey1,$inout3
1351 lea 2($ctr),%eax
1352 xor $key0,%eax
1353 aesenc $rndkey1,$inout4
1354 movbe %eax,`0x20+12`(%rsp)
1355 lea 3($ctr),%eax
1356 aesenc $rndkey1,$inout5
1357 $movkey -32($key,$rnds_),$rndkey1
1358 xor $key0,%eax
1359
1360 aesenc $rndkey0,$inout0
1361 movbe %eax,`0x30+12`(%rsp)
1362 lea 4($ctr),%eax
1363 aesenc $rndkey0,$inout1
1364 xor $key0,%eax
1365 movbe %eax,`0x40+12`(%rsp)
1366 aesenc $rndkey0,$inout2
1367 lea 5($ctr),%eax
1368 xor $key0,%eax
1369 aesenc $rndkey0,$inout3
1370 movbe %eax,`0x50+12`(%rsp)
1371 mov %r10,%rax # mov $rnds_,$rounds
1372 aesenc $rndkey0,$inout4
1373 aesenc $rndkey0,$inout5
1374 $movkey -16($key,$rnds_),$rndkey0
1375
1376 call .Lenc_loop6
1377
Adam Langleye9ada862015-05-11 17:20:37 -07001378 movdqu ($inp),$inout6 # load 6 input blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001379 movdqu 0x10($inp),$inout7
1380 movdqu 0x20($inp),$in0
1381 movdqu 0x30($inp),$in1
1382 movdqu 0x40($inp),$in2
1383 movdqu 0x50($inp),$in3
Adam Langleye9ada862015-05-11 17:20:37 -07001384 lea 0x60($inp),$inp # $inp+=6*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001385 $movkey -64($key,$rnds_),$rndkey1
Adam Langleye9ada862015-05-11 17:20:37 -07001386 pxor $inout0,$inout6 # inp^=E(ctr)
1387 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001388 pxor $inout1,$inout7
1389 movaps 0x10(%rsp),$inout1
1390 pxor $inout2,$in0
1391 movaps 0x20(%rsp),$inout2
1392 pxor $inout3,$in1
1393 movaps 0x30(%rsp),$inout3
1394 pxor $inout4,$in2
1395 movaps 0x40(%rsp),$inout4
1396 pxor $inout5,$in3
1397 movaps 0x50(%rsp),$inout5
Adam Langleye9ada862015-05-11 17:20:37 -07001398 movdqu $inout6,($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001399 movdqu $inout7,0x10($out)
1400 movdqu $in0,0x20($out)
1401 movdqu $in1,0x30($out)
1402 movdqu $in2,0x40($out)
1403 movdqu $in3,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001404 lea 0x60($out),$out # $out+=6*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001405
Adam Langleye9ada862015-05-11 17:20:37 -07001406 sub \$6,$len
1407 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1408
1409 add \$6,$len # restore real remaining $len
1410 jz .Lctr32_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001411
1412 lea -48($rnds_),$rounds
1413 lea -80($key,$rnds_),$key # restore $key
1414 neg $rounds
1415 shr \$4,$rounds # restore $rounds
1416 jmp .Lctr32_tail
1417
1418.align 32
1419.Lctr32_loop8:
Adam Langleye9ada862015-05-11 17:20:37 -07001420 add \$8,$ctr # next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001421 movdqa 0x60(%rsp),$inout6
1422 aesenc $rndkey1,$inout0
1423 mov $ctr,%r9d
1424 movdqa 0x70(%rsp),$inout7
1425 aesenc $rndkey1,$inout1
1426 bswap %r9d
1427 $movkey 0x20-0x80($key),$rndkey0
1428 aesenc $rndkey1,$inout2
1429 xor $key0,%r9d
1430 nop
1431 aesenc $rndkey1,$inout3
Adam Langleye9ada862015-05-11 17:20:37 -07001432 mov %r9d,0x00+12(%rsp) # store next counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08001433 lea 1($ctr),%r9
1434 aesenc $rndkey1,$inout4
1435 aesenc $rndkey1,$inout5
1436 aesenc $rndkey1,$inout6
1437 aesenc $rndkey1,$inout7
1438 $movkey 0x30-0x80($key),$rndkey1
1439___
1440for($i=2;$i<8;$i++) {
1441my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1442$code.=<<___;
1443 bswap %r9d
1444 aesenc $rndkeyx,$inout0
1445 aesenc $rndkeyx,$inout1
1446 xor $key0,%r9d
1447 .byte 0x66,0x90
1448 aesenc $rndkeyx,$inout2
1449 aesenc $rndkeyx,$inout3
1450 mov %r9d,`0x10*($i-1)`+12(%rsp)
1451 lea $i($ctr),%r9
1452 aesenc $rndkeyx,$inout4
1453 aesenc $rndkeyx,$inout5
1454 aesenc $rndkeyx,$inout6
1455 aesenc $rndkeyx,$inout7
1456 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1457___
1458}
1459$code.=<<___;
1460 bswap %r9d
1461 aesenc $rndkey0,$inout0
1462 aesenc $rndkey0,$inout1
1463 aesenc $rndkey0,$inout2
1464 xor $key0,%r9d
Adam Langleye9ada862015-05-11 17:20:37 -07001465 movdqu 0x00($inp),$in0 # start loading input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001466 aesenc $rndkey0,$inout3
1467 mov %r9d,0x70+12(%rsp)
1468 cmp \$11,$rounds
1469 aesenc $rndkey0,$inout4
1470 aesenc $rndkey0,$inout5
1471 aesenc $rndkey0,$inout6
1472 aesenc $rndkey0,$inout7
1473 $movkey 0xa0-0x80($key),$rndkey0
1474
1475 jb .Lctr32_enc_done
1476
1477 aesenc $rndkey1,$inout0
1478 aesenc $rndkey1,$inout1
1479 aesenc $rndkey1,$inout2
1480 aesenc $rndkey1,$inout3
1481 aesenc $rndkey1,$inout4
1482 aesenc $rndkey1,$inout5
1483 aesenc $rndkey1,$inout6
1484 aesenc $rndkey1,$inout7
1485 $movkey 0xb0-0x80($key),$rndkey1
1486
1487 aesenc $rndkey0,$inout0
1488 aesenc $rndkey0,$inout1
1489 aesenc $rndkey0,$inout2
1490 aesenc $rndkey0,$inout3
1491 aesenc $rndkey0,$inout4
1492 aesenc $rndkey0,$inout5
1493 aesenc $rndkey0,$inout6
1494 aesenc $rndkey0,$inout7
1495 $movkey 0xc0-0x80($key),$rndkey0
1496 je .Lctr32_enc_done
1497
1498 aesenc $rndkey1,$inout0
1499 aesenc $rndkey1,$inout1
1500 aesenc $rndkey1,$inout2
1501 aesenc $rndkey1,$inout3
1502 aesenc $rndkey1,$inout4
1503 aesenc $rndkey1,$inout5
1504 aesenc $rndkey1,$inout6
1505 aesenc $rndkey1,$inout7
1506 $movkey 0xd0-0x80($key),$rndkey1
1507
1508 aesenc $rndkey0,$inout0
1509 aesenc $rndkey0,$inout1
1510 aesenc $rndkey0,$inout2
1511 aesenc $rndkey0,$inout3
1512 aesenc $rndkey0,$inout4
1513 aesenc $rndkey0,$inout5
1514 aesenc $rndkey0,$inout6
1515 aesenc $rndkey0,$inout7
1516 $movkey 0xe0-0x80($key),$rndkey0
1517 jmp .Lctr32_enc_done
1518
1519.align 16
1520.Lctr32_enc_done:
1521 movdqu 0x10($inp),$in1
Adam Langleye9ada862015-05-11 17:20:37 -07001522 pxor $rndkey0,$in0 # input^=round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001523 movdqu 0x20($inp),$in2
1524 pxor $rndkey0,$in1
1525 movdqu 0x30($inp),$in3
1526 pxor $rndkey0,$in2
1527 movdqu 0x40($inp),$in4
1528 pxor $rndkey0,$in3
1529 movdqu 0x50($inp),$in5
1530 pxor $rndkey0,$in4
1531 pxor $rndkey0,$in5
1532 aesenc $rndkey1,$inout0
1533 aesenc $rndkey1,$inout1
1534 aesenc $rndkey1,$inout2
1535 aesenc $rndkey1,$inout3
1536 aesenc $rndkey1,$inout4
1537 aesenc $rndkey1,$inout5
1538 aesenc $rndkey1,$inout6
1539 aesenc $rndkey1,$inout7
Adam Langleye9ada862015-05-11 17:20:37 -07001540 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1541 lea 0x80($inp),$inp # $inp+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001542
Adam Langleye9ada862015-05-11 17:20:37 -07001543 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1544 pxor $rndkey0,$rndkey1 # borrowed $rndkey
Adam Langleyd9e397b2015-01-22 14:27:53 -08001545 movdqu 0x70-0x80($inp),$in0
1546 aesenclast $in1,$inout1
1547 pxor $rndkey0,$in0
1548 movdqa 0x00(%rsp),$in1 # load next counter block
1549 aesenclast $in2,$inout2
1550 aesenclast $in3,$inout3
1551 movdqa 0x10(%rsp),$in2
1552 movdqa 0x20(%rsp),$in3
1553 aesenclast $in4,$inout4
1554 aesenclast $in5,$inout5
1555 movdqa 0x30(%rsp),$in4
1556 movdqa 0x40(%rsp),$in5
1557 aesenclast $rndkey1,$inout6
1558 movdqa 0x50(%rsp),$rndkey0
Adam Langleye9ada862015-05-11 17:20:37 -07001559 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
Adam Langleyd9e397b2015-01-22 14:27:53 -08001560 aesenclast $in0,$inout7
1561
Adam Langleye9ada862015-05-11 17:20:37 -07001562 movups $inout0,($out) # store 8 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001563 movdqa $in1,$inout0
1564 movups $inout1,0x10($out)
1565 movdqa $in2,$inout1
1566 movups $inout2,0x20($out)
1567 movdqa $in3,$inout2
1568 movups $inout3,0x30($out)
1569 movdqa $in4,$inout3
1570 movups $inout4,0x40($out)
1571 movdqa $in5,$inout4
1572 movups $inout5,0x50($out)
1573 movdqa $rndkey0,$inout5
1574 movups $inout6,0x60($out)
1575 movups $inout7,0x70($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001576 lea 0x80($out),$out # $out+=8*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08001577
Adam Langleye9ada862015-05-11 17:20:37 -07001578 sub \$8,$len
1579 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1580
Robert Sloanab8b8882018-03-26 11:39:51 -07001581 add \$8,$len # restore real remaining $len
Adam Langleye9ada862015-05-11 17:20:37 -07001582 jz .Lctr32_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001583 lea -0x80($key),$key
1584
1585.Lctr32_tail:
Adam Langleye9ada862015-05-11 17:20:37 -07001586 # note that at this point $inout0..5 are populated with
Robert Sloana94fe052017-02-21 08:49:28 -08001587 # counter values xor-ed with 0-round key
Adam Langleyd9e397b2015-01-22 14:27:53 -08001588 lea 16($key),$key
1589 cmp \$4,$len
1590 jb .Lctr32_loop3
1591 je .Lctr32_loop4
1592
Adam Langleye9ada862015-05-11 17:20:37 -07001593 # if ($len>4) compute 7 E(counter)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001594 shl \$4,$rounds
1595 movdqa 0x60(%rsp),$inout6
1596 pxor $inout7,$inout7
1597
1598 $movkey 16($key),$rndkey0
1599 aesenc $rndkey1,$inout0
1600 aesenc $rndkey1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001601 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
Adam Langleyd9e397b2015-01-22 14:27:53 -08001602 neg %rax
1603 aesenc $rndkey1,$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07001604 add \$16,%rax # prepare for .Lenc_loop8_enter
Adam Langleyd9e397b2015-01-22 14:27:53 -08001605 movups ($inp),$in0
1606 aesenc $rndkey1,$inout3
1607 aesenc $rndkey1,$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07001608 movups 0x10($inp),$in1 # pre-load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001609 movups 0x20($inp),$in2
1610 aesenc $rndkey1,$inout5
1611 aesenc $rndkey1,$inout6
1612
1613 call .Lenc_loop8_enter
1614
1615 movdqu 0x30($inp),$in3
1616 pxor $in0,$inout0
1617 movdqu 0x40($inp),$in0
1618 pxor $in1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001619 movdqu $inout0,($out) # store output
Adam Langleyd9e397b2015-01-22 14:27:53 -08001620 pxor $in2,$inout2
1621 movdqu $inout1,0x10($out)
1622 pxor $in3,$inout3
1623 movdqu $inout2,0x20($out)
1624 pxor $in0,$inout4
1625 movdqu $inout3,0x30($out)
1626 movdqu $inout4,0x40($out)
1627 cmp \$6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07001628 jb .Lctr32_done # $len was 5, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001629
1630 movups 0x50($inp),$in1
1631 xorps $in1,$inout5
1632 movups $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001633 je .Lctr32_done # $len was 6, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001634
1635 movups 0x60($inp),$in2
1636 xorps $in2,$inout6
1637 movups $inout6,0x60($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001638 jmp .Lctr32_done # $len was 7, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001639
1640.align 32
1641.Lctr32_loop4:
1642 aesenc $rndkey1,$inout0
1643 lea 16($key),$key
1644 dec $rounds
1645 aesenc $rndkey1,$inout1
1646 aesenc $rndkey1,$inout2
1647 aesenc $rndkey1,$inout3
1648 $movkey ($key),$rndkey1
1649 jnz .Lctr32_loop4
1650 aesenclast $rndkey1,$inout0
1651 aesenclast $rndkey1,$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001652 movups ($inp),$in0 # load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001653 movups 0x10($inp),$in1
1654 aesenclast $rndkey1,$inout2
1655 aesenclast $rndkey1,$inout3
1656 movups 0x20($inp),$in2
1657 movups 0x30($inp),$in3
1658
1659 xorps $in0,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07001660 movups $inout0,($out) # store output
Adam Langleyd9e397b2015-01-22 14:27:53 -08001661 xorps $in1,$inout1
1662 movups $inout1,0x10($out)
1663 pxor $in2,$inout2
1664 movdqu $inout2,0x20($out)
1665 pxor $in3,$inout3
1666 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001667 jmp .Lctr32_done # $len was 4, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001668
1669.align 32
1670.Lctr32_loop3:
1671 aesenc $rndkey1,$inout0
1672 lea 16($key),$key
1673 dec $rounds
1674 aesenc $rndkey1,$inout1
1675 aesenc $rndkey1,$inout2
1676 $movkey ($key),$rndkey1
1677 jnz .Lctr32_loop3
1678 aesenclast $rndkey1,$inout0
1679 aesenclast $rndkey1,$inout1
1680 aesenclast $rndkey1,$inout2
1681
Adam Langleye9ada862015-05-11 17:20:37 -07001682 movups ($inp),$in0 # load input
Adam Langleyd9e397b2015-01-22 14:27:53 -08001683 xorps $in0,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07001684 movups $inout0,($out) # store output
Adam Langleyd9e397b2015-01-22 14:27:53 -08001685 cmp \$2,$len
Adam Langleye9ada862015-05-11 17:20:37 -07001686 jb .Lctr32_done # $len was 1, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001687
1688 movups 0x10($inp),$in1
1689 xorps $in1,$inout1
1690 movups $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07001691 je .Lctr32_done # $len was 2, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001692
1693 movups 0x20($inp),$in2
1694 xorps $in2,$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07001695 movups $inout2,0x20($out) # $len was 3, stop store
Adam Langleyd9e397b2015-01-22 14:27:53 -08001696
Adam Langleyd9e397b2015-01-22 14:27:53 -08001697.Lctr32_done:
Robert Sloanab8b8882018-03-26 11:39:51 -07001698 xorps %xmm0,%xmm0 # clear register bank
Adam Langleye9ada862015-05-11 17:20:37 -07001699 xor $key0,$key0
1700 pxor %xmm1,%xmm1
1701 pxor %xmm2,%xmm2
1702 pxor %xmm3,%xmm3
1703 pxor %xmm4,%xmm4
1704 pxor %xmm5,%xmm5
1705___
1706$code.=<<___ if (!$win64);
1707 pxor %xmm6,%xmm6
1708 pxor %xmm7,%xmm7
1709 movaps %xmm0,0x00(%rsp) # clear stack
1710 pxor %xmm8,%xmm8
1711 movaps %xmm0,0x10(%rsp)
1712 pxor %xmm9,%xmm9
1713 movaps %xmm0,0x20(%rsp)
1714 pxor %xmm10,%xmm10
1715 movaps %xmm0,0x30(%rsp)
1716 pxor %xmm11,%xmm11
1717 movaps %xmm0,0x40(%rsp)
1718 pxor %xmm12,%xmm12
1719 movaps %xmm0,0x50(%rsp)
1720 pxor %xmm13,%xmm13
1721 movaps %xmm0,0x60(%rsp)
1722 pxor %xmm14,%xmm14
1723 movaps %xmm0,0x70(%rsp)
1724 pxor %xmm15,%xmm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001725___
1726$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001727 movaps -0xa8($key_),%xmm6
1728 movaps %xmm0,-0xa8($key_) # clear stack
1729 movaps -0x98($key_),%xmm7
1730 movaps %xmm0,-0x98($key_)
1731 movaps -0x88($key_),%xmm8
1732 movaps %xmm0,-0x88($key_)
1733 movaps -0x78($key_),%xmm9
1734 movaps %xmm0,-0x78($key_)
1735 movaps -0x68($key_),%xmm10
1736 movaps %xmm0,-0x68($key_)
1737 movaps -0x58($key_),%xmm11
1738 movaps %xmm0,-0x58($key_)
1739 movaps -0x48($key_),%xmm12
1740 movaps %xmm0,-0x48($key_)
1741 movaps -0x38($key_),%xmm13
1742 movaps %xmm0,-0x38($key_)
1743 movaps -0x28($key_),%xmm14
1744 movaps %xmm0,-0x28($key_)
1745 movaps -0x18($key_),%xmm15
1746 movaps %xmm0,-0x18($key_)
Adam Langleye9ada862015-05-11 17:20:37 -07001747 movaps %xmm0,0x00(%rsp)
1748 movaps %xmm0,0x10(%rsp)
1749 movaps %xmm0,0x20(%rsp)
1750 movaps %xmm0,0x30(%rsp)
1751 movaps %xmm0,0x40(%rsp)
1752 movaps %xmm0,0x50(%rsp)
1753 movaps %xmm0,0x60(%rsp)
1754 movaps %xmm0,0x70(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001755___
1756$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08001757 mov -8($key_),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07001758.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08001759 lea ($key_),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07001760.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001761.Lctr32_epilogue:
1762 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07001763.cfi_endproc
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001764.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08001765___
1766}
1767
1768######################################################################
1769# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1770# const AES_KEY *key1, const AES_KEY *key2,
1771# const unsigned char iv[16]);
1772#
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001773if (0) { # Omit these functions in BoringSSL
# Register and frame assignments shared by the XTS code generators below.
#
# @tweak      - the six tweak registers, %xmm10 through %xmm15
# $twmask     - %xmm8
# $twres      - %xmm9
# $twtmp      - scratch register; it is the same register as $tweak[4]
# $key2       - %r8
# $ivp, $len_ - both live in %r9
# $frame_size - 0x70 bytes of scratch, plus 160 more bytes on Win64
# $key_       - %rbp; overridden here so that %r11 can be used as FP
my @tweak  = map { "%xmm$_" } 10 .. 15;
my $twmask = "%xmm8";
my $twres  = "%xmm9";
my $twtmp  = $tweak[4];
my $key2   = "%r8";
my $ivp    = "%r9";
my $len_   = "%r9";
my $frame_size = $win64 ? 0x70 + 160 : 0x70;
my $key_   = "%rbp";
Adam Langleyd9e397b2015-01-22 14:27:53 -08001779
# Entry point of ${PREFIX}_xts_encrypt: symbol directives and label, then a
# prologue that copies %rsp into %r11 (used as the frame pointer), pushes
# %rbp, reserves $frame_size bytes and rounds %rsp down to a multiple of 16.
1780$code.=<<___;
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001781.globl ${PREFIX}_xts_encrypt
1782.type ${PREFIX}_xts_encrypt,\@function,6
Adam Langleyd9e397b2015-01-22 14:27:53 -08001783.align 16
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01001784${PREFIX}_xts_encrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07001785.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08001786 lea (%rsp),%r11 # frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07001787.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001788 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07001789.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001790 sub \$$frame_size,%rsp
1791 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1792___
# Appended only when $win64 is set: store %xmm6-%xmm15 in the ten 16-byte
# slots below %r11 (-0xa8 .. -0x18) and emit the .Lxts_enc_body label.
1793$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08001794 movaps %xmm6,-0xa8(%r11) # offload everything
1795 movaps %xmm7,-0x98(%r11)
1796 movaps %xmm8,-0x88(%r11)
1797 movaps %xmm9,-0x78(%r11)
1798 movaps %xmm10,-0x68(%r11)
1799 movaps %xmm11,-0x58(%r11)
1800 movaps %xmm12,-0x48(%r11)
1801 movaps %xmm13,-0x38(%r11)
1802 movaps %xmm14,-0x28(%r11)
1803 movaps %xmm15,-0x18(%r11)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001804.Lxts_enc_body:
1805___
# Load the 16-byte clear-text tweak from ($ivp) into $inout0 and read the
# round counts stored at byte offset 240 of both key schedules (%r8 is the
# register assigned to $key2).
1806$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08001807 movups ($ivp),$inout0 # load clear-text tweak
1808 mov 240(%r8),$rounds # key2->rounds
1809 mov 240($key),$rnds_ # key1->rounds
1810___
1811 # generate the tweak
# NOTE(review): aesni_generate1 is defined outside this section; presumably
# it appends a one-block encryption of $inout0 under $key2 - confirm.
1812 &aesni_generate1("enc",$key2,$rounds,$inout0);
# Set up state for the bulk loop:
#   - $rndkey0 holds the round[0] key; $key is backed up in $key_
#   - the round count is copied to $rounds, then $rnds_ is scaled by 16
#   - the caller's length is kept in $len_, and $len is rounded down to a
#     multiple of 16
#   - $rndkey1 ends up as round[0]^round[last]
#   - @tweak[5] and $twres are seeded from the freshly generated tweak
1813$code.=<<___;
1814 $movkey ($key),$rndkey0 # zero round key
1815 mov $key,$key_ # backup $key
1816 mov $rnds_,$rounds # backup $rounds
1817 shl \$4,$rnds_
1818 mov $len,$len_ # backup $len
1819 and \$-16,$len
1820
1821 $movkey 16($key,$rnds_),$rndkey1 # last round key
1822
1823 movdqa .Lxts_magic(%rip),$twmask
1824 movdqa $inout0,@tweak[5]
1825 pshufd \$0x5f,$inout0,$twres
1826 pxor $rndkey0,$rndkey1
1827___
# Unrolled four times at generation time (i = 0..3). Each copy emits code
# that stores the running tweak from @tweak[5] into @tweak[$i], xor-ed with
# round[0], and then advances @tweak[5]: paddq doubles both 64-bit halves,
# and the sign bits broadcast by psrad and masked with $twmask are xor-ed
# back in.
# NOTE(review): $twmask is loaded from .Lxts_magic, which is defined outside
# this section; presumably it carries the XTS carry/reduction constants -
# confirm.
# NOTE(review): $i is not declared with `my` in this section.
1828 # alternative tweak calculation algorithm is based on suggestions
1829 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1830 # and should help in the future...
1831 for ($i=0;$i<4;$i++) {
1832 $code.=<<___;
1833 movdqa $twres,$twtmp
1834 paddd $twres,$twres
1835 movdqa @tweak[5],@tweak[$i]
1836 psrad \$31,$twtmp # broadcast upper bits
1837 paddq @tweak[5],@tweak[5]
1838 pand $twmask,$twtmp
1839 pxor $rndkey0,@tweak[$i]
1840 pxor $twtmp,@tweak[5]
1841___
1842 }
# Main body of the encrypt routine, emitted as a single heredoc:
#   1. Finish the tweak setup: fill @tweak[4], advance @tweak[5] once more,
#      and save round[0]^round[last] at 0x60(%rsp).
#   2. If at least 6*16 bytes remain, run .Lxts_enc_grandloop. Each pass
#      handles six blocks; the first rounds and the last few rounds are
#      interleaved with computing the next six tweaks, and .Lxts_enc_loop6
#      runs the rounds in between. The tweak^round[last] values parked at
#      0x00-0x50(%rsp) are the aesenclast operands.
#   3. After the loop, restore $rounds and $key, then .Lxts_enc_short
#      dispatches on the remaining whole blocks: 0 -> .Lxts_enc_done,
#      1..4 -> .Lxts_enc_one/two/three/four, otherwise the inline five-block
#      path through _aesni_encrypt6 with $inout5 zeroed.
#   4. The heredoc ends inside .Lxts_enc_one, just after the input block has
#      been xor-ed with @tweak[0]; the single-block encryption follows it.
# NOTE(review): back-ticked expressions such as `16*0` are emitted literally
# by this heredoc; presumably a later pass over $code evaluates them -
# confirm.
1843$code.=<<___;
1844 movdqa @tweak[5],@tweak[4]
1845 psrad \$31,$twres
1846 paddq @tweak[5],@tweak[5]
1847 pand $twmask,$twres
1848 pxor $rndkey0,@tweak[4]
1849 pxor $twres,@tweak[5]
1850 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1851
1852 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07001853 jc .Lxts_enc_short # if $len-=6*16 borrowed
Adam Langleyd9e397b2015-01-22 14:27:53 -08001854
1855 mov \$16+96,$rounds
1856 lea 32($key_,$rnds_),$key # end of key schedule
1857 sub %r10,%rax # twisted $rounds
1858 $movkey 16($key_),$rndkey1
1859 mov %rax,%r10 # backup twisted $rounds
1860 lea .Lxts_magic(%rip),%r8
1861 jmp .Lxts_enc_grandloop
1862
1863.align 32
1864.Lxts_enc_grandloop:
1865 movdqu `16*0`($inp),$inout0 # load input
1866 movdqa $rndkey0,$twmask
1867 movdqu `16*1`($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07001868 pxor @tweak[0],$inout0 # input^=tweak^round[0]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001869 movdqu `16*2`($inp),$inout2
1870 pxor @tweak[1],$inout1
1871 aesenc $rndkey1,$inout0
1872 movdqu `16*3`($inp),$inout3
1873 pxor @tweak[2],$inout2
1874 aesenc $rndkey1,$inout1
1875 movdqu `16*4`($inp),$inout4
1876 pxor @tweak[3],$inout3
1877 aesenc $rndkey1,$inout2
1878 movdqu `16*5`($inp),$inout5
1879 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1880 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1881 pxor @tweak[4],$inout4
1882 aesenc $rndkey1,$inout3
1883 $movkey 32($key_),$rndkey0
1884 lea `16*6`($inp),$inp
1885 pxor $twmask,$inout5
1886
Robert Sloanab8b8882018-03-26 11:39:51 -07001887 pxor $twres,@tweak[0] # calculate tweaks^round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001888 aesenc $rndkey1,$inout4
1889 pxor $twres,@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07001890 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001891 aesenc $rndkey1,$inout5
1892 $movkey 48($key_),$rndkey1
1893 pxor $twres,@tweak[2]
1894
1895 aesenc $rndkey0,$inout0
1896 pxor $twres,@tweak[3]
1897 movdqa @tweak[1],`16*1`(%rsp)
1898 aesenc $rndkey0,$inout1
1899 pxor $twres,@tweak[4]
1900 movdqa @tweak[2],`16*2`(%rsp)
1901 aesenc $rndkey0,$inout2
1902 aesenc $rndkey0,$inout3
1903 pxor $twres,$twmask
1904 movdqa @tweak[4],`16*4`(%rsp)
1905 aesenc $rndkey0,$inout4
1906 aesenc $rndkey0,$inout5
1907 $movkey 64($key_),$rndkey0
1908 movdqa $twmask,`16*5`(%rsp)
1909 pshufd \$0x5f,@tweak[5],$twres
1910 jmp .Lxts_enc_loop6
1911.align 32
1912.Lxts_enc_loop6:
1913 aesenc $rndkey1,$inout0
1914 aesenc $rndkey1,$inout1
1915 aesenc $rndkey1,$inout2
1916 aesenc $rndkey1,$inout3
1917 aesenc $rndkey1,$inout4
1918 aesenc $rndkey1,$inout5
1919 $movkey -64($key,%rax),$rndkey1
1920 add \$32,%rax
1921
1922 aesenc $rndkey0,$inout0
1923 aesenc $rndkey0,$inout1
1924 aesenc $rndkey0,$inout2
1925 aesenc $rndkey0,$inout3
1926 aesenc $rndkey0,$inout4
1927 aesenc $rndkey0,$inout5
1928 $movkey -80($key,%rax),$rndkey0
1929 jnz .Lxts_enc_loop6
1930
Adam Langleye9ada862015-05-11 17:20:37 -07001931 movdqa (%r8),$twmask # start calculating next tweak
Adam Langleyd9e397b2015-01-22 14:27:53 -08001932 movdqa $twres,$twtmp
1933 paddd $twres,$twres
1934 aesenc $rndkey1,$inout0
1935 paddq @tweak[5],@tweak[5]
1936 psrad \$31,$twtmp
1937 aesenc $rndkey1,$inout1
1938 pand $twmask,$twtmp
1939 $movkey ($key_),@tweak[0] # load round[0]
1940 aesenc $rndkey1,$inout2
1941 aesenc $rndkey1,$inout3
1942 aesenc $rndkey1,$inout4
1943 pxor $twtmp,@tweak[5]
1944 movaps @tweak[0],@tweak[1] # copy round[0]
1945 aesenc $rndkey1,$inout5
1946 $movkey -64($key),$rndkey1
1947
1948 movdqa $twres,$twtmp
1949 aesenc $rndkey0,$inout0
1950 paddd $twres,$twres
1951 pxor @tweak[5],@tweak[0]
1952 aesenc $rndkey0,$inout1
1953 psrad \$31,$twtmp
1954 paddq @tweak[5],@tweak[5]
1955 aesenc $rndkey0,$inout2
1956 aesenc $rndkey0,$inout3
1957 pand $twmask,$twtmp
1958 movaps @tweak[1],@tweak[2]
1959 aesenc $rndkey0,$inout4
1960 pxor $twtmp,@tweak[5]
1961 movdqa $twres,$twtmp
1962 aesenc $rndkey0,$inout5
1963 $movkey -48($key),$rndkey0
1964
1965 paddd $twres,$twres
1966 aesenc $rndkey1,$inout0
1967 pxor @tweak[5],@tweak[1]
1968 psrad \$31,$twtmp
1969 aesenc $rndkey1,$inout1
1970 paddq @tweak[5],@tweak[5]
1971 pand $twmask,$twtmp
1972 aesenc $rndkey1,$inout2
1973 aesenc $rndkey1,$inout3
1974 movdqa @tweak[3],`16*3`(%rsp)
1975 pxor $twtmp,@tweak[5]
1976 aesenc $rndkey1,$inout4
1977 movaps @tweak[2],@tweak[3]
1978 movdqa $twres,$twtmp
1979 aesenc $rndkey1,$inout5
1980 $movkey -32($key),$rndkey1
1981
1982 paddd $twres,$twres
1983 aesenc $rndkey0,$inout0
1984 pxor @tweak[5],@tweak[2]
1985 psrad \$31,$twtmp
1986 aesenc $rndkey0,$inout1
1987 paddq @tweak[5],@tweak[5]
1988 pand $twmask,$twtmp
1989 aesenc $rndkey0,$inout2
1990 aesenc $rndkey0,$inout3
1991 aesenc $rndkey0,$inout4
1992 pxor $twtmp,@tweak[5]
1993 movaps @tweak[3],@tweak[4]
1994 aesenc $rndkey0,$inout5
1995
1996 movdqa $twres,$rndkey0
1997 paddd $twres,$twres
1998 aesenc $rndkey1,$inout0
1999 pxor @tweak[5],@tweak[3]
2000 psrad \$31,$rndkey0
2001 aesenc $rndkey1,$inout1
2002 paddq @tweak[5],@tweak[5]
2003 pand $twmask,$rndkey0
2004 aesenc $rndkey1,$inout2
2005 aesenc $rndkey1,$inout3
2006 pxor $rndkey0,@tweak[5]
2007 $movkey ($key_),$rndkey0
2008 aesenc $rndkey1,$inout4
2009 aesenc $rndkey1,$inout5
2010 $movkey 16($key_),$rndkey1
2011
2012 pxor @tweak[5],@tweak[4]
2013 aesenclast `16*0`(%rsp),$inout0
2014 psrad \$31,$twres
2015 paddq @tweak[5],@tweak[5]
2016 aesenclast `16*1`(%rsp),$inout1
2017 aesenclast `16*2`(%rsp),$inout2
2018 pand $twmask,$twres
2019 mov %r10,%rax # restore $rounds
2020 aesenclast `16*3`(%rsp),$inout3
2021 aesenclast `16*4`(%rsp),$inout4
2022 aesenclast `16*5`(%rsp),$inout5
2023 pxor $twres,@tweak[5]
2024
Adam Langleye9ada862015-05-11 17:20:37 -07002025 lea `16*6`($out),$out # $out+=6*16
2026 movups $inout0,`-16*6`($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002027 movups $inout1,`-16*5`($out)
2028 movups $inout2,`-16*4`($out)
2029 movups $inout3,`-16*3`($out)
2030 movups $inout4,`-16*2`($out)
2031 movups $inout5,`-16*1`($out)
2032 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002033 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -08002034
2035 mov \$16+96,$rounds
2036 sub $rnds_,$rounds
2037 mov $key_,$key # restore $key
2038 shr \$4,$rounds # restore original value
2039
2040.Lxts_enc_short:
Adam Langleye9ada862015-05-11 17:20:37 -07002041 # at the point @tweak[0..5] are populated with tweak values
Adam Langleyd9e397b2015-01-22 14:27:53 -08002042 mov $rounds,$rnds_ # backup $rounds
2043 pxor $rndkey0,@tweak[0]
Adam Langleye9ada862015-05-11 17:20:37 -07002044 add \$16*6,$len # restore real remaining $len
2045 jz .Lxts_enc_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002046
2047 pxor $rndkey0,@tweak[1]
2048 cmp \$0x20,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002049 jb .Lxts_enc_one # $len is 1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002050 pxor $rndkey0,@tweak[2]
Adam Langleye9ada862015-05-11 17:20:37 -07002051 je .Lxts_enc_two # $len is 2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002052
2053 pxor $rndkey0,@tweak[3]
2054 cmp \$0x40,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002055 jb .Lxts_enc_three # $len is 3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002056 pxor $rndkey0,@tweak[4]
Adam Langleye9ada862015-05-11 17:20:37 -07002057 je .Lxts_enc_four # $len is 4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002058
Adam Langleye9ada862015-05-11 17:20:37 -07002059 movdqu ($inp),$inout0 # $len is 5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002060 movdqu 16*1($inp),$inout1
2061 movdqu 16*2($inp),$inout2
2062 pxor @tweak[0],$inout0
2063 movdqu 16*3($inp),$inout3
2064 pxor @tweak[1],$inout1
2065 movdqu 16*4($inp),$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07002066 lea 16*5($inp),$inp # $inp+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002067 pxor @tweak[2],$inout2
2068 pxor @tweak[3],$inout3
2069 pxor @tweak[4],$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07002070 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08002071
2072 call _aesni_encrypt6
2073
2074 xorps @tweak[0],$inout0
2075 movdqa @tweak[5],@tweak[0]
2076 xorps @tweak[1],$inout1
2077 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002078 movdqu $inout0,($out) # store 5 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002079 xorps @tweak[3],$inout3
2080 movdqu $inout1,16*1($out)
2081 xorps @tweak[4],$inout4
2082 movdqu $inout2,16*2($out)
2083 movdqu $inout3,16*3($out)
2084 movdqu $inout4,16*4($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002085 lea 16*5($out),$out # $out+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002086 jmp .Lxts_enc_done
2087
2088.align 16
2089.Lxts_enc_one:
2090 movups ($inp),$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07002091 lea 16*1($inp),$inp # inp+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002092 xorps @tweak[0],$inout0
2093___
# Single-block encryption for the .Lxts_enc_one path.
# NOTE(review): aesni_generate1 is defined outside this section, and no data
# register is passed in this call; presumably it operates on $inout0 by
# default - confirm.
2094 &aesni_generate1("enc",$key,$rounds);
# Remainder of the tail handling, emitted as a single heredoc:
#   - finish .Lxts_enc_one (xor with the tweak again, store one block)
#   - .Lxts_enc_two/three/four: xor the inputs with their tweaks, call
#     _aesni_encrypt2/3/4, xor again and store
#   - every tail path moves the next unused tweak into @tweak[0] before
#     jumping to .Lxts_enc_done
#   - .Lxts_enc_done: if the original length ($len_) is not a multiple of
#     16, .Lxts_enc_steal walks the trailing bytes one at a time, writing
#     each input byte into the last full output block and moving the byte it
#     replaces to the current output position
#   - finally rewind $out, restore $key and $rounds, and load that patched
#     block xor-ed with @tweak[0], ready for one more single-block encryption
2095$code.=<<___;
2096 xorps @tweak[0],$inout0
2097 movdqa @tweak[1],@tweak[0]
Adam Langleye9ada862015-05-11 17:20:37 -07002098 movups $inout0,($out) # store one output block
2099 lea 16*1($out),$out # $out+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002100 jmp .Lxts_enc_done
2101
2102.align 16
2103.Lxts_enc_two:
2104 movups ($inp),$inout0
2105 movups 16($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002106 lea 32($inp),$inp # $inp+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002107 xorps @tweak[0],$inout0
2108 xorps @tweak[1],$inout1
2109
2110 call _aesni_encrypt2
2111
2112 xorps @tweak[0],$inout0
2113 movdqa @tweak[2],@tweak[0]
2114 xorps @tweak[1],$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002115 movups $inout0,($out) # store 2 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002116 movups $inout1,16*1($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002117 lea 16*2($out),$out # $out+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002118 jmp .Lxts_enc_done
2119
2120.align 16
2121.Lxts_enc_three:
2122 movups ($inp),$inout0
2123 movups 16*1($inp),$inout1
2124 movups 16*2($inp),$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002125 lea 16*3($inp),$inp # $inp+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002126 xorps @tweak[0],$inout0
2127 xorps @tweak[1],$inout1
2128 xorps @tweak[2],$inout2
2129
2130 call _aesni_encrypt3
2131
2132 xorps @tweak[0],$inout0
2133 movdqa @tweak[3],@tweak[0]
2134 xorps @tweak[1],$inout1
2135 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002136 movups $inout0,($out) # store 3 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002137 movups $inout1,16*1($out)
2138 movups $inout2,16*2($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002139 lea 16*3($out),$out # $out+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002140 jmp .Lxts_enc_done
2141
2142.align 16
2143.Lxts_enc_four:
2144 movups ($inp),$inout0
2145 movups 16*1($inp),$inout1
2146 movups 16*2($inp),$inout2
2147 xorps @tweak[0],$inout0
2148 movups 16*3($inp),$inout3
Adam Langleye9ada862015-05-11 17:20:37 -07002149 lea 16*4($inp),$inp # $inp+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002150 xorps @tweak[1],$inout1
2151 xorps @tweak[2],$inout2
2152 xorps @tweak[3],$inout3
2153
2154 call _aesni_encrypt4
2155
2156 pxor @tweak[0],$inout0
2157 movdqa @tweak[4],@tweak[0]
2158 pxor @tweak[1],$inout1
2159 pxor @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002160 movdqu $inout0,($out) # store 4 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002161 pxor @tweak[3],$inout3
2162 movdqu $inout1,16*1($out)
2163 movdqu $inout2,16*2($out)
2164 movdqu $inout3,16*3($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002165 lea 16*4($out),$out # $out+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002166 jmp .Lxts_enc_done
2167
2168.align 16
2169.Lxts_enc_done:
Adam Langleye9ada862015-05-11 17:20:37 -07002170 and \$15,$len_ # see if $len%16 is 0
Adam Langleyd9e397b2015-01-22 14:27:53 -08002171 jz .Lxts_enc_ret
2172 mov $len_,$len
2173
2174.Lxts_enc_steal:
2175 movzb ($inp),%eax # borrow $rounds ...
2176 movzb -16($out),%ecx # ... and $key
2177 lea 1($inp),$inp
2178 mov %al,-16($out)
2179 mov %cl,0($out)
2180 lea 1($out),$out
2181 sub \$1,$len
2182 jnz .Lxts_enc_steal
2183
2184 sub $len_,$out # rewind $out
2185 mov $key_,$key # restore $key
2186 mov $rnds_,$rounds # restore $rounds
2187
2188 movups -16($out),$inout0
2189 xorps @tweak[0],$inout0
2190___
# Single-block encryption of the block patched by the byte-stealing loop.
# NOTE(review): aesni_generate1 is defined outside this section - confirm
# that it operates on $inout0 when no data register is passed.
2191 &aesni_generate1("enc",$key,$rounds);
# Xor the result with @tweak[0] and store it over the last full output
# block. .Lxts_enc_ret then starts scrubbing state by zeroing %xmm0-%xmm5.
2192$code.=<<___;
2193 xorps @tweak[0],$inout0
2194 movups $inout0,-16($out)
2195
2196.Lxts_enc_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07002197 xorps %xmm0,%xmm0 # clear register bank
2198 pxor %xmm1,%xmm1
2199 pxor %xmm2,%xmm2
2200 pxor %xmm3,%xmm3
2201 pxor %xmm4,%xmm4
2202 pxor %xmm5,%xmm5
2203___
# Non-Win64 only: zero %xmm6-%xmm15 and overwrite the scratch area
# 0x00-0x60(%rsp) with zeros taken from %xmm0.
2204$code.=<<___ if (!$win64);
2205 pxor %xmm6,%xmm6
2206 pxor %xmm7,%xmm7
2207 movaps %xmm0,0x00(%rsp) # clear stack
2208 pxor %xmm8,%xmm8
2209 movaps %xmm0,0x10(%rsp)
2210 pxor %xmm9,%xmm9
2211 movaps %xmm0,0x20(%rsp)
2212 pxor %xmm10,%xmm10
2213 movaps %xmm0,0x30(%rsp)
2214 pxor %xmm11,%xmm11
2215 movaps %xmm0,0x40(%rsp)
2216 pxor %xmm12,%xmm12
2217 movaps %xmm0,0x50(%rsp)
2218 pxor %xmm13,%xmm13
2219 movaps %xmm0,0x60(%rsp)
2220 pxor %xmm14,%xmm14
2221 pxor %xmm15,%xmm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002222___
# Win64 only: reload %xmm6-%xmm15 from their save slots below %r11, zeroing
# each slot right after it is read, then zero the scratch area
# 0x00-0x60(%rsp).
2223$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08002224 movaps -0xa8(%r11),%xmm6
2225 movaps %xmm0,-0xa8(%r11) # clear stack
2226 movaps -0x98(%r11),%xmm7
2227 movaps %xmm0,-0x98(%r11)
2228 movaps -0x88(%r11),%xmm8
2229 movaps %xmm0,-0x88(%r11)
2230 movaps -0x78(%r11),%xmm9
2231 movaps %xmm0,-0x78(%r11)
2232 movaps -0x68(%r11),%xmm10
2233 movaps %xmm0,-0x68(%r11)
2234 movaps -0x58(%r11),%xmm11
2235 movaps %xmm0,-0x58(%r11)
2236 movaps -0x48(%r11),%xmm12
2237 movaps %xmm0,-0x48(%r11)
2238 movaps -0x38(%r11),%xmm13
2239 movaps %xmm0,-0x38(%r11)
2240 movaps -0x28(%r11),%xmm14
2241 movaps %xmm0,-0x28(%r11)
2242 movaps -0x18(%r11),%xmm15
2243 movaps %xmm0,-0x18(%r11)
Adam Langleye9ada862015-05-11 17:20:37 -07002244 movaps %xmm0,0x00(%rsp)
2245 movaps %xmm0,0x10(%rsp)
2246 movaps %xmm0,0x20(%rsp)
2247 movaps %xmm0,0x30(%rsp)
2248 movaps %xmm0,0x40(%rsp)
2249 movaps %xmm0,0x50(%rsp)
2250 movaps %xmm0,0x60(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002251___
# Common epilogue: reload %rbp from -8(%r11), restore %rsp from %r11,
# return, and close the CFI region and the symbol size.
2252$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08002253 mov -8(%r11),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002254.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002255 lea (%r11),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07002256.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002257.Lxts_enc_epilogue:
2258 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07002259.cfi_endproc
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01002260.size ${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt
Adam Langleyd9e397b2015-01-22 14:27:53 -08002261___
2262
2263$code.=<<___;
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01002264.globl ${PREFIX}_xts_decrypt
2265.type ${PREFIX}_xts_decrypt,\@function,6
Adam Langleyd9e397b2015-01-22 14:27:53 -08002266.align 16
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01002267${PREFIX}_xts_decrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07002268.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08002269 lea (%rsp),%r11 # frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07002270.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08002271 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002272.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002273 sub \$$frame_size,%rsp
2274 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2275___
2276$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08002277 movaps %xmm6,-0xa8(%r11) # offload everything
2278 movaps %xmm7,-0x98(%r11)
2279 movaps %xmm8,-0x88(%r11)
2280 movaps %xmm9,-0x78(%r11)
2281 movaps %xmm10,-0x68(%r11)
2282 movaps %xmm11,-0x58(%r11)
2283 movaps %xmm12,-0x48(%r11)
2284 movaps %xmm13,-0x38(%r11)
2285 movaps %xmm14,-0x28(%r11)
2286 movaps %xmm15,-0x18(%r11)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002287.Lxts_dec_body:
2288___
2289$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08002290 movups ($ivp),$inout0 # load clear-text tweak
2291 mov 240($key2),$rounds # key2->rounds
2292 mov 240($key),$rnds_ # key1->rounds
2293___
2294 # generate the tweak
2295 &aesni_generate1("enc",$key2,$rounds,$inout0);
2296$code.=<<___;
2297 xor %eax,%eax # if ($len%16) len-=16;
2298 test \$15,$len
2299 setnz %al
2300 shl \$4,%rax
2301 sub %rax,$len
2302
2303 $movkey ($key),$rndkey0 # zero round key
2304 mov $key,$key_ # backup $key
2305 mov $rnds_,$rounds # backup $rounds
2306 shl \$4,$rnds_
2307 mov $len,$len_ # backup $len
2308 and \$-16,$len
2309
2310 $movkey 16($key,$rnds_),$rndkey1 # last round key
2311
2312 movdqa .Lxts_magic(%rip),$twmask
2313 movdqa $inout0,@tweak[5]
2314 pshufd \$0x5f,$inout0,$twres
2315 pxor $rndkey0,$rndkey1
2316___
2317 for ($i=0;$i<4;$i++) {
2318 $code.=<<___;
2319 movdqa $twres,$twtmp
2320 paddd $twres,$twres
2321 movdqa @tweak[5],@tweak[$i]
2322 psrad \$31,$twtmp # broadcast upper bits
2323 paddq @tweak[5],@tweak[5]
2324 pand $twmask,$twtmp
2325 pxor $rndkey0,@tweak[$i]
2326 pxor $twtmp,@tweak[5]
2327___
2328 }
2329$code.=<<___;
2330 movdqa @tweak[5],@tweak[4]
2331 psrad \$31,$twres
2332 paddq @tweak[5],@tweak[5]
2333 pand $twmask,$twres
2334 pxor $rndkey0,@tweak[4]
2335 pxor $twres,@tweak[5]
2336 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2337
2338 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002339 jc .Lxts_dec_short # if $len-=6*16 borrowed
Adam Langleyd9e397b2015-01-22 14:27:53 -08002340
2341 mov \$16+96,$rounds
2342 lea 32($key_,$rnds_),$key # end of key schedule
2343 sub %r10,%rax # twisted $rounds
2344 $movkey 16($key_),$rndkey1
2345 mov %rax,%r10 # backup twisted $rounds
2346 lea .Lxts_magic(%rip),%r8
2347 jmp .Lxts_dec_grandloop
2348
2349.align 32
2350.Lxts_dec_grandloop:
2351 movdqu `16*0`($inp),$inout0 # load input
2352 movdqa $rndkey0,$twmask
2353 movdqu `16*1`($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002354 pxor @tweak[0],$inout0 # intput^=tweak^round[0]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002355 movdqu `16*2`($inp),$inout2
2356 pxor @tweak[1],$inout1
2357 aesdec $rndkey1,$inout0
2358 movdqu `16*3`($inp),$inout3
2359 pxor @tweak[2],$inout2
2360 aesdec $rndkey1,$inout1
2361 movdqu `16*4`($inp),$inout4
2362 pxor @tweak[3],$inout3
2363 aesdec $rndkey1,$inout2
2364 movdqu `16*5`($inp),$inout5
2365 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2366 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2367 pxor @tweak[4],$inout4
2368 aesdec $rndkey1,$inout3
2369 $movkey 32($key_),$rndkey0
2370 lea `16*6`($inp),$inp
2371 pxor $twmask,$inout5
2372
Robert Sloanab8b8882018-03-26 11:39:51 -07002373 pxor $twres,@tweak[0] # calculate tweaks^round[last]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002374 aesdec $rndkey1,$inout4
2375 pxor $twres,@tweak[1]
2376 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2377 aesdec $rndkey1,$inout5
2378 $movkey 48($key_),$rndkey1
2379 pxor $twres,@tweak[2]
2380
2381 aesdec $rndkey0,$inout0
2382 pxor $twres,@tweak[3]
2383 movdqa @tweak[1],`16*1`(%rsp)
2384 aesdec $rndkey0,$inout1
2385 pxor $twres,@tweak[4]
2386 movdqa @tweak[2],`16*2`(%rsp)
2387 aesdec $rndkey0,$inout2
2388 aesdec $rndkey0,$inout3
2389 pxor $twres,$twmask
2390 movdqa @tweak[4],`16*4`(%rsp)
2391 aesdec $rndkey0,$inout4
2392 aesdec $rndkey0,$inout5
2393 $movkey 64($key_),$rndkey0
2394 movdqa $twmask,`16*5`(%rsp)
2395 pshufd \$0x5f,@tweak[5],$twres
2396 jmp .Lxts_dec_loop6
2397.align 32
2398.Lxts_dec_loop6:
2399 aesdec $rndkey1,$inout0
2400 aesdec $rndkey1,$inout1
2401 aesdec $rndkey1,$inout2
2402 aesdec $rndkey1,$inout3
2403 aesdec $rndkey1,$inout4
2404 aesdec $rndkey1,$inout5
2405 $movkey -64($key,%rax),$rndkey1
2406 add \$32,%rax
2407
2408 aesdec $rndkey0,$inout0
2409 aesdec $rndkey0,$inout1
2410 aesdec $rndkey0,$inout2
2411 aesdec $rndkey0,$inout3
2412 aesdec $rndkey0,$inout4
2413 aesdec $rndkey0,$inout5
2414 $movkey -80($key,%rax),$rndkey0
2415 jnz .Lxts_dec_loop6
2416
Adam Langleye9ada862015-05-11 17:20:37 -07002417 movdqa (%r8),$twmask # start calculating next tweak
Adam Langleyd9e397b2015-01-22 14:27:53 -08002418 movdqa $twres,$twtmp
2419 paddd $twres,$twres
2420 aesdec $rndkey1,$inout0
2421 paddq @tweak[5],@tweak[5]
2422 psrad \$31,$twtmp
2423 aesdec $rndkey1,$inout1
2424 pand $twmask,$twtmp
2425 $movkey ($key_),@tweak[0] # load round[0]
2426 aesdec $rndkey1,$inout2
2427 aesdec $rndkey1,$inout3
2428 aesdec $rndkey1,$inout4
2429 pxor $twtmp,@tweak[5]
2430 movaps @tweak[0],@tweak[1] # copy round[0]
2431 aesdec $rndkey1,$inout5
2432 $movkey -64($key),$rndkey1
2433
2434 movdqa $twres,$twtmp
2435 aesdec $rndkey0,$inout0
2436 paddd $twres,$twres
2437 pxor @tweak[5],@tweak[0]
2438 aesdec $rndkey0,$inout1
2439 psrad \$31,$twtmp
2440 paddq @tweak[5],@tweak[5]
2441 aesdec $rndkey0,$inout2
2442 aesdec $rndkey0,$inout3
2443 pand $twmask,$twtmp
2444 movaps @tweak[1],@tweak[2]
2445 aesdec $rndkey0,$inout4
2446 pxor $twtmp,@tweak[5]
2447 movdqa $twres,$twtmp
2448 aesdec $rndkey0,$inout5
2449 $movkey -48($key),$rndkey0
2450
2451 paddd $twres,$twres
2452 aesdec $rndkey1,$inout0
2453 pxor @tweak[5],@tweak[1]
2454 psrad \$31,$twtmp
2455 aesdec $rndkey1,$inout1
2456 paddq @tweak[5],@tweak[5]
2457 pand $twmask,$twtmp
2458 aesdec $rndkey1,$inout2
2459 aesdec $rndkey1,$inout3
2460 movdqa @tweak[3],`16*3`(%rsp)
2461 pxor $twtmp,@tweak[5]
2462 aesdec $rndkey1,$inout4
2463 movaps @tweak[2],@tweak[3]
2464 movdqa $twres,$twtmp
2465 aesdec $rndkey1,$inout5
2466 $movkey -32($key),$rndkey1
2467
2468 paddd $twres,$twres
2469 aesdec $rndkey0,$inout0
2470 pxor @tweak[5],@tweak[2]
2471 psrad \$31,$twtmp
2472 aesdec $rndkey0,$inout1
2473 paddq @tweak[5],@tweak[5]
2474 pand $twmask,$twtmp
2475 aesdec $rndkey0,$inout2
2476 aesdec $rndkey0,$inout3
2477 aesdec $rndkey0,$inout4
2478 pxor $twtmp,@tweak[5]
2479 movaps @tweak[3],@tweak[4]
2480 aesdec $rndkey0,$inout5
2481
2482 movdqa $twres,$rndkey0
2483 paddd $twres,$twres
2484 aesdec $rndkey1,$inout0
2485 pxor @tweak[5],@tweak[3]
2486 psrad \$31,$rndkey0
2487 aesdec $rndkey1,$inout1
2488 paddq @tweak[5],@tweak[5]
2489 pand $twmask,$rndkey0
2490 aesdec $rndkey1,$inout2
2491 aesdec $rndkey1,$inout3
2492 pxor $rndkey0,@tweak[5]
2493 $movkey ($key_),$rndkey0
2494 aesdec $rndkey1,$inout4
2495 aesdec $rndkey1,$inout5
2496 $movkey 16($key_),$rndkey1
2497
2498 pxor @tweak[5],@tweak[4]
2499 aesdeclast `16*0`(%rsp),$inout0
2500 psrad \$31,$twres
2501 paddq @tweak[5],@tweak[5]
2502 aesdeclast `16*1`(%rsp),$inout1
2503 aesdeclast `16*2`(%rsp),$inout2
2504 pand $twmask,$twres
2505 mov %r10,%rax # restore $rounds
2506 aesdeclast `16*3`(%rsp),$inout3
2507 aesdeclast `16*4`(%rsp),$inout4
2508 aesdeclast `16*5`(%rsp),$inout5
2509 pxor $twres,@tweak[5]
2510
Adam Langleye9ada862015-05-11 17:20:37 -07002511 lea `16*6`($out),$out # $out+=6*16
2512 movups $inout0,`-16*6`($out) # store 6 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002513 movups $inout1,`-16*5`($out)
2514 movups $inout2,`-16*4`($out)
2515 movups $inout3,`-16*3`($out)
2516 movups $inout4,`-16*2`($out)
2517 movups $inout5,`-16*1`($out)
2518 sub \$16*6,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002519 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
Adam Langleyd9e397b2015-01-22 14:27:53 -08002520
2521 mov \$16+96,$rounds
2522 sub $rnds_,$rounds
2523 mov $key_,$key # restore $key
2524 shr \$4,$rounds # restore original value
2525
2526.Lxts_dec_short:
Adam Langleye9ada862015-05-11 17:20:37 -07002527 # at this point @tweak[0..5] are populated with tweak values
Adam Langleyd9e397b2015-01-22 14:27:53 -08002528 mov $rounds,$rnds_ # backup $rounds
2529 pxor $rndkey0,@tweak[0]
2530 pxor $rndkey0,@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07002531 add \$16*6,$len # restore real remaining $len
2532 jz .Lxts_dec_done # done if ($len==0)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002533
2534 pxor $rndkey0,@tweak[2]
2535 cmp \$0x20,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002536 jb .Lxts_dec_one # $len is 1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002537 pxor $rndkey0,@tweak[3]
Adam Langleye9ada862015-05-11 17:20:37 -07002538 je .Lxts_dec_two # $len is 2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002539
2540 pxor $rndkey0,@tweak[4]
2541 cmp \$0x40,$len
Adam Langleye9ada862015-05-11 17:20:37 -07002542 jb .Lxts_dec_three # $len is 3*16
2543 je .Lxts_dec_four # $len is 4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002544
Adam Langleye9ada862015-05-11 17:20:37 -07002545 movdqu ($inp),$inout0 # $len is 5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002546 movdqu 16*1($inp),$inout1
2547 movdqu 16*2($inp),$inout2
2548 pxor @tweak[0],$inout0
2549 movdqu 16*3($inp),$inout3
2550 pxor @tweak[1],$inout1
2551 movdqu 16*4($inp),$inout4
Adam Langleye9ada862015-05-11 17:20:37 -07002552 lea 16*5($inp),$inp # $inp+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002553 pxor @tweak[2],$inout2
2554 pxor @tweak[3],$inout3
2555 pxor @tweak[4],$inout4
2556
2557 call _aesni_decrypt6
2558
2559 xorps @tweak[0],$inout0
2560 xorps @tweak[1],$inout1
2561 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002562 movdqu $inout0,($out) # store 5 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002563 xorps @tweak[3],$inout3
2564 movdqu $inout1,16*1($out)
2565 xorps @tweak[4],$inout4
2566 movdqu $inout2,16*2($out)
2567 pxor $twtmp,$twtmp
2568 movdqu $inout3,16*3($out)
2569 pcmpgtd @tweak[5],$twtmp
2570 movdqu $inout4,16*4($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002571 lea 16*5($out),$out # $out+=5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002572 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2573 and \$15,$len_
2574 jz .Lxts_dec_ret
2575
2576 movdqa @tweak[5],@tweak[0]
2577 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2578 pand $twmask,@tweak[1] # isolate carry and residue
2579 pxor @tweak[5],@tweak[1]
2580 jmp .Lxts_dec_done2
2581
2582.align 16
2583.Lxts_dec_one:
2584 movups ($inp),$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07002585 lea 16*1($inp),$inp # $inp+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002586 xorps @tweak[0],$inout0
2587___
2588 &aesni_generate1("dec",$key,$rounds);
2589$code.=<<___;
2590 xorps @tweak[0],$inout0
2591 movdqa @tweak[1],@tweak[0]
Adam Langleye9ada862015-05-11 17:20:37 -07002592 movups $inout0,($out) # store one output block
Adam Langleyd9e397b2015-01-22 14:27:53 -08002593 movdqa @tweak[2],@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07002594 lea 16*1($out),$out # $out+=1*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002595 jmp .Lxts_dec_done
2596
2597.align 16
2598.Lxts_dec_two:
2599 movups ($inp),$inout0
2600 movups 16($inp),$inout1
Adam Langleye9ada862015-05-11 17:20:37 -07002601 lea 32($inp),$inp # $inp+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002602 xorps @tweak[0],$inout0
2603 xorps @tweak[1],$inout1
2604
2605 call _aesni_decrypt2
2606
2607 xorps @tweak[0],$inout0
2608 movdqa @tweak[2],@tweak[0]
2609 xorps @tweak[1],$inout1
2610 movdqa @tweak[3],@tweak[1]
Adam Langleye9ada862015-05-11 17:20:37 -07002611 movups $inout0,($out) # store 2 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002612 movups $inout1,16*1($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002613 lea 16*2($out),$out # $out+=2*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002614 jmp .Lxts_dec_done
2615
2616.align 16
2617.Lxts_dec_three:
2618 movups ($inp),$inout0
2619 movups 16*1($inp),$inout1
2620 movups 16*2($inp),$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002621 lea 16*3($inp),$inp # $inp+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002622 xorps @tweak[0],$inout0
2623 xorps @tweak[1],$inout1
2624 xorps @tweak[2],$inout2
2625
2626 call _aesni_decrypt3
2627
2628 xorps @tweak[0],$inout0
2629 movdqa @tweak[3],@tweak[0]
2630 xorps @tweak[1],$inout1
2631 movdqa @tweak[4],@tweak[1]
2632 xorps @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002633 movups $inout0,($out) # store 3 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002634 movups $inout1,16*1($out)
2635 movups $inout2,16*2($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002636 lea 16*3($out),$out # $out+=3*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002637 jmp .Lxts_dec_done
2638
2639.align 16
2640.Lxts_dec_four:
2641 movups ($inp),$inout0
2642 movups 16*1($inp),$inout1
2643 movups 16*2($inp),$inout2
2644 xorps @tweak[0],$inout0
2645 movups 16*3($inp),$inout3
Adam Langleye9ada862015-05-11 17:20:37 -07002646 lea 16*4($inp),$inp # $inp+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002647 xorps @tweak[1],$inout1
2648 xorps @tweak[2],$inout2
2649 xorps @tweak[3],$inout3
2650
2651 call _aesni_decrypt4
2652
2653 pxor @tweak[0],$inout0
2654 movdqa @tweak[4],@tweak[0]
2655 pxor @tweak[1],$inout1
2656 movdqa @tweak[5],@tweak[1]
2657 pxor @tweak[2],$inout2
Adam Langleye9ada862015-05-11 17:20:37 -07002658 movdqu $inout0,($out) # store 4 output blocks
Adam Langleyd9e397b2015-01-22 14:27:53 -08002659 pxor @tweak[3],$inout3
2660 movdqu $inout1,16*1($out)
2661 movdqu $inout2,16*2($out)
2662 movdqu $inout3,16*3($out)
Adam Langleye9ada862015-05-11 17:20:37 -07002663 lea 16*4($out),$out # $out+=4*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002664 jmp .Lxts_dec_done
2665
2666.align 16
2667.Lxts_dec_done:
Adam Langleye9ada862015-05-11 17:20:37 -07002668 and \$15,$len_ # see if $len%16 is 0
Adam Langleyd9e397b2015-01-22 14:27:53 -08002669 jz .Lxts_dec_ret
2670.Lxts_dec_done2:
2671 mov $len_,$len
2672 mov $key_,$key # restore $key
2673 mov $rnds_,$rounds # restore $rounds
2674
2675 movups ($inp),$inout0
2676 xorps @tweak[1],$inout0
2677___
2678 &aesni_generate1("dec",$key,$rounds);
2679$code.=<<___;
2680 xorps @tweak[1],$inout0
2681 movups $inout0,($out)
2682
2683.Lxts_dec_steal:
2684 movzb 16($inp),%eax # borrow $rounds ...
2685 movzb ($out),%ecx # ... and $key
2686 lea 1($inp),$inp
2687 mov %al,($out)
2688 mov %cl,16($out)
2689 lea 1($out),$out
2690 sub \$1,$len
2691 jnz .Lxts_dec_steal
2692
2693 sub $len_,$out # rewind $out
2694 mov $key_,$key # restore $key
2695 mov $rnds_,$rounds # restore $rounds
2696
2697 movups ($out),$inout0
2698 xorps @tweak[0],$inout0
2699___
2700 &aesni_generate1("dec",$key,$rounds);
2701$code.=<<___;
2702 xorps @tweak[0],$inout0
2703 movups $inout0,($out)
2704
2705.Lxts_dec_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07002706 xorps %xmm0,%xmm0 # clear register bank
2707 pxor %xmm1,%xmm1
2708 pxor %xmm2,%xmm2
2709 pxor %xmm3,%xmm3
2710 pxor %xmm4,%xmm4
2711 pxor %xmm5,%xmm5
2712___
2713$code.=<<___ if (!$win64);
2714 pxor %xmm6,%xmm6
2715 pxor %xmm7,%xmm7
2716 movaps %xmm0,0x00(%rsp) # clear stack
2717 pxor %xmm8,%xmm8
2718 movaps %xmm0,0x10(%rsp)
2719 pxor %xmm9,%xmm9
2720 movaps %xmm0,0x20(%rsp)
2721 pxor %xmm10,%xmm10
2722 movaps %xmm0,0x30(%rsp)
2723 pxor %xmm11,%xmm11
2724 movaps %xmm0,0x40(%rsp)
2725 pxor %xmm12,%xmm12
2726 movaps %xmm0,0x50(%rsp)
2727 pxor %xmm13,%xmm13
2728 movaps %xmm0,0x60(%rsp)
2729 pxor %xmm14,%xmm14
2730 pxor %xmm15,%xmm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002731___
2732$code.=<<___ if ($win64);
Robert Sloana94fe052017-02-21 08:49:28 -08002733 movaps -0xa8(%r11),%xmm6
2734 movaps %xmm0,-0xa8(%r11) # clear stack
2735 movaps -0x98(%r11),%xmm7
2736 movaps %xmm0,-0x98(%r11)
2737 movaps -0x88(%r11),%xmm8
2738 movaps %xmm0,-0x88(%r11)
2739 movaps -0x78(%r11),%xmm9
2740 movaps %xmm0,-0x78(%r11)
2741 movaps -0x68(%r11),%xmm10
2742 movaps %xmm0,-0x68(%r11)
2743 movaps -0x58(%r11),%xmm11
2744 movaps %xmm0,-0x58(%r11)
2745 movaps -0x48(%r11),%xmm12
2746 movaps %xmm0,-0x48(%r11)
2747 movaps -0x38(%r11),%xmm13
2748 movaps %xmm0,-0x38(%r11)
2749 movaps -0x28(%r11),%xmm14
2750 movaps %xmm0,-0x28(%r11)
2751 movaps -0x18(%r11),%xmm15
2752 movaps %xmm0,-0x18(%r11)
Adam Langleye9ada862015-05-11 17:20:37 -07002753 movaps %xmm0,0x00(%rsp)
2754 movaps %xmm0,0x10(%rsp)
2755 movaps %xmm0,0x20(%rsp)
2756 movaps %xmm0,0x30(%rsp)
2757 movaps %xmm0,0x40(%rsp)
2758 movaps %xmm0,0x50(%rsp)
2759 movaps %xmm0,0x60(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002760___
2761$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08002762 mov -8(%r11),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002763.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002764 lea (%r11),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07002765.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002766.Lxts_dec_epilogue:
2767 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07002768.cfi_endproc
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01002769.size ${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt
Adam Langleyd9e397b2015-01-22 14:27:53 -08002770___
Robert Sloana94fe052017-02-21 08:49:28 -08002771}
2772
2773######################################################################
2774# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2775# const AES_KEY *key, unsigned int start_block_num,
2776# unsigned char offset_i[16], const unsigned char L_[][16],
2777# unsigned char checksum[16]);
2778#
Robert Sloan4c22c5f2019-03-01 15:53:37 -08002779if (0) { # Omit these functions in BoringSSL
# Register and stack-slot bindings interpolated into the OCB assembly below.
# Six offset registers, %xmm10 through %xmm15.
Robert Sloana94fe052017-02-21 08:49:28 -08002780my @offset=map("%xmm$_",(10..15));
# Running checksum, plus a scratch register that the code below loads with
# round[0] and then turns into round[0]^round[last].
2781my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2782my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
# The 7th and 8th arguments are fetched from the stack into these registers.
2783my ($L_p,$checksum_p) = ("%rbx","%rbp");
# Scratch registers holding scaled L_ table indices (ntz(block) << 4).
2784my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
# Displacement of the 7th argument from the stack pointer captured at entry.
2785my $seventh_arg = $win64 ? 56 : 8;
# The prototype above counts whole blocks; reuse the register bound to $len.
2786my $blocks = $len;
2787
2788$code.=<<___;
# OCB bulk encryption entry point (see the prototype in the comment above).
# Flow: save callee-saved registers, set up round keys, offset and checksum,
# encrypt one leading block when the starting block number is even, then
# process six blocks per iteration and finish with a tail of up to five.
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01002789.globl ${PREFIX}_ocb_encrypt
2790.type ${PREFIX}_ocb_encrypt,\@function,6
Robert Sloana94fe052017-02-21 08:49:28 -08002791.align 32
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01002792${PREFIX}_ocb_encrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07002793.cfi_startproc
# keep the entry stack pointer in rax; the stack arguments are read through it
Robert Sloana94fe052017-02-21 08:49:28 -08002794 lea (%rsp),%rax
2795 push %rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07002796.cfi_push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002797 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07002798.cfi_push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002799 push %r12
Robert Sloanab8b8882018-03-26 11:39:51 -07002800.cfi_push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08002801 push %r13
Robert Sloanab8b8882018-03-26 11:39:51 -07002802.cfi_push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08002803 push %r14
Robert Sloanab8b8882018-03-26 11:39:51 -07002804.cfi_push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08002805___
# Win64 only: reserve 0xa0 bytes and spill xmm6..xmm15 there.
2806$code.=<<___ if ($win64);
2807 lea -0xa0(%rsp),%rsp
2808 movaps %xmm6,0x00(%rsp) # offload everything
2809 movaps %xmm7,0x10(%rsp)
2810 movaps %xmm8,0x20(%rsp)
2811 movaps %xmm9,0x30(%rsp)
2812 movaps %xmm10,0x40(%rsp)
2813 movaps %xmm11,0x50(%rsp)
2814 movaps %xmm12,0x60(%rsp)
2815 movaps %xmm13,0x70(%rsp)
2816 movaps %xmm14,0x80(%rsp)
2817 movaps %xmm15,0x90(%rsp)
2818.Locb_enc_body:
2819___
2820$code.=<<___;
2821 mov $seventh_arg(%rax),$L_p # 7th argument
2822 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2823
# NOTE(review): offset 240 is presumably the round-count field of the key
# structure; confirm against the AES_KEY layout.
2824 mov 240($key),$rnds_
2825 mov $key,$key_
2826 shl \$4,$rnds_
2827 $movkey ($key),$rndkey0l # round[0]
2828 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2829
2830 movdqu ($offset_p),@offset[5] # load last offset_i
2831 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2832 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2833
# key now points 32 bytes past round[last]; the helpers index the remaining
# round keys relative to it with the negative counter kept in rax/r10
2834 mov \$16+32,$rounds
2835 lea 32($key_,$rnds_),$key
2836 $movkey 16($key_),$rndkey1 # round[1]
2837 sub %r10,%rax # twisted $rounds
2838 mov %rax,%r10 # backup twisted $rounds
2839
2840 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2841 movdqu ($checksum_p),$checksum # load checksum
2842
2843 test \$1,$block_num # is first block number odd?
2844 jnz .Locb_enc_odd
2845
# even starting block: encrypt it alone with L[ntz(block_num)] so that the
# bulk code below always starts on an odd block number
2846 bsf $block_num,$i1
2847 add \$1,$block_num
2848 shl \$4,$i1
2849 movdqu ($L_p,$i1),$inout5 # borrow
2850 movdqu ($inp),$inout0
2851 lea 16($inp),$inp
2852
2853 call __ocb_encrypt1
2854
# the one-block helper returns the new offset ^ round[last] in inout5
2855 movdqa $inout5,@offset[5]
2856 movups $inout0,($out)
2857 lea 16($out),$out
2858 sub \$1,$blocks
2859 jz .Locb_enc_done
2860
2861.Locb_enc_odd:
2862 lea 1($block_num),$i1 # even-numbered blocks
2863 lea 3($block_num),$i3
2864 lea 5($block_num),$i5
2865 lea 6($block_num),$block_num
2866 bsf $i1,$i1 # ntz(block)
2867 bsf $i3,$i3
2868 bsf $i5,$i5
2869 shl \$4,$i1 # ntz(block) -> table offset
2870 shl \$4,$i3
2871 shl \$4,$i5
2872
# a borrow here means fewer than six blocks remain: go to the tail code
2873 sub \$6,$blocks
2874 jc .Locb_enc_short
2875 jmp .Locb_enc_grandloop
2876
2877.align 32
2878.Locb_enc_grandloop:
2879 movdqu `16*0`($inp),$inout0 # load input
2880 movdqu `16*1`($inp),$inout1
2881 movdqu `16*2`($inp),$inout2
2882 movdqu `16*3`($inp),$inout3
2883 movdqu `16*4`($inp),$inout4
2884 movdqu `16*5`($inp),$inout5
2885 lea `16*6`($inp),$inp
2886
2887 call __ocb_encrypt6
2888
2889 movups $inout0,`16*0`($out) # store output
2890 movups $inout1,`16*1`($out)
2891 movups $inout2,`16*2`($out)
2892 movups $inout3,`16*3`($out)
2893 movups $inout4,`16*4`($out)
2894 movups $inout5,`16*5`($out)
2895 lea `16*6`($out),$out
2896 sub \$6,$blocks
2897 jnc .Locb_enc_grandloop
2898
2899.Locb_enc_short:
# undo the last subtraction to recover the real remaining count (0..5)
2900 add \$6,$blocks
2901 jz .Locb_enc_done
2902
# dispatch on the remaining count; input blocks are loaded between the
# compare and the conditional jumps that consume its flags
2903 movdqu `16*0`($inp),$inout0
2904 cmp \$2,$blocks
2905 jb .Locb_enc_one
2906 movdqu `16*1`($inp),$inout1
2907 je .Locb_enc_two
2908
2909 movdqu `16*2`($inp),$inout2
2910 cmp \$4,$blocks
2911 jb .Locb_enc_three
2912 movdqu `16*3`($inp),$inout3
2913 je .Locb_enc_four
2914
# five blocks left: use the six-block helper with an all-zero sixth block
# (XORing zero leaves the checksum unchanged) and discard its sixth output
2915 movdqu `16*4`($inp),$inout4
2916 pxor $inout5,$inout5
2917
2918 call __ocb_encrypt6
2919
# the offset of the fifth block is the last one actually consumed
2920 movdqa @offset[4],@offset[5]
2921 movups $inout0,`16*0`($out)
2922 movups $inout1,`16*1`($out)
2923 movups $inout2,`16*2`($out)
2924 movups $inout3,`16*3`($out)
2925 movups $inout4,`16*4`($out)
2926
2927 jmp .Locb_enc_done
2928
2929.align 16
2930.Locb_enc_one:
2931 movdqa @offset[0],$inout5 # borrow
2932
2933 call __ocb_encrypt1
2934
2935 movdqa $inout5,@offset[5]
2936 movups $inout0,`16*0`($out)
2937 jmp .Locb_enc_done
2938
2939.align 16
2940.Locb_enc_two:
# pad the unused third and fourth lanes of the four-block helper with zeros
2941 pxor $inout2,$inout2
2942 pxor $inout3,$inout3
2943
2944 call __ocb_encrypt4
2945
2946 movdqa @offset[1],@offset[5]
2947 movups $inout0,`16*0`($out)
2948 movups $inout1,`16*1`($out)
2949
2950 jmp .Locb_enc_done
2951
2952.align 16
2953.Locb_enc_three:
2954 pxor $inout3,$inout3
2955
2956 call __ocb_encrypt4
2957
2958 movdqa @offset[2],@offset[5]
2959 movups $inout0,`16*0`($out)
2960 movups $inout1,`16*1`($out)
2961 movups $inout2,`16*2`($out)
2962
2963 jmp .Locb_enc_done
2964
2965.align 16
2966.Locb_enc_four:
2967 call __ocb_encrypt4
2968
2969 movdqa @offset[3],@offset[5]
2970 movups $inout0,`16*0`($out)
2971 movups $inout1,`16*1`($out)
2972 movups $inout2,`16*2`($out)
2973 movups $inout3,`16*3`($out)
2974
2975.Locb_enc_done:
2976 pxor $rndkey0,@offset[5] # "remove" round[last]
2977 movdqu $checksum,($checksum_p) # store checksum
2978 movdqu @offset[5],($offset_p) # store last offset_i
2979
2980 xorps %xmm0,%xmm0 # clear register bank
2981 pxor %xmm1,%xmm1
2982 pxor %xmm2,%xmm2
2983 pxor %xmm3,%xmm3
2984 pxor %xmm4,%xmm4
2985 pxor %xmm5,%xmm5
2986___
2987$code.=<<___ if (!$win64);
2988 pxor %xmm6,%xmm6
2989 pxor %xmm7,%xmm7
2990 pxor %xmm8,%xmm8
2991 pxor %xmm9,%xmm9
2992 pxor %xmm10,%xmm10
2993 pxor %xmm11,%xmm11
2994 pxor %xmm12,%xmm12
2995 pxor %xmm13,%xmm13
2996 pxor %xmm14,%xmm14
2997 pxor %xmm15,%xmm15
# 0x28 skips the five 8-byte pushes made at entry
2998 lea 0x28(%rsp),%rax
Robert Sloanab8b8882018-03-26 11:39:51 -07002999.cfi_def_cfa %rax,8
Robert Sloana94fe052017-02-21 08:49:28 -08003000___
# Win64 only: reload xmm6..xmm15 from the spill area and zero each slot.
3001$code.=<<___ if ($win64);
3002 movaps 0x00(%rsp),%xmm6
3003 movaps %xmm0,0x00(%rsp) # clear stack
3004 movaps 0x10(%rsp),%xmm7
3005 movaps %xmm0,0x10(%rsp)
3006 movaps 0x20(%rsp),%xmm8
3007 movaps %xmm0,0x20(%rsp)
3008 movaps 0x30(%rsp),%xmm9
3009 movaps %xmm0,0x30(%rsp)
3010 movaps 0x40(%rsp),%xmm10
3011 movaps %xmm0,0x40(%rsp)
3012 movaps 0x50(%rsp),%xmm11
3013 movaps %xmm0,0x50(%rsp)
3014 movaps 0x60(%rsp),%xmm12
3015 movaps %xmm0,0x60(%rsp)
3016 movaps 0x70(%rsp),%xmm13
3017 movaps %xmm0,0x70(%rsp)
3018 movaps 0x80(%rsp),%xmm14
3019 movaps %xmm0,0x80(%rsp)
3020 movaps 0x90(%rsp),%xmm15
3021 movaps %xmm0,0x90(%rsp)
3022 lea 0xa0+0x28(%rsp),%rax
3023.Locb_enc_pop:
3024___
3025$code.=<<___;
# rax is the stack pointer value from entry; restore the pushed registers
# through it and then reset rsp
3026 mov -40(%rax),%r14
Robert Sloanab8b8882018-03-26 11:39:51 -07003027.cfi_restore %r14
Robert Sloana94fe052017-02-21 08:49:28 -08003028 mov -32(%rax),%r13
Robert Sloanab8b8882018-03-26 11:39:51 -07003029.cfi_restore %r13
Robert Sloana94fe052017-02-21 08:49:28 -08003030 mov -24(%rax),%r12
Robert Sloanab8b8882018-03-26 11:39:51 -07003031.cfi_restore %r12
Robert Sloana94fe052017-02-21 08:49:28 -08003032 mov -16(%rax),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003033.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08003034 mov -8(%rax),%rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07003035.cfi_restore %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08003036 lea (%rax),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07003037.cfi_def_cfa_register %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08003038.Locb_enc_epilogue:
3039 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07003040.cfi_endproc
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01003041.size ${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt
Robert Sloana94fe052017-02-21 08:49:28 -08003042
# Internal helper: OCB-encrypt six blocks held in inout0..inout5, in place.
# Inputs set up by the caller: offset[5] = previous offset ^ round[last],
# offset[0] = L_0, rndkey0l = round[0] ^ round[last], rndkey1 = round[1],
# i1/i3/i5 = scaled L_ table indices, rax = negative round-key index.
# On return offset[1..5] hold the per-block offsets ^ round[last], offset[0]
# is reloaded with L_0, i1/i3/i5 and block_num are advanced for the next
# group of six, and rax is restored from r10.
3043.type __ocb_encrypt6,\@abi-omnipotent
3044.align 32
3045__ocb_encrypt6:
# build the chain: each offset is the previous one XOR an L_ table entry
# (L_0 for the odd-numbered blocks, the entries at i1/i3/i5 for the others)
3046 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3047 movdqu ($L_p,$i1),@offset[1]
3048 movdqa @offset[0],@offset[2]
3049 movdqu ($L_p,$i3),@offset[3]
3050 movdqa @offset[0],@offset[4]
3051 pxor @offset[5],@offset[0]
3052 movdqu ($L_p,$i5),@offset[5]
3053 pxor @offset[0],@offset[1]
3054 pxor $inout0,$checksum # accumulate checksum
3055 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3056 pxor @offset[1],@offset[2]
3057 pxor $inout1,$checksum
3058 pxor @offset[1],$inout1
3059 pxor @offset[2],@offset[3]
3060 pxor $inout2,$checksum
3061 pxor @offset[2],$inout2
3062 pxor @offset[3],@offset[4]
3063 pxor $inout3,$checksum
3064 pxor @offset[3],$inout3
3065 pxor @offset[4],@offset[5]
3066 pxor $inout4,$checksum
3067 pxor @offset[4],$inout4
3068 pxor $inout5,$checksum
3069 pxor @offset[5],$inout5
3070 $movkey 32($key_),$rndkey0
3071
# start computing the table indices for the next call, interleaved with the
# first AES rounds; XORing rndkey0l swaps round[0] for round[last] in each
# offset so it can be fed directly to aesenclast
3072 lea 1($block_num),$i1 # even-numbered blocks
3073 lea 3($block_num),$i3
3074 lea 5($block_num),$i5
3075 add \$6,$block_num
3076 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3077 bsf $i1,$i1 # ntz(block)
3078 bsf $i3,$i3
3079 bsf $i5,$i5
3080
3081 aesenc $rndkey1,$inout0
3082 aesenc $rndkey1,$inout1
3083 aesenc $rndkey1,$inout2
3084 aesenc $rndkey1,$inout3
3085 pxor $rndkey0l,@offset[1]
3086 pxor $rndkey0l,@offset[2]
3087 aesenc $rndkey1,$inout4
3088 pxor $rndkey0l,@offset[3]
3089 pxor $rndkey0l,@offset[4]
3090 aesenc $rndkey1,$inout5
3091 $movkey 48($key_),$rndkey1
3092 pxor $rndkey0l,@offset[5]
3093
3094 aesenc $rndkey0,$inout0
3095 aesenc $rndkey0,$inout1
3096 aesenc $rndkey0,$inout2
3097 aesenc $rndkey0,$inout3
3098 aesenc $rndkey0,$inout4
3099 aesenc $rndkey0,$inout5
3100 $movkey 64($key_),$rndkey0
3101 shl \$4,$i1 # ntz(block) -> table offset
3102 shl \$4,$i3
3103 jmp .Locb_enc_loop6
3104
# two rounds per iteration; rax counts up by 32 and the jnz tests the flags
# left by that add, so the loop ends when rax reaches zero
3105.align 32
3106.Locb_enc_loop6:
3107 aesenc $rndkey1,$inout0
3108 aesenc $rndkey1,$inout1
3109 aesenc $rndkey1,$inout2
3110 aesenc $rndkey1,$inout3
3111 aesenc $rndkey1,$inout4
3112 aesenc $rndkey1,$inout5
3113 $movkey ($key,%rax),$rndkey1
3114 add \$32,%rax
3115
3116 aesenc $rndkey0,$inout0
3117 aesenc $rndkey0,$inout1
3118 aesenc $rndkey0,$inout2
3119 aesenc $rndkey0,$inout3
3120 aesenc $rndkey0,$inout4
3121 aesenc $rndkey0,$inout5
3122 $movkey -16($key,%rax),$rndkey0
3123 jnz .Locb_enc_loop6
3124
3125 aesenc $rndkey1,$inout0
3126 aesenc $rndkey1,$inout1
3127 aesenc $rndkey1,$inout2
3128 aesenc $rndkey1,$inout3
3129 aesenc $rndkey1,$inout4
3130 aesenc $rndkey1,$inout5
3131 $movkey 16($key_),$rndkey1
3132 shl \$4,$i5
3133
# final round uses offset ^ round[last] as its key, which applies the last
# round key and the output offset in a single instruction
3134 aesenclast @offset[0],$inout0
3135 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3136 mov %r10,%rax # restore twisted rounds
3137 aesenclast @offset[1],$inout1
3138 aesenclast @offset[2],$inout2
3139 aesenclast @offset[3],$inout3
3140 aesenclast @offset[4],$inout4
3141 aesenclast @offset[5],$inout5
3142 ret
3143.size __ocb_encrypt6,.-__ocb_encrypt6
3144
# Internal helper: OCB-encrypt four blocks held in inout0..inout3, in place.
# Same register contract as the six-block helper, except that it does not
# advance block_num or i1/i3 and does not reload L_0 into offset[0].
# On return offset[0..3] hold the per-block offsets ^ round[last]; offset[5]
# is left as previous offset ^ round[0], so the caller copies the offset it
# wants to keep into offset[5].
3145.type __ocb_encrypt4,\@abi-omnipotent
3146.align 32
3147__ocb_encrypt4:
3148 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3149 movdqu ($L_p,$i1),@offset[1]
3150 movdqa @offset[0],@offset[2]
3151 movdqu ($L_p,$i3),@offset[3]
3152 pxor @offset[5],@offset[0]
3153 pxor @offset[0],@offset[1]
3154 pxor $inout0,$checksum # accumulate checksum
3155 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3156 pxor @offset[1],@offset[2]
3157 pxor $inout1,$checksum
3158 pxor @offset[1],$inout1
3159 pxor @offset[2],@offset[3]
3160 pxor $inout2,$checksum
3161 pxor @offset[2],$inout2
3162 pxor $inout3,$checksum
3163 pxor @offset[3],$inout3
3164 $movkey 32($key_),$rndkey0
3165
3166 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3167 pxor $rndkey0l,@offset[1]
3168 pxor $rndkey0l,@offset[2]
3169 pxor $rndkey0l,@offset[3]
3170
3171 aesenc $rndkey1,$inout0
3172 aesenc $rndkey1,$inout1
3173 aesenc $rndkey1,$inout2
3174 aesenc $rndkey1,$inout3
3175 $movkey 48($key_),$rndkey1
3176
3177 aesenc $rndkey0,$inout0
3178 aesenc $rndkey0,$inout1
3179 aesenc $rndkey0,$inout2
3180 aesenc $rndkey0,$inout3
3181 $movkey 64($key_),$rndkey0
3182 jmp .Locb_enc_loop4
3183
# two rounds per iteration; the jnz tests the flags left by the add to rax
3184.align 32
3185.Locb_enc_loop4:
3186 aesenc $rndkey1,$inout0
3187 aesenc $rndkey1,$inout1
3188 aesenc $rndkey1,$inout2
3189 aesenc $rndkey1,$inout3
3190 $movkey ($key,%rax),$rndkey1
3191 add \$32,%rax
3192
3193 aesenc $rndkey0,$inout0
3194 aesenc $rndkey0,$inout1
3195 aesenc $rndkey0,$inout2
3196 aesenc $rndkey0,$inout3
3197 $movkey -16($key,%rax),$rndkey0
3198 jnz .Locb_enc_loop4
3199
3200 aesenc $rndkey1,$inout0
3201 aesenc $rndkey1,$inout1
3202 aesenc $rndkey1,$inout2
3203 aesenc $rndkey1,$inout3
3204 $movkey 16($key_),$rndkey1
3205 mov %r10,%rax # restore twisted rounds
3206
# final round keyed with offset ^ round[last]
3207 aesenclast @offset[0],$inout0
3208 aesenclast @offset[1],$inout1
3209 aesenclast @offset[2],$inout2
3210 aesenclast @offset[3],$inout3
3211 ret
3212.size __ocb_encrypt4,.-__ocb_encrypt4
3213
# Internal helper: OCB-encrypt the single block in inout0, in place.
# The caller passes the L_ table entry for this block in inout5 and the
# previous offset ^ round[last] in offset[5]. On return inout5 holds the
# new offset ^ round[last]; offset[5] itself is not modified here.
3214.type __ocb_encrypt1,\@abi-omnipotent
3215.align 32
3216__ocb_encrypt1:
3217 pxor @offset[5],$inout5 # offset_i
3218 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3219 pxor $inout0,$checksum # accumulate checksum
3220 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3221 $movkey 32($key_),$rndkey0
3222
3223 aesenc $rndkey1,$inout0
3224 $movkey 48($key_),$rndkey1
3225 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3226
3227 aesenc $rndkey0,$inout0
3228 $movkey 64($key_),$rndkey0
3229 jmp .Locb_enc_loop1
3230
# two rounds per iteration; the jnz tests the flags left by the add to rax
3231.align 32
3232.Locb_enc_loop1:
3233 aesenc $rndkey1,$inout0
3234 $movkey ($key,%rax),$rndkey1
3235 add \$32,%rax
3236
3237 aesenc $rndkey0,$inout0
3238 $movkey -16($key,%rax),$rndkey0
3239 jnz .Locb_enc_loop1
3240
3241 aesenc $rndkey1,$inout0
3242 $movkey 16($key_),$rndkey1 # redundant in tail
3243 mov %r10,%rax # restore twisted rounds
3244
# final round keyed with offset ^ round[last]
3245 aesenclast $inout5,$inout0
3246 ret
3247.size __ocb_encrypt1,.-__ocb_encrypt1
3248
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01003249.globl ${PREFIX}_ocb_decrypt
3250.type ${PREFIX}_ocb_decrypt,\@function,6
Robert Sloana94fe052017-02-21 08:49:28 -08003251.align 32
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01003252${PREFIX}_ocb_decrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07003253.cfi_startproc
Robert Sloana94fe052017-02-21 08:49:28 -08003254 lea (%rsp),%rax
3255 push %rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07003256.cfi_push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08003257 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003258.cfi_push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08003259 push %r12
Robert Sloanab8b8882018-03-26 11:39:51 -07003260.cfi_push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08003261 push %r13
Robert Sloanab8b8882018-03-26 11:39:51 -07003262.cfi_push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08003263 push %r14
Robert Sloanab8b8882018-03-26 11:39:51 -07003264.cfi_push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08003265___
3266$code.=<<___ if ($win64);
3267 lea -0xa0(%rsp),%rsp
3268 movaps %xmm6,0x00(%rsp) # offload everything
3269 movaps %xmm7,0x10(%rsp)
3270 movaps %xmm8,0x20(%rsp)
3271 movaps %xmm9,0x30(%rsp)
3272 movaps %xmm10,0x40(%rsp)
3273 movaps %xmm11,0x50(%rsp)
3274 movaps %xmm12,0x60(%rsp)
3275 movaps %xmm13,0x70(%rsp)
3276 movaps %xmm14,0x80(%rsp)
3277 movaps %xmm15,0x90(%rsp)
3278.Locb_dec_body:
3279___
3280$code.=<<___;
3281 mov $seventh_arg(%rax),$L_p # 7th argument
3282 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3283
3284 mov 240($key),$rnds_
3285 mov $key,$key_
3286 shl \$4,$rnds_
3287 $movkey ($key),$rndkey0l # round[0]
3288 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3289
3290 movdqu ($offset_p),@offset[5] # load last offset_i
3291 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3292 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3293
3294 mov \$16+32,$rounds
3295 lea 32($key_,$rnds_),$key
3296 $movkey 16($key_),$rndkey1 # round[1]
3297 sub %r10,%rax # twisted $rounds
3298 mov %rax,%r10 # backup twisted $rounds
3299
3300 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3301 movdqu ($checksum_p),$checksum # load checksum
3302
3303 test \$1,$block_num # is first block number odd?
3304 jnz .Locb_dec_odd
3305
3306 bsf $block_num,$i1
3307 add \$1,$block_num
3308 shl \$4,$i1
3309 movdqu ($L_p,$i1),$inout5 # borrow
3310 movdqu ($inp),$inout0
3311 lea 16($inp),$inp
3312
3313 call __ocb_decrypt1
3314
3315 movdqa $inout5,@offset[5]
3316 movups $inout0,($out)
3317 xorps $inout0,$checksum # accumulate checksum
3318 lea 16($out),$out
3319 sub \$1,$blocks
3320 jz .Locb_dec_done
3321
3322.Locb_dec_odd:
3323 lea 1($block_num),$i1 # even-numbered blocks
3324 lea 3($block_num),$i3
3325 lea 5($block_num),$i5
3326 lea 6($block_num),$block_num
3327 bsf $i1,$i1 # ntz(block)
3328 bsf $i3,$i3
3329 bsf $i5,$i5
3330 shl \$4,$i1 # ntz(block) -> table offset
3331 shl \$4,$i3
3332 shl \$4,$i5
3333
3334 sub \$6,$blocks
3335 jc .Locb_dec_short
3336 jmp .Locb_dec_grandloop
3337
3338.align 32
3339.Locb_dec_grandloop:
3340 movdqu `16*0`($inp),$inout0 # load input
3341 movdqu `16*1`($inp),$inout1
3342 movdqu `16*2`($inp),$inout2
3343 movdqu `16*3`($inp),$inout3
3344 movdqu `16*4`($inp),$inout4
3345 movdqu `16*5`($inp),$inout5
3346 lea `16*6`($inp),$inp
3347
3348 call __ocb_decrypt6
3349
3350 movups $inout0,`16*0`($out) # store output
3351 pxor $inout0,$checksum # accumulate checksum
3352 movups $inout1,`16*1`($out)
3353 pxor $inout1,$checksum
3354 movups $inout2,`16*2`($out)
3355 pxor $inout2,$checksum
3356 movups $inout3,`16*3`($out)
3357 pxor $inout3,$checksum
3358 movups $inout4,`16*4`($out)
3359 pxor $inout4,$checksum
3360 movups $inout5,`16*5`($out)
3361 pxor $inout5,$checksum
3362 lea `16*6`($out),$out
3363 sub \$6,$blocks
3364 jnc .Locb_dec_grandloop
3365
3366.Locb_dec_short:
3367 add \$6,$blocks
3368 jz .Locb_dec_done
3369
3370 movdqu `16*0`($inp),$inout0
3371 cmp \$2,$blocks
3372 jb .Locb_dec_one
3373 movdqu `16*1`($inp),$inout1
3374 je .Locb_dec_two
3375
3376 movdqu `16*2`($inp),$inout2
3377 cmp \$4,$blocks
3378 jb .Locb_dec_three
3379 movdqu `16*3`($inp),$inout3
3380 je .Locb_dec_four
3381
3382 movdqu `16*4`($inp),$inout4
3383 pxor $inout5,$inout5
3384
3385 call __ocb_decrypt6
3386
3387 movdqa @offset[4],@offset[5]
3388 movups $inout0,`16*0`($out) # store output
3389 pxor $inout0,$checksum # accumulate checksum
3390 movups $inout1,`16*1`($out)
3391 pxor $inout1,$checksum
3392 movups $inout2,`16*2`($out)
3393 pxor $inout2,$checksum
3394 movups $inout3,`16*3`($out)
3395 pxor $inout3,$checksum
3396 movups $inout4,`16*4`($out)
3397 pxor $inout4,$checksum
3398
3399 jmp .Locb_dec_done
3400
3401.align 16
3402.Locb_dec_one:
3403 movdqa @offset[0],$inout5 # borrow
3404
3405 call __ocb_decrypt1
3406
3407 movdqa $inout5,@offset[5]
3408 movups $inout0,`16*0`($out) # store output
3409 xorps $inout0,$checksum # accumulate checksum
3410 jmp .Locb_dec_done
3411
3412.align 16
3413.Locb_dec_two:
3414 pxor $inout2,$inout2
3415 pxor $inout3,$inout3
3416
3417 call __ocb_decrypt4
3418
3419 movdqa @offset[1],@offset[5]
3420 movups $inout0,`16*0`($out) # store output
3421 xorps $inout0,$checksum # accumulate checksum
3422 movups $inout1,`16*1`($out)
3423 xorps $inout1,$checksum
3424
3425 jmp .Locb_dec_done
3426
3427.align 16
3428.Locb_dec_three:
3429 pxor $inout3,$inout3
3430
3431 call __ocb_decrypt4
3432
3433 movdqa @offset[2],@offset[5]
3434 movups $inout0,`16*0`($out) # store output
3435 xorps $inout0,$checksum # accumulate checksum
3436 movups $inout1,`16*1`($out)
3437 xorps $inout1,$checksum
3438 movups $inout2,`16*2`($out)
3439 xorps $inout2,$checksum
3440
3441 jmp .Locb_dec_done
3442
3443.align 16
3444.Locb_dec_four:
3445 call __ocb_decrypt4
3446
3447 movdqa @offset[3],@offset[5]
3448 movups $inout0,`16*0`($out) # store output
3449 pxor $inout0,$checksum # accumulate checksum
3450 movups $inout1,`16*1`($out)
3451 pxor $inout1,$checksum
3452 movups $inout2,`16*2`($out)
3453 pxor $inout2,$checksum
3454 movups $inout3,`16*3`($out)
3455 pxor $inout3,$checksum
3456
3457.Locb_dec_done:
3458 pxor $rndkey0,@offset[5] # "remove" round[last]
3459 movdqu $checksum,($checksum_p) # store checksum
3460 movdqu @offset[5],($offset_p) # store last offset_i
3461
3462 xorps %xmm0,%xmm0 # clear register bank
3463 pxor %xmm1,%xmm1
3464 pxor %xmm2,%xmm2
3465 pxor %xmm3,%xmm3
3466 pxor %xmm4,%xmm4
3467 pxor %xmm5,%xmm5
3468___
3469$code.=<<___ if (!$win64);
3470 pxor %xmm6,%xmm6
3471 pxor %xmm7,%xmm7
3472 pxor %xmm8,%xmm8
3473 pxor %xmm9,%xmm9
3474 pxor %xmm10,%xmm10
3475 pxor %xmm11,%xmm11
3476 pxor %xmm12,%xmm12
3477 pxor %xmm13,%xmm13
3478 pxor %xmm14,%xmm14
3479 pxor %xmm15,%xmm15
3480 lea 0x28(%rsp),%rax
Robert Sloanab8b8882018-03-26 11:39:51 -07003481.cfi_def_cfa %rax,8
Robert Sloana94fe052017-02-21 08:49:28 -08003482___
3483$code.=<<___ if ($win64);
3484 movaps 0x00(%rsp),%xmm6
3485 movaps %xmm0,0x00(%rsp) # clear stack
3486 movaps 0x10(%rsp),%xmm7
3487 movaps %xmm0,0x10(%rsp)
3488 movaps 0x20(%rsp),%xmm8
3489 movaps %xmm0,0x20(%rsp)
3490 movaps 0x30(%rsp),%xmm9
3491 movaps %xmm0,0x30(%rsp)
3492 movaps 0x40(%rsp),%xmm10
3493 movaps %xmm0,0x40(%rsp)
3494 movaps 0x50(%rsp),%xmm11
3495 movaps %xmm0,0x50(%rsp)
3496 movaps 0x60(%rsp),%xmm12
3497 movaps %xmm0,0x60(%rsp)
3498 movaps 0x70(%rsp),%xmm13
3499 movaps %xmm0,0x70(%rsp)
3500 movaps 0x80(%rsp),%xmm14
3501 movaps %xmm0,0x80(%rsp)
3502 movaps 0x90(%rsp),%xmm15
3503 movaps %xmm0,0x90(%rsp)
3504 lea 0xa0+0x28(%rsp),%rax
3505.Locb_dec_pop:
3506___
3507$code.=<<___;
3508 mov -40(%rax),%r14
Robert Sloanab8b8882018-03-26 11:39:51 -07003509.cfi_restore %r14
Robert Sloana94fe052017-02-21 08:49:28 -08003510 mov -32(%rax),%r13
Robert Sloanab8b8882018-03-26 11:39:51 -07003511.cfi_restore %r13
Robert Sloana94fe052017-02-21 08:49:28 -08003512 mov -24(%rax),%r12
Robert Sloanab8b8882018-03-26 11:39:51 -07003513.cfi_restore %r12
Robert Sloana94fe052017-02-21 08:49:28 -08003514 mov -16(%rax),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003515.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08003516 mov -8(%rax),%rbx
Robert Sloanab8b8882018-03-26 11:39:51 -07003517.cfi_restore %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08003518 lea (%rax),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07003519.cfi_def_cfa_register %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08003520.Locb_dec_epilogue:
3521 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07003522.cfi_endproc
Adam Vartanianbfcf3a72018-08-10 14:55:24 +01003523.size ${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt
Robert Sloana94fe052017-02-21 08:49:28 -08003524
# __ocb_decrypt6 -- internal helper: six OCB blocks decrypted in parallel.
# In:  $inout0..$inout5 hold the input blocks, @offset[5] the running offset
#      state left by the preceding block, and $i1, $i3, $i5 are byte offsets
#      into the table at $L_p. @offset[0] is expected to hold the first table
#      entry, L_0 (it is reloaded before returning).
# Out: $inout0..$inout5 hold the results and @offset[1]..@offset[5] the
#      per-block offsets in the form used as last-round keys; $i1, $i3, $i5
#      and $block_num have been advanced for the next group of six.
# NOTE(review): $rndkey0l is presumably round[0]^round[last], prepared by the
# caller outside this section - confirm.
3525.type __ocb_decrypt6,\@abi-omnipotent
3526.align 32
3527__ocb_decrypt6:
# Build the six offsets as a chain: each one is the previous offset XORed
# with a table entry, and each is XORed into its input block as it appears.
3528 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3529 movdqu ($L_p,$i1),@offset[1]
3530 movdqa @offset[0],@offset[2]
3531 movdqu ($L_p,$i3),@offset[3]
3532 movdqa @offset[0],@offset[4]
3533 pxor @offset[5],@offset[0]
3534 movdqu ($L_p,$i5),@offset[5]
3535 pxor @offset[0],@offset[1]
3536 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3537 pxor @offset[1],@offset[2]
3538 pxor @offset[1],$inout1
3539 pxor @offset[2],@offset[3]
3540 pxor @offset[2],$inout2
3541 pxor @offset[3],@offset[4]
3542 pxor @offset[3],$inout3
3543 pxor @offset[4],@offset[5]
3544 pxor @offset[4],$inout4
3545 pxor @offset[5],$inout5
3546 $movkey 32($key_),$rndkey0
3547
# Start computing the table offsets for the next group of six blocks; the
# remaining steps (the shifts by 4) are interleaved with the rounds below.
3548 lea 1($block_num),$i1 # even-numbered blocks
3549 lea 3($block_num),$i3
3550 lea 5($block_num),$i5
3551 add \$6,$block_num
3552 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3553 bsf $i1,$i1 # ntz(block)
3554 bsf $i3,$i3
3555 bsf $i5,$i5
3556
3557 aesdec $rndkey1,$inout0
3558 aesdec $rndkey1,$inout1
3559 aesdec $rndkey1,$inout2
3560 aesdec $rndkey1,$inout3
3561 pxor $rndkey0l,@offset[1]
3562 pxor $rndkey0l,@offset[2]
3563 aesdec $rndkey1,$inout4
3564 pxor $rndkey0l,@offset[3]
3565 pxor $rndkey0l,@offset[4]
3566 aesdec $rndkey1,$inout5
3567 $movkey 48($key_),$rndkey1
3568 pxor $rndkey0l,@offset[5]
3569
3570 aesdec $rndkey0,$inout0
3571 aesdec $rndkey0,$inout1
3572 aesdec $rndkey0,$inout2
3573 aesdec $rndkey0,$inout3
3574 aesdec $rndkey0,$inout4
3575 aesdec $rndkey0,$inout5
3576 $movkey 64($key_),$rndkey0
3577 shl \$4,$i1 # ntz(block) -> table offset
3578 shl \$4,$i3
3579 jmp .Locb_dec_loop6
3580
3581.align 32
# Middle rounds, two per pass: %rax is stepped by 32 and the loop exits once
# the add leaves it at zero (the key loads in between do not touch the flags).
3582.Locb_dec_loop6:
3583 aesdec $rndkey1,$inout0
3584 aesdec $rndkey1,$inout1
3585 aesdec $rndkey1,$inout2
3586 aesdec $rndkey1,$inout3
3587 aesdec $rndkey1,$inout4
3588 aesdec $rndkey1,$inout5
3589 $movkey ($key,%rax),$rndkey1
3590 add \$32,%rax
3591
3592 aesdec $rndkey0,$inout0
3593 aesdec $rndkey0,$inout1
3594 aesdec $rndkey0,$inout2
3595 aesdec $rndkey0,$inout3
3596 aesdec $rndkey0,$inout4
3597 aesdec $rndkey0,$inout5
3598 $movkey -16($key,%rax),$rndkey0
3599 jnz .Locb_dec_loop6
3600
3601 aesdec $rndkey1,$inout0
3602 aesdec $rndkey1,$inout1
3603 aesdec $rndkey1,$inout2
3604 aesdec $rndkey1,$inout3
3605 aesdec $rndkey1,$inout4
3606 aesdec $rndkey1,$inout5
3607 $movkey 16($key_),$rndkey1
3608 shl \$4,$i5
3609
# Last round: the per-block offsets act as the round keys. The first offset
# register is recycled right away to hold L_0 for the next call.
3610 aesdeclast @offset[0],$inout0
3611 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3612 mov %r10,%rax # restore twisted rounds
3613 aesdeclast @offset[1],$inout1
3614 aesdeclast @offset[2],$inout2
3615 aesdeclast @offset[3],$inout3
3616 aesdeclast @offset[4],$inout4
3617 aesdeclast @offset[5],$inout5
3618 ret
3619.size __ocb_decrypt6,.-__ocb_decrypt6
3620
# __ocb_decrypt4 -- internal helper: four block lanes ($inout0..$inout3)
# decrypted in parallel with the OCB offsets folded in. The two- and
# three-block callers zero the lanes they do not need before calling.
# In:  @offset[5] holds the running offset state, @offset[0] the table entry
#      used for the first block, $i1 and $i3 byte offsets into the table at
#      $L_p.
# Out: $inout0..$inout3 hold the results, @offset[0]..@offset[3] the
#      per-block offsets in the form used as last-round keys. Unlike the
#      six-block helper, the table offsets and the block counter are left
#      untouched and the first offset register is not reloaded.
# NOTE(review): $rndkey0l is presumably round[0]^round[last], prepared by the
# caller outside this section - confirm.
3621.type __ocb_decrypt4,\@abi-omnipotent
3622.align 32
3623__ocb_decrypt4:
3624 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3625 movdqu ($L_p,$i1),@offset[1]
3626 movdqa @offset[0],@offset[2]
3627 movdqu ($L_p,$i3),@offset[3]
3628 pxor @offset[5],@offset[0]
3629 pxor @offset[0],@offset[1]
3630 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3631 pxor @offset[1],@offset[2]
3632 pxor @offset[1],$inout1
3633 pxor @offset[2],@offset[3]
3634 pxor @offset[2],$inout2
3635 pxor @offset[3],$inout3
3636 $movkey 32($key_),$rndkey0
3637
3638 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3639 pxor $rndkey0l,@offset[1]
3640 pxor $rndkey0l,@offset[2]
3641 pxor $rndkey0l,@offset[3]
3642
3643 aesdec $rndkey1,$inout0
3644 aesdec $rndkey1,$inout1
3645 aesdec $rndkey1,$inout2
3646 aesdec $rndkey1,$inout3
3647 $movkey 48($key_),$rndkey1
3648
3649 aesdec $rndkey0,$inout0
3650 aesdec $rndkey0,$inout1
3651 aesdec $rndkey0,$inout2
3652 aesdec $rndkey0,$inout3
3653 $movkey 64($key_),$rndkey0
3654 jmp .Locb_dec_loop4
3655
3656.align 32
# Middle rounds, two per pass: %rax is stepped by 32 and the loop exits once
# the add leaves it at zero.
3657.Locb_dec_loop4:
3658 aesdec $rndkey1,$inout0
3659 aesdec $rndkey1,$inout1
3660 aesdec $rndkey1,$inout2
3661 aesdec $rndkey1,$inout3
3662 $movkey ($key,%rax),$rndkey1
3663 add \$32,%rax
3664
3665 aesdec $rndkey0,$inout0
3666 aesdec $rndkey0,$inout1
3667 aesdec $rndkey0,$inout2
3668 aesdec $rndkey0,$inout3
3669 $movkey -16($key,%rax),$rndkey0
3670 jnz .Locb_dec_loop4
3671
3672 aesdec $rndkey1,$inout0
3673 aesdec $rndkey1,$inout1
3674 aesdec $rndkey1,$inout2
3675 aesdec $rndkey1,$inout3
3676 $movkey 16($key_),$rndkey1
3677 mov %r10,%rax # restore twisted rounds
3678
# Last round: the per-block offsets act as the round keys.
3679 aesdeclast @offset[0],$inout0
3680 aesdeclast @offset[1],$inout1
3681 aesdeclast @offset[2],$inout2
3682 aesdeclast @offset[3],$inout3
3683 ret
3684.size __ocb_decrypt4,.-__ocb_decrypt4
3685
# __ocb_decrypt1 -- internal helper: a single OCB block in $inout0.
# $inout5 doubles as the offset register: the caller places a table entry in
# it before the call, the running offset state from @offset[5] is XORed in
# here, and on return it holds the block offset in the form used as the
# last-round key (the caller copies it back into @offset[5]).
# NOTE(review): $rndkey0l is presumably round[0]^round[last], prepared by the
# caller outside this section - confirm.
3686.type __ocb_decrypt1,\@abi-omnipotent
3687.align 32
3688__ocb_decrypt1:
3689 pxor @offset[5],$inout5 # offset_i
3690 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3691 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3692 $movkey 32($key_),$rndkey0
3693
3694 aesdec $rndkey1,$inout0
3695 $movkey 48($key_),$rndkey1
3696 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3697
3698 aesdec $rndkey0,$inout0
3699 $movkey 64($key_),$rndkey0
3700 jmp .Locb_dec_loop1
3701
3702.align 32
# Middle rounds, two per pass: %rax is stepped by 32 and the loop exits once
# the add leaves it at zero.
3703.Locb_dec_loop1:
3704 aesdec $rndkey1,$inout0
3705 $movkey ($key,%rax),$rndkey1
3706 add \$32,%rax
3707
3708 aesdec $rndkey0,$inout0
3709 $movkey -16($key,%rax),$rndkey0
3710 jnz .Locb_dec_loop1
3711
3712 aesdec $rndkey1,$inout0
3713 $movkey 16($key_),$rndkey1 # redundant in tail
3714 mov %r10,%rax # restore twisted rounds
3715
3716 aesdeclast $inout5,$inout0
3717 ret
3718.size __ocb_decrypt1,.-__ocb_decrypt1
3719___
Adam Langleyd9e397b2015-01-22 14:27:53 -08003720} }}
3721
3722########################################################################
3723# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3724# size_t length, const AES_KEY *key,
3725# unsigned char *ivp,const int enc);
3726{
# Emits ${PREFIX}_cbc_encrypt. The generated routine returns at once for a
# zero length, takes the encrypt path when the 6th argument (%r9d) is
# non-zero and branches to .Lcbc_decrypt otherwise.
# $frame_size: 0x10 bytes of scratch plus, on Win64, 0xa0 bytes in which
# %xmm6-%xmm15 are saved. $iv and $in0..$in4 name %xmm10..%xmm15.
3727my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3728my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
Adam Langleyd9e397b2015-01-22 14:27:53 -08003729
3730$code.=<<___;
3731.globl ${PREFIX}_cbc_encrypt
3732.type ${PREFIX}_cbc_encrypt,\@function,6
3733.align 16
3734${PREFIX}_cbc_encrypt:
Robert Sloanab8b8882018-03-26 11:39:51 -07003735.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08003736 test $len,$len # check length
3737 jz .Lcbc_ret
3738
3739 mov 240($key),$rnds_ # key->rounds
3740 mov $key,$key_ # backup $key
3741 test %r9d,%r9d # 6th argument
3742 jz .Lcbc_decrypt
3743#--------------------------- CBC ENCRYPT ------------------------------#
# One block per iteration; the running state lives in $inout0 and starts
# out as the IV. The length is kept biased by -16 inside the loop, so the
# borrow of the subtraction at the loop end signals that no full block is
# left.
3744 movups ($ivp),$inout0 # load iv as initial state
3745 mov $rnds_,$rounds
3746 cmp \$16,$len
3747 jb .Lcbc_enc_tail
3748 sub \$16,$len
3749 jmp .Lcbc_enc_loop
3750.align 16
3751.Lcbc_enc_loop:
3752 movups ($inp),$inout1 # load input
3753 lea 16($inp),$inp
3754 #xorps $inout1,$inout0
3755___
# The round sequence for one block is emitted by aesni_generate1, which is
# defined elsewhere in this file.
# NOTE(review): the commented-out xorps above suggests the generator itself
# folds the input block into the state when given both registers - confirm.
3756 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3757$code.=<<___;
3758 mov $rnds_,$rounds # restore $rounds
3759 mov $key_,$key # restore $key
3760 movups $inout0,0($out) # store output
3761 lea 16($out),$out
3762 sub \$16,$len
3763 jnc .Lcbc_enc_loop
3764 add \$16,$len
3765 jnz .Lcbc_enc_tail
Adam Langleye9ada862015-05-11 17:20:37 -07003766 pxor $rndkey0,$rndkey0 # clear register bank
3767 pxor $rndkey1,$rndkey1
Adam Langleyd9e397b2015-01-22 14:27:53 -08003768 movups $inout0,($ivp)
Adam Langleye9ada862015-05-11 17:20:37 -07003769 pxor $inout0,$inout0
3770 pxor $inout1,$inout1
Adam Langleyd9e397b2015-01-22 14:27:53 -08003771 jmp .Lcbc_ret
3772
# Partial final block: the remaining bytes are copied to the output buffer
# and zero-padded there to a full block; the loop is then re-entered once
# with both pointers aimed at that padded block and the biased length at 0.
3773.Lcbc_enc_tail:
3774 mov $len,%rcx # zaps $key
3775 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3776 .long 0x9066A4F3 # rep movsb
3777 mov \$16,%ecx # zero tail
3778 sub $len,%rcx
3779 xor %eax,%eax
3780 .long 0x9066AAF3 # rep stosb
3781 lea -16(%rdi),%rdi # rewind $out by 1 block
3782 mov $rnds_,$rounds # restore $rounds
3783 mov %rdi,%rsi # $inp and $out are the same
3784 mov $key_,$key # restore $key
3785 xor $len,$len # len=16
3786 jmp .Lcbc_enc_loop # one more spin
3787 #--------------------------- CBC DECRYPT ------------------------------#
# Entry of the decrypt side: a length of exactly 16 bytes is handled right
# here, any other length goes to the bulk path.
3788.align 16
3789.Lcbc_decrypt:
Adam Langleye9ada862015-05-11 17:20:37 -07003790 cmp \$16,$len
3791 jne .Lcbc_decrypt_bulk
3792
3793 # handle single block without allocating stack frame,
3794 # useful in ciphertext stealing mode
3795 movdqu ($inp),$inout0 # load input
3796 movdqu ($ivp),$inout1 # load iv
3797 movdqa $inout0,$inout2 # future iv
3798___
# Decrypt rounds for the single block, emitted by aesni_generate1 (defined
# elsewhere in this file). No block registers are passed here, so it
# presumably defaults to operating on $inout0 - confirm.
3799 &aesni_generate1("dec",$key,$rnds_);
3800$code.=<<___;
3801 pxor $rndkey0,$rndkey0 # clear register bank
3802 pxor $rndkey1,$rndkey1
3803 movdqu $inout2,($ivp) # store iv
3804 xorps $inout1,$inout0 # ^=iv
3805 pxor $inout1,$inout1
3806 movups $inout0,($out) # store output
3807 pxor $inout0,$inout0
3808 jmp .Lcbc_ret
3809.align 16
# Bulk decrypt path: %r11 keeps the incoming stack pointer, %rbp is pushed,
# and a 16-byte aligned frame is carved out below it.
3810.Lcbc_decrypt_bulk:
Robert Sloana94fe052017-02-21 08:49:28 -08003811 lea (%rsp),%r11 # frame pointer
Robert Sloanab8b8882018-03-26 11:39:51 -07003812.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08003813 push %rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07003814.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08003815 sub \$$frame_size,%rsp
3816 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3817___
# Win64 only: spill %xmm6-%xmm15 into the frame; they are reloaded (and the
# slots wiped) at .Lcbc_dec_ret.
3818$code.=<<___ if ($win64);
3819 movaps %xmm6,0x10(%rsp)
3820 movaps %xmm7,0x20(%rsp)
3821 movaps %xmm8,0x30(%rsp)
3822 movaps %xmm9,0x40(%rsp)
3823 movaps %xmm10,0x50(%rsp)
3824 movaps %xmm11,0x60(%rsp)
3825 movaps %xmm12,0x70(%rsp)
3826 movaps %xmm13,0x80(%rsp)
3827 movaps %xmm14,0x90(%rsp)
3828 movaps %xmm15,0xa0(%rsp)
3829.Lcbc_decrypt_body:
3830___
Robert Sloana94fe052017-02-21 08:49:28 -08003831
# From here on both $inp_ and $key_ name %rbp: the eight-block loop uses it
# as the look-ahead input pointer ($inp_), the six-block loop as the key
# backup ($key_).
3832my $inp_=$key_="%rbp"; # reassign $key_
3833
Adam Langleyd9e397b2015-01-22 14:27:53 -08003834$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08003835 mov $key,$key_ # [re-]backup $key [after reassignment]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003836 movups ($ivp),$iv
3837 mov $rnds_,$rounds
# At most 0x50 bytes (five blocks): go straight to the tail code.
3838 cmp \$0x50,$len
3839 jbe .Lcbc_dec_tail
3840
# Otherwise load the first six blocks and keep copies of the first five
# ciphertext blocks in the in0..in4 registers for the chaining XOR.
3841 $movkey ($key),$rndkey0
3842 movdqu 0x00($inp),$inout0 # load input
3843 movdqu 0x10($inp),$inout1
3844 movdqa $inout0,$in0
3845 movdqu 0x20($inp),$inout2
3846 movdqa $inout1,$in1
3847 movdqu 0x30($inp),$inout3
3848 movdqa $inout2,$in2
3849 movdqu 0x40($inp),$inout4
3850 movdqa $inout3,$in3
3851 movdqu 0x50($inp),$inout5
3852 movdqa $inout4,$in4
Robert Sloan2424d842017-05-01 07:46:28 -07003853 leaq OPENSSL_ia32cap_P(%rip),%r9
Robert Sloan572a4e22017-04-17 10:52:19 -07003854 mov 4(%r9),%r9d
# At most 0x70 bytes (six or seven blocks) is handled without a loop.
Adam Langleyd9e397b2015-01-22 14:27:53 -08003855 cmp \$0x70,$len
3856 jbe .Lcbc_dec_six_or_seven
3857
# More than that: pick the six-block or the eight-block loop based on the
# capability word loaded above.
Adam Langleye9ada862015-05-11 17:20:37 -07003858 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3859 sub \$0x50,$len # $len is biased by -5*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08003860 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
Adam Langleye9ada862015-05-11 17:20:37 -07003861 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3862 sub \$0x20,$len # $len is biased by -7*16
Adam Langleyd9e397b2015-01-22 14:27:53 -08003863 lea 0x70($key),$key # size optimization
3864 jmp .Lcbc_dec_loop8_enter
3865.align 16
# Eight-blocks-per-iteration loop. The eighth output block of an iteration
# stays in a register and is written at the top of the next iteration (or
# after the loop). The first pass enters at .Lcbc_dec_loop8_enter with six
# blocks already loaded; $key is biased by +0x70 while the loop runs.
3866.Lcbc_dec_loop8:
3867 movups $inout7,($out)
3868 lea 0x10($out),$out
3869.Lcbc_dec_loop8_enter:
# $inp_ is computed without a branch below (mov -1 / cmp / adc / and / add):
# it ends up as $inp+128 when the cmp against 0x70 does not borrow and as
# $inp itself otherwise. The next group of blocks is later loaded through it.
3870 movdqu 0x60($inp),$inout6
3871 pxor $rndkey0,$inout0
3872 movdqu 0x70($inp),$inout7
3873 pxor $rndkey0,$inout1
3874 $movkey 0x10-0x70($key),$rndkey1
3875 pxor $rndkey0,$inout2
Robert Sloana94fe052017-02-21 08:49:28 -08003876 mov \$-1,$inp_
Adam Langleyd9e397b2015-01-22 14:27:53 -08003877 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3878 pxor $rndkey0,$inout3
3879 pxor $rndkey0,$inout4
3880 pxor $rndkey0,$inout5
3881 pxor $rndkey0,$inout6
3882
3883 aesdec $rndkey1,$inout0
3884 pxor $rndkey0,$inout7
3885 $movkey 0x20-0x70($key),$rndkey0
3886 aesdec $rndkey1,$inout1
3887 aesdec $rndkey1,$inout2
3888 aesdec $rndkey1,$inout3
3889 aesdec $rndkey1,$inout4
3890 aesdec $rndkey1,$inout5
3891 aesdec $rndkey1,$inout6
Robert Sloana94fe052017-02-21 08:49:28 -08003892 adc \$0,$inp_
3893 and \$128,$inp_
Adam Langleyd9e397b2015-01-22 14:27:53 -08003894 aesdec $rndkey1,$inout7
3895 add $inp,$inp_
3896 $movkey 0x30-0x70($key),$rndkey1
3897___
# Unrolled middle rounds, alternating between the two round-key registers.
# "cmp 11,rounds" is emitted once (i==7) and its flags are consumed by the
# exits that follow: after i==7 when rounds<11, after i==9 when rounds==11,
# and unconditionally after i==11 (presumably the 128-, 192- and 256-bit key
# cases - confirm against how the rounds field is populated). The nop lines
# are emitted only for the i values selected by the condition below.
3898for($i=1;$i<12;$i++) {
3899my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3900$code.=<<___ if ($i==7);
3901 cmp \$11,$rounds
3902___
3903$code.=<<___;
3904 aesdec $rndkeyx,$inout0
3905 aesdec $rndkeyx,$inout1
3906 aesdec $rndkeyx,$inout2
3907 aesdec $rndkeyx,$inout3
3908 aesdec $rndkeyx,$inout4
3909 aesdec $rndkeyx,$inout5
3910 aesdec $rndkeyx,$inout6
3911 aesdec $rndkeyx,$inout7
3912 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3913___
3914$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3915 nop
3916___
3917$code.=<<___ if ($i==7);
3918 jb .Lcbc_dec_done
3919___
3920$code.=<<___ if ($i==9);
3921 je .Lcbc_dec_done
3922___
3923$code.=<<___ if ($i==11);
3924 jmp .Lcbc_dec_done
3925___
3926}
3927$code.=<<___;
3928.align 16
# Final two rounds. The CBC chaining XOR is folded into the last round: the
# IV and each preceding ciphertext block are first XORed with the round key
# held in the second key register and then used as the last-round key of the
# block that follows them. Meanwhile the next six input blocks are fetched
# through the look-ahead pointer and the last ciphertext block of this group
# becomes the new IV.
3929.Lcbc_dec_done:
3930 aesdec $rndkey1,$inout0
3931 aesdec $rndkey1,$inout1
3932 pxor $rndkey0,$iv
3933 pxor $rndkey0,$in0
3934 aesdec $rndkey1,$inout2
3935 aesdec $rndkey1,$inout3
3936 pxor $rndkey0,$in1
3937 pxor $rndkey0,$in2
3938 aesdec $rndkey1,$inout4
3939 aesdec $rndkey1,$inout5
3940 pxor $rndkey0,$in3
3941 pxor $rndkey0,$in4
3942 aesdec $rndkey1,$inout6
3943 aesdec $rndkey1,$inout7
3944 movdqu 0x50($inp),$rndkey1
3945
3946 aesdeclast $iv,$inout0
3947 movdqu 0x60($inp),$iv # borrow $iv
3948 pxor $rndkey0,$rndkey1
3949 aesdeclast $in0,$inout1
3950 pxor $rndkey0,$iv
3951 movdqu 0x70($inp),$rndkey0 # next IV
3952 aesdeclast $in1,$inout2
3953 lea 0x80($inp),$inp
3954 movdqu 0x00($inp_),$in0
3955 aesdeclast $in2,$inout3
3956 aesdeclast $in3,$inout4
3957 movdqu 0x10($inp_),$in1
3958 movdqu 0x20($inp_),$in2
3959 aesdeclast $in4,$inout5
3960 aesdeclast $rndkey1,$inout6
3961 movdqu 0x30($inp_),$in3
3962 movdqu 0x40($inp_),$in4
3963 aesdeclast $iv,$inout7
3964 movdqa $rndkey0,$iv # return $iv
3965 movdqu 0x50($inp_),$rndkey1
3966 $movkey -0x70($key),$rndkey0
3967
# Store seven results while moving the freshly loaded blocks into the
# working registers for the next iteration.
3968 movups $inout0,($out) # store output
3969 movdqa $in0,$inout0
3970 movups $inout1,0x10($out)
3971 movdqa $in1,$inout1
3972 movups $inout2,0x20($out)
3973 movdqa $in2,$inout2
3974 movups $inout3,0x30($out)
3975 movdqa $in3,$inout3
3976 movups $inout4,0x40($out)
3977 movdqa $in4,$inout4
3978 movups $inout5,0x50($out)
3979 movdqa $rndkey1,$inout5
3980 movups $inout6,0x60($out)
3981 lea 0x70($out),$out
3982
3983 sub \$0x80,$len
3984 ja .Lcbc_dec_loop8
3985
# Loop finished: undo the key-pointer bias and part of the length bias, then
# either finish with the pending eighth block or store it and continue with
# the tail (at most five blocks) or the six/seven-block code.
3986 movaps $inout7,$inout0
3987 lea -0x70($key),$key
3988 add \$0x70,$len
Adam Langleye9ada862015-05-11 17:20:37 -07003989 jle .Lcbc_dec_clear_tail_collected
Adam Langleyd9e397b2015-01-22 14:27:53 -08003990 movups $inout7,($out)
3991 lea 0x10($out),$out
3992 cmp \$0x50,$len
3993 jbe .Lcbc_dec_tail
3994
3995 movaps $in0,$inout0
3996.Lcbc_dec_six_or_seven:
# Six or seven blocks remain and the first six are already loaded. More
# than 0x60 bytes means seven. In both cases all results but the last are
# stored here; the last one is handed to .Lcbc_dec_tail_collected in the
# first working register, and the working registers are wiped on the way.
3997 cmp \$0x60,$len
3998 ja .Lcbc_dec_seven
3999
# Six blocks: the sixth ciphertext block is parked in a spare register
# before the call and becomes the new IV afterwards.
4000 movaps $inout5,$inout6
4001 call _aesni_decrypt6
4002 pxor $iv,$inout0 # ^= IV
4003 movaps $inout6,$iv
4004 pxor $in0,$inout1
4005 movdqu $inout0,($out)
4006 pxor $in1,$inout2
4007 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004008 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004009 pxor $in2,$inout3
4010 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004011 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004012 pxor $in3,$inout4
4013 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004014 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08004015 pxor $in4,$inout5
4016 movdqu $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004017 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -08004018 lea 0x50($out),$out
4019 movdqa $inout5,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004020 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08004021 jmp .Lcbc_dec_tail_collected
4022
4023.align 16
# Seven blocks: the seventh block is loaded, the eighth lane is zeroed and
# the eight-block routine is used. Afterwards ciphertext blocks six and
# seven are re-read from the input: the former is the chaining value for
# block seven, the latter the new IV.
4024.Lcbc_dec_seven:
4025 movups 0x60($inp),$inout6
4026 xorps $inout7,$inout7
4027 call _aesni_decrypt8
4028 movups 0x50($inp),$inout7
4029 pxor $iv,$inout0 # ^= IV
4030 movups 0x60($inp),$iv
4031 pxor $in0,$inout1
4032 movdqu $inout0,($out)
4033 pxor $in1,$inout2
4034 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004035 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004036 pxor $in2,$inout3
4037 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004038 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004039 pxor $in3,$inout4
4040 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004041 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08004042 pxor $in4,$inout5
4043 movdqu $inout4,0x40($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004044 pxor $inout4,$inout4
Adam Langleyd9e397b2015-01-22 14:27:53 -08004045 pxor $inout7,$inout6
4046 movdqu $inout5,0x50($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004047 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08004048 lea 0x60($out),$out
4049 movdqa $inout6,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004050 pxor $inout6,$inout6
4051 pxor $inout7,$inout7
Adam Langleyd9e397b2015-01-22 14:27:53 -08004052 jmp .Lcbc_dec_tail_collected
4053
4054.align 16
# Six-blocks-per-iteration loop. The first pass enters at
# .Lcbc_dec_loop6_enter with the blocks already loaded. Five results are
# stored per pass; the sixth stays in a register and is written at the top
# of the next pass or after the loop.
4055.Lcbc_dec_loop6:
4056 movups $inout5,($out)
4057 lea 0x10($out),$out
4058 movdqu 0x00($inp),$inout0 # load input
4059 movdqu 0x10($inp),$inout1
4060 movdqa $inout0,$in0
4061 movdqu 0x20($inp),$inout2
4062 movdqa $inout1,$in1
4063 movdqu 0x30($inp),$inout3
4064 movdqa $inout2,$in2
4065 movdqu 0x40($inp),$inout4
4066 movdqa $inout3,$in3
4067 movdqu 0x50($inp),$inout5
4068 movdqa $inout4,$in4
4069.Lcbc_dec_loop6_enter:
# The sixth ciphertext block is parked in a spare register before the call
# and becomes the IV for the next pass.
4070 lea 0x60($inp),$inp
4071 movdqa $inout5,$inout6
4072
4073 call _aesni_decrypt6
4074
# Chaining XORs and stores; the key pointer and round count are restored
# from their backups along the way (presumably because the call modifies
# them - confirm against _aesni_decrypt6).
4075 pxor $iv,$inout0 # ^= IV
4076 movdqa $inout6,$iv
4077 pxor $in0,$inout1
4078 movdqu $inout0,($out)
4079 pxor $in1,$inout2
4080 movdqu $inout1,0x10($out)
4081 pxor $in2,$inout3
4082 movdqu $inout2,0x20($out)
4083 pxor $in3,$inout4
4084 mov $key_,$key
4085 movdqu $inout3,0x30($out)
4086 pxor $in4,$inout5
4087 mov $rnds_,$rounds
4088 movdqu $inout4,0x40($out)
4089 lea 0x50($out),$out
4090 sub \$0x60,$len
4091 ja .Lcbc_dec_loop6
4092
# Loop finished: undo the length bias. Either the pending sixth block is the
# final one, or it is stored and execution falls through into the tail code.
4093 movdqa $inout5,$inout0
4094 add \$0x50,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004095 jle .Lcbc_dec_clear_tail_collected
Adam Langleyd9e397b2015-01-22 14:27:53 -08004096 movups $inout5,($out)
4097 lea 0x10($out),$out
4098
4099.Lcbc_dec_tail:
# Tail: the blocks are loaded one at a time, a copy of each ciphertext block
# is kept in the in0..in4 registers for the chaining XOR, and the length is
# reduced by 16 after each load to pick the matching case. Every case stores
# all results but the last, leaves the last one in the first working
# register for .Lcbc_dec_tail_collected, and makes the last ciphertext block
# the new IV.
4100 movups ($inp),$inout0
4101 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004102 jbe .Lcbc_dec_one # $len is 1*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004103
4104 movups 0x10($inp),$inout1
4105 movaps $inout0,$in0
4106 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004107 jbe .Lcbc_dec_two # $len is 2*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004108
4109 movups 0x20($inp),$inout2
4110 movaps $inout1,$in1
4111 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004112 jbe .Lcbc_dec_three # $len is 3*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004113
4114 movups 0x30($inp),$inout3
4115 movaps $inout2,$in2
4116 sub \$0x10,$len
Adam Langleye9ada862015-05-11 17:20:37 -07004117 jbe .Lcbc_dec_four # $len is 4*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004118
# Five blocks: handled by the six-block routine with the sixth lane zeroed.
Adam Langleye9ada862015-05-11 17:20:37 -07004119 movups 0x40($inp),$inout4 # $len is 5*16 or less
Adam Langleyd9e397b2015-01-22 14:27:53 -08004120 movaps $inout3,$in3
4121 movaps $inout4,$in4
4122 xorps $inout5,$inout5
4123 call _aesni_decrypt6
4124 pxor $iv,$inout0
4125 movaps $in4,$iv
4126 pxor $in0,$inout1
4127 movdqu $inout0,($out)
4128 pxor $in1,$inout2
4129 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004130 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004131 pxor $in2,$inout3
4132 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004133 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004134 pxor $in3,$inout4
4135 movdqu $inout3,0x30($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004136 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08004137 lea 0x40($out),$out
4138 movdqa $inout4,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004139 pxor $inout4,$inout4
4140 pxor $inout5,$inout5
Adam Langleyd9e397b2015-01-22 14:27:53 -08004141 sub \$0x10,$len
4142 jmp .Lcbc_dec_tail_collected
4143
4144.align 16
4145.Lcbc_dec_one:
4146 movaps $inout0,$in0
4147___
# Single block: the round sequence is emitted inline by aesni_generate1
# (defined elsewhere in this file) instead of calling a helper routine.
4148 &aesni_generate1("dec",$key,$rounds);
4149$code.=<<___;
4150 xorps $iv,$inout0
4151 movaps $in0,$iv
4152 jmp .Lcbc_dec_tail_collected
4153.align 16
4154.Lcbc_dec_two:
4155 movaps $inout1,$in1
4156 call _aesni_decrypt2
4157 pxor $iv,$inout0
4158 movaps $in1,$iv
4159 pxor $in0,$inout1
4160 movdqu $inout0,($out)
4161 movdqa $inout1,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004162 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004163 lea 0x10($out),$out
4164 jmp .Lcbc_dec_tail_collected
4165.align 16
4166.Lcbc_dec_three:
4167 movaps $inout2,$in2
4168 call _aesni_decrypt3
4169 pxor $iv,$inout0
4170 movaps $in2,$iv
4171 pxor $in0,$inout1
4172 movdqu $inout0,($out)
4173 pxor $in1,$inout2
4174 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004175 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004176 movdqa $inout2,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004177 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004178 lea 0x20($out),$out
4179 jmp .Lcbc_dec_tail_collected
4180.align 16
4181.Lcbc_dec_four:
4182 movaps $inout3,$in3
4183 call _aesni_decrypt4
4184 pxor $iv,$inout0
4185 movaps $in3,$iv
4186 pxor $in0,$inout1
4187 movdqu $inout0,($out)
4188 pxor $in1,$inout2
4189 movdqu $inout1,0x10($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004190 pxor $inout1,$inout1 # clear register bank
Adam Langleyd9e397b2015-01-22 14:27:53 -08004191 pxor $in2,$inout3
4192 movdqu $inout2,0x20($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004193 pxor $inout2,$inout2
Adam Langleyd9e397b2015-01-22 14:27:53 -08004194 movdqa $inout3,$inout0
Adam Langleye9ada862015-05-11 17:20:37 -07004195 pxor $inout3,$inout3
Adam Langleyd9e397b2015-01-22 14:27:53 -08004196 lea 0x30($out),$out
4197 jmp .Lcbc_dec_tail_collected
4198
4199.align 16
# Entry used by the two multi-block loops when their pending block is the
# final one: wipe the working registers first, then fall through. On Win64
# the upper four working registers are skipped here because they are
# reloaded from the frame at .Lcbc_dec_ret.
Adam Langleye9ada862015-05-11 17:20:37 -07004200.Lcbc_dec_clear_tail_collected:
4201 pxor $inout1,$inout1 # clear register bank
4202 pxor $inout2,$inout2
4203 pxor $inout3,$inout3
4204___
4205$code.=<<___ if (!$win64);
4206 pxor $inout4,$inout4 # %xmm6..9
4207 pxor $inout5,$inout5
4208 pxor $inout6,$inout6
4209 pxor $inout7,$inout7
4210___
4211$code.=<<___;
# Common finish: write the new IV back, then store the last output block.
# If the low four bits of the remaining length are zero the whole block is
# stored; otherwise the partial-block code below is used.
Adam Langleyd9e397b2015-01-22 14:27:53 -08004212.Lcbc_dec_tail_collected:
4213 movups $iv,($ivp)
4214 and \$15,$len
4215 jnz .Lcbc_dec_tail_partial
4216 movups $inout0,($out)
Adam Langleye9ada862015-05-11 17:20:37 -07004217 pxor $inout0,$inout0
Adam Langleyd9e397b2015-01-22 14:27:53 -08004218 jmp .Lcbc_dec_ret
4219.align 16
# Partial block: the block is parked at the bottom of the stack frame, a
# byte count of 16 minus the masked length is copied from there to the
# output with rep movsb, and the stack copy is then overwritten with zeros.
4220.Lcbc_dec_tail_partial:
4221 movaps $inout0,(%rsp)
Adam Langleye9ada862015-05-11 17:20:37 -07004222 pxor $inout0,$inout0
Adam Langleyd9e397b2015-01-22 14:27:53 -08004223 mov \$16,%rcx
4224 mov $out,%rdi
4225 sub $len,%rcx
4226 lea (%rsp),%rsi
Adam Langleye9ada862015-05-11 17:20:37 -07004227 .long 0x9066A4F3 # rep movsb
4228 movdqa $inout0,(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004229
# Epilogue of the bulk decrypt path: wipe the key registers, on Win64
# reload %xmm6-%xmm15 and zero their save slots, then restore %rbp and the
# stack pointer from the frame pointer kept in %r11.
4230.Lcbc_dec_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07004231 xorps $rndkey0,$rndkey0 # %xmm0
4232 pxor $rndkey1,$rndkey1
Adam Langleyd9e397b2015-01-22 14:27:53 -08004233___
4234$code.=<<___ if ($win64);
4235 movaps 0x10(%rsp),%xmm6
Adam Langleye9ada862015-05-11 17:20:37 -07004236 movaps %xmm0,0x10(%rsp) # clear stack
Adam Langleyd9e397b2015-01-22 14:27:53 -08004237 movaps 0x20(%rsp),%xmm7
Adam Langleye9ada862015-05-11 17:20:37 -07004238 movaps %xmm0,0x20(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004239 movaps 0x30(%rsp),%xmm8
Adam Langleye9ada862015-05-11 17:20:37 -07004240 movaps %xmm0,0x30(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004241 movaps 0x40(%rsp),%xmm9
Adam Langleye9ada862015-05-11 17:20:37 -07004242 movaps %xmm0,0x40(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004243 movaps 0x50(%rsp),%xmm10
Adam Langleye9ada862015-05-11 17:20:37 -07004244 movaps %xmm0,0x50(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004245 movaps 0x60(%rsp),%xmm11
Adam Langleye9ada862015-05-11 17:20:37 -07004246 movaps %xmm0,0x60(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004247 movaps 0x70(%rsp),%xmm12
Adam Langleye9ada862015-05-11 17:20:37 -07004248 movaps %xmm0,0x70(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004249 movaps 0x80(%rsp),%xmm13
Adam Langleye9ada862015-05-11 17:20:37 -07004250 movaps %xmm0,0x80(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004251 movaps 0x90(%rsp),%xmm14
Adam Langleye9ada862015-05-11 17:20:37 -07004252 movaps %xmm0,0x90(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004253 movaps 0xa0(%rsp),%xmm15
Adam Langleye9ada862015-05-11 17:20:37 -07004254 movaps %xmm0,0xa0(%rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08004255___
4256$code.=<<___;
Robert Sloana94fe052017-02-21 08:49:28 -08004257 mov -8(%r11),%rbp
Robert Sloanab8b8882018-03-26 11:39:51 -07004258.cfi_restore %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08004259 lea (%r11),%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07004260.cfi_def_cfa_register %rsp
# .Lcbc_ret is also the exit for the paths that never set up a frame (zero
# length, the encrypt path and the single-block decrypt).
Adam Langleyd9e397b2015-01-22 14:27:53 -08004261.Lcbc_ret:
4262 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07004263.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004264.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4265___
Robert Sloan4c22c5f2019-03-01 15:53:37 -08004266}
Adam Langleye9ada862015-05-11 17:20:37 -07004267# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
Adam Langleyd9e397b2015-01-22 14:27:53 -08004268# int bits, AES_KEY *key)
Adam Langleye9ada862015-05-11 17:20:37 -07004269#
4270# input: $inp user-supplied key
4271# $bits $inp length in bits
4272# $key pointer to key schedule
4273# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4274# *$key key schedule
4275#
Adam Langleyd9e397b2015-01-22 14:27:53 -08004276{ my ($inp,$bits,$key) = @_4args;
4277 $bits =~ s/%r/%e/;
4278
4279$code.=<<___;
# ${PREFIX}_set_decrypt_key: builds the encryption key schedule by calling
# __aesni_set_encrypt_key and, if that returned 0 in %eax, converts the
# schedule in place: the round keys are put in reverse order and every key
# except the two outermost ones is passed through aesimc. %eax is returned
# as the callee left it; on failure the schedule is not touched here.
4280.globl ${PREFIX}_set_decrypt_key
4281.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4282.align 16
4283${PREFIX}_set_decrypt_key:
Robert Sloanab8b8882018-03-26 11:39:51 -07004284.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004285 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
Robert Sloanab8b8882018-03-26 11:39:51 -07004286.cfi_adjust_cfa_offset 8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004287 call __aesni_set_encrypt_key
4288 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
4289 test %eax,%eax
4290 jnz .Ldec_key_ret
4291 lea 16($key,$bits),$inp # points at the end of key schedule
4292
# First and last round keys trade places unchanged.
4293 $movkey ($key),%xmm0 # just swap
4294 $movkey ($inp),%xmm1
4295 $movkey %xmm0,($inp)
4296 $movkey %xmm1,($key)
4297 lea 16($key),$key
4298 lea -16($inp),$inp
4299
# Walk inwards from both ends, transforming and swapping one pair of round
# keys per pass, until the two pointers meet.
4300.Ldec_key_inverse:
4301 $movkey ($key),%xmm0 # swap and inverse
4302 $movkey ($inp),%xmm1
4303 aesimc %xmm0,%xmm0
4304 aesimc %xmm1,%xmm1
4305 lea 16($key),$key
4306 lea -16($inp),$inp
4307 $movkey %xmm0,16($inp)
4308 $movkey %xmm1,-16($key)
4309 cmp $key,$inp
4310 ja .Ldec_key_inverse
4311
# The middle key is transformed in place; both scratch registers are wiped.
4312 $movkey ($key),%xmm0 # inverse middle
4313 aesimc %xmm0,%xmm0
Adam Langleye9ada862015-05-11 17:20:37 -07004314 pxor %xmm1,%xmm1
Adam Langleyd9e397b2015-01-22 14:27:53 -08004315 $movkey %xmm0,($inp)
Adam Langleye9ada862015-05-11 17:20:37 -07004316 pxor %xmm0,%xmm0
Adam Langleyd9e397b2015-01-22 14:27:53 -08004317.Ldec_key_ret:
4318 add \$8,%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07004319.cfi_adjust_cfa_offset -8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004320 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07004321.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004322.LSEH_end_set_decrypt_key:
4323.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4324___
4325
Robert Sloanab8b8882018-03-26 11:39:51 -07004326# This is based on submission from Intel by
4327# Huang Ying
4328# Vinodh Gopal
Adam Langleyd9e397b2015-01-22 14:27:53 -08004329# Kahraman Akdemir
4330#
Robert Sloana94fe052017-02-21 08:49:28 -08004331# Aggressively optimized in respect to aeskeygenassist's critical path
Adam Langleyd9e397b2015-01-22 14:27:53 -08004332# and is contained in %xmm0-5 to meet Win64 ABI requirement.
4333#
Adam Langleye9ada862015-05-11 17:20:37 -07004334# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4335# int bits, AES_KEY * const key);
4336#
4337# input: $inp user-supplied key
4338# $bits $inp length in bits
4339# $key pointer to key schedule
4340# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4341# $bits rounds-1 (used in aesni_set_decrypt_key)
4342# *$key key schedule
4343# $key pointer to key schedule (used in
4344# aesni_set_decrypt_key)
4345#
4346# Subroutine is frame-less, which means that only volatile registers
4347# are used. Note that it's declared "abi-omnipotent", which means that
4348# amount of volatile registers is smaller on Windows.
4349#
Adam Langleyd9e397b2015-01-22 14:27:53 -08004350$code.=<<___;
4351.globl ${PREFIX}_set_encrypt_key
4352.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4353.align 16
4354${PREFIX}_set_encrypt_key:
4355__aesni_set_encrypt_key:
Robert Sloanab8b8882018-03-26 11:39:51 -07004356.cfi_startproc
Robert Sloan4c22c5f2019-03-01 15:53:37 -08004357#ifndef NDEBUG
4358#ifndef BORINGSSL_FIPS
4359 movb \$1,BORINGSSL_function_hit+3(%rip)
4360#endif
4361#endif
Adam Langleyd9e397b2015-01-22 14:27:53 -08004362 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
Robert Sloanab8b8882018-03-26 11:39:51 -07004363.cfi_adjust_cfa_offset 8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004364 mov \$-1,%rax
4365 test $inp,$inp
4366 jz .Lenc_key_ret
4367 test $key,$key
4368 jz .Lenc_key_ret
4369
4370 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4371 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
Robert Sloan2424d842017-05-01 07:46:28 -07004372 leaq OPENSSL_ia32cap_P(%rip),%r10
Robert Sloan572a4e22017-04-17 10:52:19 -07004373 movl 4(%r10),%r10d
4374 and \$`1<<28|1<<11`,%r10d # AVX and XOP bits
Adam Langleye9ada862015-05-11 17:20:37 -07004375 lea 16($key),%rax # %rax is used as modifiable copy of $key
Adam Langleyd9e397b2015-01-22 14:27:53 -08004376 cmp \$256,$bits
4377 je .L14rounds
4378 cmp \$192,$bits
4379 je .L12rounds
4380 cmp \$128,$bits
4381 jne .Lbad_keybits
4382
4383.L10rounds:
4384 mov \$9,$bits # 10 rounds for 128-bit key
Adam Langleye9ada862015-05-11 17:20:37 -07004385 cmp \$`1<<28`,%r10d # AVX, bit no XOP
4386 je .L10rounds_alt
4387
Adam Langleyd9e397b2015-01-22 14:27:53 -08004388 $movkey %xmm0,($key) # round 0
4389 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4390 call .Lkey_expansion_128_cold
4391 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4392 call .Lkey_expansion_128
4393 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4394 call .Lkey_expansion_128
4395 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4396 call .Lkey_expansion_128
4397 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4398 call .Lkey_expansion_128
4399 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4400 call .Lkey_expansion_128
4401 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4402 call .Lkey_expansion_128
4403 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4404 call .Lkey_expansion_128
4405 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4406 call .Lkey_expansion_128
4407 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4408 call .Lkey_expansion_128
4409 $movkey %xmm0,(%rax)
4410 mov $bits,80(%rax) # 240(%rdx)
4411 xor %eax,%eax
4412 jmp .Lenc_key_ret
4413
4414.align 16
Adam Langleye9ada862015-05-11 17:20:37 -07004415.L10rounds_alt:
4416 movdqa .Lkey_rotate(%rip),%xmm5
4417 mov \$8,%r10d
4418 movdqa .Lkey_rcon1(%rip),%xmm4
4419 movdqa %xmm0,%xmm2
4420 movdqu %xmm0,($key)
4421 jmp .Loop_key128
4422
4423.align 16
4424.Loop_key128:
4425 pshufb %xmm5,%xmm0
4426 aesenclast %xmm4,%xmm0
4427 pslld \$1,%xmm4
4428 lea 16(%rax),%rax
4429
4430 movdqa %xmm2,%xmm3
4431 pslldq \$4,%xmm2
4432 pxor %xmm2,%xmm3
4433 pslldq \$4,%xmm2
4434 pxor %xmm2,%xmm3
4435 pslldq \$4,%xmm2
4436 pxor %xmm3,%xmm2
4437
4438 pxor %xmm2,%xmm0
4439 movdqu %xmm0,-16(%rax)
4440 movdqa %xmm0,%xmm2
4441
4442 dec %r10d
4443 jnz .Loop_key128
4444
4445 movdqa .Lkey_rcon1b(%rip),%xmm4
4446
4447 pshufb %xmm5,%xmm0
4448 aesenclast %xmm4,%xmm0
4449 pslld \$1,%xmm4
4450
4451 movdqa %xmm2,%xmm3
4452 pslldq \$4,%xmm2
4453 pxor %xmm2,%xmm3
4454 pslldq \$4,%xmm2
4455 pxor %xmm2,%xmm3
4456 pslldq \$4,%xmm2
4457 pxor %xmm3,%xmm2
4458
4459 pxor %xmm2,%xmm0
4460 movdqu %xmm0,(%rax)
4461
4462 movdqa %xmm0,%xmm2
4463 pshufb %xmm5,%xmm0
4464 aesenclast %xmm4,%xmm0
4465
4466 movdqa %xmm2,%xmm3
4467 pslldq \$4,%xmm2
4468 pxor %xmm2,%xmm3
4469 pslldq \$4,%xmm2
4470 pxor %xmm2,%xmm3
4471 pslldq \$4,%xmm2
4472 pxor %xmm3,%xmm2
4473
4474 pxor %xmm2,%xmm0
4475 movdqu %xmm0,16(%rax)
4476
4477 mov $bits,96(%rax) # 240($key)
4478 xor %eax,%eax
4479 jmp .Lenc_key_ret
4480
4481.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08004482.L12rounds:
4483 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4484 mov \$11,$bits # 12 rounds for 192
Adam Langleye9ada862015-05-11 17:20:37 -07004485 cmp \$`1<<28`,%r10d # AVX, but no XOP
4486 je .L12rounds_alt
4487
Adam Langleyd9e397b2015-01-22 14:27:53 -08004488 $movkey %xmm0,($key) # round 0
4489 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4490 call .Lkey_expansion_192a_cold
4491 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4492 call .Lkey_expansion_192b
4493 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4494 call .Lkey_expansion_192a
4495 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4496 call .Lkey_expansion_192b
4497 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4498 call .Lkey_expansion_192a
4499 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4500 call .Lkey_expansion_192b
4501 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4502 call .Lkey_expansion_192a
4503 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4504 call .Lkey_expansion_192b
4505 $movkey %xmm0,(%rax)
4506 mov $bits,48(%rax) # 240(%rdx)
4507 xor %rax, %rax
4508 jmp .Lenc_key_ret
4509
4510.align 16
Adam Langleye9ada862015-05-11 17:20:37 -07004511.L12rounds_alt:
4512 movdqa .Lkey_rotate192(%rip),%xmm5
4513 movdqa .Lkey_rcon1(%rip),%xmm4
4514 mov \$8,%r10d
4515 movdqu %xmm0,($key)
4516 jmp .Loop_key192
4517
4518.align 16
4519.Loop_key192:
4520 movq %xmm2,0(%rax)
4521 movdqa %xmm2,%xmm1
4522 pshufb %xmm5,%xmm2
4523 aesenclast %xmm4,%xmm2
4524 pslld \$1, %xmm4
4525 lea 24(%rax),%rax
4526
4527 movdqa %xmm0,%xmm3
4528 pslldq \$4,%xmm0
4529 pxor %xmm0,%xmm3
4530 pslldq \$4,%xmm0
4531 pxor %xmm0,%xmm3
4532 pslldq \$4,%xmm0
4533 pxor %xmm3,%xmm0
4534
4535 pshufd \$0xff,%xmm0,%xmm3
4536 pxor %xmm1,%xmm3
4537 pslldq \$4,%xmm1
4538 pxor %xmm1,%xmm3
4539
4540 pxor %xmm2,%xmm0
4541 pxor %xmm3,%xmm2
4542 movdqu %xmm0,-16(%rax)
4543
4544 dec %r10d
4545 jnz .Loop_key192
4546
4547 mov $bits,32(%rax) # 240($key)
4548 xor %eax,%eax
4549 jmp .Lenc_key_ret
4550
4551.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08004552.L14rounds:
Robert Sloanab8b8882018-03-26 11:39:51 -07004553 movups 16($inp),%xmm2 # remaining half of *userKey
Adam Langleyd9e397b2015-01-22 14:27:53 -08004554 mov \$13,$bits # 14 rounds for 256
4555 lea 16(%rax),%rax
Adam Langleye9ada862015-05-11 17:20:37 -07004556 cmp \$`1<<28`,%r10d # AVX, but no XOP
4557 je .L14rounds_alt
4558
Adam Langleyd9e397b2015-01-22 14:27:53 -08004559 $movkey %xmm0,($key) # round 0
4560 $movkey %xmm2,16($key) # round 1
4561 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4562 call .Lkey_expansion_256a_cold
4563 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4564 call .Lkey_expansion_256b
4565 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4566 call .Lkey_expansion_256a
4567 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4568 call .Lkey_expansion_256b
4569 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4570 call .Lkey_expansion_256a
4571 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4572 call .Lkey_expansion_256b
4573 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4574 call .Lkey_expansion_256a
4575 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4576 call .Lkey_expansion_256b
4577 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4578 call .Lkey_expansion_256a
4579 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4580 call .Lkey_expansion_256b
4581 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4582 call .Lkey_expansion_256a
4583 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4584 call .Lkey_expansion_256b
4585 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4586 call .Lkey_expansion_256a
4587 $movkey %xmm0,(%rax)
4588 mov $bits,16(%rax) # 240(%rdx)
4589 xor %rax,%rax
4590 jmp .Lenc_key_ret
4591
4592.align 16
Adam Langleye9ada862015-05-11 17:20:37 -07004593.L14rounds_alt:
4594 movdqa .Lkey_rotate(%rip),%xmm5
4595 movdqa .Lkey_rcon1(%rip),%xmm4
4596 mov \$7,%r10d
4597 movdqu %xmm0,0($key)
4598 movdqa %xmm2,%xmm1
4599 movdqu %xmm2,16($key)
4600 jmp .Loop_key256
4601
4602.align 16
4603.Loop_key256:
4604 pshufb %xmm5,%xmm2
4605 aesenclast %xmm4,%xmm2
4606
4607 movdqa %xmm0,%xmm3
4608 pslldq \$4,%xmm0
4609 pxor %xmm0,%xmm3
4610 pslldq \$4,%xmm0
4611 pxor %xmm0,%xmm3
4612 pslldq \$4,%xmm0
4613 pxor %xmm3,%xmm0
4614 pslld \$1,%xmm4
4615
4616 pxor %xmm2,%xmm0
4617 movdqu %xmm0,(%rax)
4618
4619 dec %r10d
4620 jz .Ldone_key256
4621
4622 pshufd \$0xff,%xmm0,%xmm2
4623 pxor %xmm3,%xmm3
4624 aesenclast %xmm3,%xmm2
4625
4626 movdqa %xmm1,%xmm3
4627 pslldq \$4,%xmm1
4628 pxor %xmm1,%xmm3
4629 pslldq \$4,%xmm1
4630 pxor %xmm1,%xmm3
4631 pslldq \$4,%xmm1
4632 pxor %xmm3,%xmm1
4633
4634 pxor %xmm1,%xmm2
4635 movdqu %xmm2,16(%rax)
4636 lea 32(%rax),%rax
4637 movdqa %xmm2,%xmm1
4638
4639 jmp .Loop_key256
4640
4641.Ldone_key256:
4642 mov $bits,16(%rax) # 240($key)
4643 xor %eax,%eax
4644 jmp .Lenc_key_ret
4645
4646.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08004647.Lbad_keybits:
4648 mov \$-2,%rax
4649.Lenc_key_ret:
Adam Langleye9ada862015-05-11 17:20:37 -07004650 pxor %xmm0,%xmm0
4651 pxor %xmm1,%xmm1
4652 pxor %xmm2,%xmm2
4653 pxor %xmm3,%xmm3
4654 pxor %xmm4,%xmm4
4655 pxor %xmm5,%xmm5
Adam Langleyd9e397b2015-01-22 14:27:53 -08004656 add \$8,%rsp
Robert Sloanab8b8882018-03-26 11:39:51 -07004657.cfi_adjust_cfa_offset -8
Adam Langleyd9e397b2015-01-22 14:27:53 -08004658 ret
Robert Sloanab8b8882018-03-26 11:39:51 -07004659.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08004660.LSEH_end_set_encrypt_key:
4661
4662.align 16
4663.Lkey_expansion_128:
4664 $movkey %xmm0,(%rax)
4665 lea 16(%rax),%rax
4666.Lkey_expansion_128_cold:
4667 shufps \$0b00010000,%xmm0,%xmm4
4668 xorps %xmm4, %xmm0
4669 shufps \$0b10001100,%xmm0,%xmm4
4670 xorps %xmm4, %xmm0
4671 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4672 xorps %xmm1,%xmm0
4673 ret
4674
4675.align 16
4676.Lkey_expansion_192a:
4677 $movkey %xmm0,(%rax)
4678 lea 16(%rax),%rax
4679.Lkey_expansion_192a_cold:
4680 movaps %xmm2, %xmm5
4681.Lkey_expansion_192b_warm:
4682 shufps \$0b00010000,%xmm0,%xmm4
4683 movdqa %xmm2,%xmm3
4684 xorps %xmm4,%xmm0
4685 shufps \$0b10001100,%xmm0,%xmm4
4686 pslldq \$4,%xmm3
4687 xorps %xmm4,%xmm0
4688 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4689 pxor %xmm3,%xmm2
4690 pxor %xmm1,%xmm0
4691 pshufd \$0b11111111,%xmm0,%xmm3
4692 pxor %xmm3,%xmm2
4693 ret
4694
4695.align 16
4696.Lkey_expansion_192b:
4697 movaps %xmm0,%xmm3
4698 shufps \$0b01000100,%xmm0,%xmm5
4699 $movkey %xmm5,(%rax)
4700 shufps \$0b01001110,%xmm2,%xmm3
4701 $movkey %xmm3,16(%rax)
4702 lea 32(%rax),%rax
4703 jmp .Lkey_expansion_192b_warm
4704
4705.align 16
4706.Lkey_expansion_256a:
4707 $movkey %xmm2,(%rax)
4708 lea 16(%rax),%rax
4709.Lkey_expansion_256a_cold:
4710 shufps \$0b00010000,%xmm0,%xmm4
4711 xorps %xmm4,%xmm0
4712 shufps \$0b10001100,%xmm0,%xmm4
4713 xorps %xmm4,%xmm0
4714 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4715 xorps %xmm1,%xmm0
4716 ret
4717
4718.align 16
4719.Lkey_expansion_256b:
4720 $movkey %xmm0,(%rax)
4721 lea 16(%rax),%rax
4722
4723 shufps \$0b00010000,%xmm2,%xmm4
4724 xorps %xmm4,%xmm2
4725 shufps \$0b10001100,%xmm2,%xmm4
4726 xorps %xmm4,%xmm2
4727 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4728 xorps %xmm1,%xmm2
4729 ret
4730.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4731.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4732___
4733}
4734
# Constant pool appended to $code, followed by an identification string.
# .Lkey_rotate, .Lkey_rotate192, .Lkey_rcon1 and .Lkey_rcon1b are the
# tables loaded by the *_alt key-expansion paths of the key-setup code.
# NOTE(review): the remaining labels (.Lbswap_mask, .Lincrement32,
# .Lincrement64, .Lxts_magic, .Lincrement1) are not referenced in this
# part of the file; presumably they serve the CTR/XTS code - confirm.
4735$code.=<<___;
4736.align 64
4737.Lbswap_mask:
4738 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4739.Lincrement32:
4740 .long 6,6,6,0
4741.Lincrement64:
4742 .long 1,0,0,0
4743.Lxts_magic:
4744 .long 0x87,0,1,0
4745.Lincrement1:
4746 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Adam Langleye9ada862015-05-11 17:20:37 -07004747.Lkey_rotate:
4748 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4749.Lkey_rotate192:
4750 .long 0x04070605,0x04070605,0x04070605,0x04070605
4751.Lkey_rcon1:
4752 .long 1,1,1,1
4753.Lkey_rcon1b:
4754 .long 0x1b,0x1b,0x1b,0x1b
Adam Langleyd9e397b2015-01-22 14:27:53 -08004755
4756.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4757.align 64
4758___
4759
4760# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4761# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Win64 structured-exception-handling support: language-specific handlers
# plus the .pdata/.xdata tables that attach them to the functions above.
#
# The four variables below name the registers in which a handler receives
# the arguments listed in the prototype comment (rec, frame, context, disp).
# They are deliberately left as package globals, as in the rest of the file.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
# Handlers for the ECB and CTR32 entry points; these are only emitted when
# the generator is run with the "aes_hw" prefix.  Each handler compares
# context->Rip with the prologue/epilogue labels found in HandlerData[],
# copies the saved %xmm registers back into the CONTEXT record when the
# fault lies inside the function body, and then joins .Lcommon_seh_tail.
$code.=<<___ if ($PREFIX eq "aes_hw");
.type	ecb_ccm64_se_handler,\@abi-omnipotent
.align	16
ecb_ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler

.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	-0xa8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail
.size	ctr_xts_se_handler,.-ctr_xts_se_handler

___
# BoringSSL omits the OCB functions.
$code.=<<___ if (0);
.type	ocb_se_handler,\@abi-omnipotent
.align	16
ocb_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10
	cmp	%r10,%rbx		# context->Rip>=pop label
	jae	.Locb_no_xmm

	mov	152($context),%rax	# pull context->Rsp

	lea	(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x28(%rax),%rax

.Locb_no_xmm:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	ocb_se_handler,.-ocb_se_handler
___
# CBC handler (always emitted) and the tail shared by every handler:
# .Lcommon_seh_tail stores the recovered Rsp/Rsi/Rdi into the CONTEXT
# record, copies it to disp->ContextRecord, calls RtlVirtualUnwind and
# returns ExceptionContinueSearch.  The .pdata section that follows holds
# one begin/end/info triple per covered function.
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt_bulk(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	208($context),%rax	# pull context->R11

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($PREFIX eq "aes_hw");
	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_end_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32
___
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
# Unwind-info records referenced from .pdata.  The ECB/CTR32 records name
# their handler and pass the body/epilogue labels as HandlerData[].
$code.=<<___ if ($PREFIX eq "aes_hw");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}
5049
# Append a REX prefix byte to an opcode array when either register needs one.
#
#   $opcode - reference to the array of opcode bytes being assembled
#   $dst    - number of the register encoded in the ModR/M "reg" field
#   $src    - number of the register encoded in the ModR/M "r/m" field
#
# Bit 0x04 is set when $dst >= 8 and bit 0x01 when $src >= 8; the byte
# (OR-ed with 0x40) is pushed only if at least one of them is set.
# The array is modified in place through the reference; nothing is returned.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if $dst >= 8;
    $rex |= 0x01 if $src >= 8;
    push @$opcode, $rex | 0x40 if $rex;

    return;
}
5059
# Translate one AES-NI instruction, given as text, into an equivalent
# ".byte" directive so that assemblers without AES-NI support can be used.
#
# Three operand forms are recognised:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# Returns the ".byte" directive for a recognised instruction.  Any other
# input - including an "aes..." mnemonic that is not in the opcode tables -
# is returned unchanged, so the caller's substitution never erases a line.
sub aesni {
    my $line   = shift;
    my @opcode = (0x66);    # every encoding produced below starts with 0x66

    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures before calling anything else.
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # Immediates with a leading 0 (0x.., 0b.., 0..) go through oct().
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }

    if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf,
        );
        # Unknown mnemonic: leave the instruction for the assembler.
        return $line unless defined $opcodelet{$mnemonic};

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }

    if ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf,
        );
        # Unknown mnemonic: leave the instruction for the assembler.
        return $line unless defined $opcodelet{$mnemonic};

        push @opcode, 0x44 if $dst >= 8;                        # REX prefix
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;           # ModR/M
        # One-byte displacement; only the low 8 bits are kept.
        push @opcode, ($off =~ /^0/ ? oct($off) : $off) & 0xff;
        return ".byte\t" . join(',', @opcode);
    }

    return $line;
}
5099
# Build the ".byte" directive that replaces "movbe %eax,N(%rsp)".
# The single argument is the decimal displacement N captured by the
# substitution at the bottom of the file; it becomes the final byte.
sub movbe {
    my ($disp8) = @_;
    return ".byte 0x0f,0x38,0xf1,0x44,0x24," . $disp8;
}
5103
# Post-process the accumulated assembly text and write it to STDOUT.

# Evaluate every `...` span as a Perl expression (constant arithmetic such
# as `1<<28|1<<11`) and splice the result into the text.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand every AES-NI instruction to aesni(); anything after the last %xmm
# operand on the line (e.g. a trailing comment) is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
# Replace "movbe %eax,N(%rsp)" with its raw encoding.
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors surface at close; include $! so the cause is visible.
close STDOUT or die "error closing STDOUT: $!";