/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
31
#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110036
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can done for either FP or integer values, for FP use
 * movaps (move aligned packed single) or integer use movdqa (move double quad
 * aligned). It doesn't make a performance difference which instruction is used
 * since Nehalem (original Core i7) was released. However, the movaps is a byte
 * shorter, so that is the one we'll use for now. (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
47
#ifdef __x86_64__

# Constants live in mergeable .rodata.cst16 sections so the linker can
# reorder and merge identical 16-byte literals across objects.
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000
.text


# Offsets (relative to %rsp) of the precomputed-HashKey scratch area built
# on the stack by the GCM routines. HashKey_N holds HashKey^N <<1 mod poly;
# HashKey_N_k holds the XOR of its high and low 64-bit halves (Karatsuba).
#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

# SysV AMD64 argument registers; args 7-10 are fetched from the stack
# through %r14, which the GCM entry code uses as the saved frame pointer.
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400127
128
# Register roles for the plain AES (non-GCM) routines. The same symbolic
# names map to 64-bit registers on x86_64 and to 32-bit registers on i386.
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100173
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400174
#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* Karatsuba: the 128x128 carry-less product is built from three PCLMULQDQs
* (a1*b1, a0*b0, (a1+a0)*(b1+b0)), then reduced modulo the GHASH polynomial.
* Result is returned in \GH; TMP1-TMP5 are clobbered.
*/
.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1      # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH        # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2    # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = middle Karatsuba term
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm
235
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
# Bytes are accumulated into %rax from the highest address downward, so the
# first input byte ends up in the lowest lane of XMMDst (little-endian order).
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax		# >= 8 bytes: bulk-copy the low qword
	MOVQ_R64_XMM %rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax		# gather remaining 1-7 bytes into %rax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq	$8, \XMM1		# place tail bytes in the high qword
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax		# < 8 bytes: gather them all into %rax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
266
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Decrypt flavor: the GHASH input is the ciphertext read from %arg3, so
* each input block is saved (movdqa \TMP1, %xmm\index) before byte-swapping
* for the hash. arg7/arg8 are the AAD pointer/length.
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r11           # %r11 = aadLen
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	# hash full 16-byte AAD blocks first
	cmp	   $16, %r11
	jl	   _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i

	/* read the last <16B of AAD */
_get_AAD_rest\num_initial_blocks\operation:
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11 # initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   (%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0                 # INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	add	   $5,%eax			# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_dec\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa	   \TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
492
493
494/*
495* if a = number of total plaintext bytes
496* b = floor(a/16)
497* num_initial_blocks = b mod 4
498* encrypt the initial num_initial_blocks blocks and apply ghash on
499* the ciphertext
500* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
501* are clobbered
502* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
503*/
504
505
506.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
507XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500508 MOVADQ SHUF_MASK(%rip), %xmm14
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800509 mov arg7, %r10 # %r10 = AAD
Junaid Shahid1ecdd372017-12-20 17:08:38 -0800510 mov arg8, %r11 # %r11 = aadLen
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800511 pxor %xmm\i, %xmm\i
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200512 pxor \XMM2, \XMM2
513
514 cmp $16, %r11
Junaid Shahid1ecdd372017-12-20 17:08:38 -0800515 jl _get_AAD_rest\num_initial_blocks\operation
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200516_get_AAD_blocks\num_initial_blocks\operation:
517 movdqu (%r10), %xmm\i
518 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
519 pxor %xmm\i, \XMM2
520 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
521 add $16, %r10
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200522 sub $16, %r11
523 cmp $16, %r11
524 jge _get_AAD_blocks\num_initial_blocks\operation
525
526 movdqu \XMM2, %xmm\i
Junaid Shahid1ecdd372017-12-20 17:08:38 -0800527
528 /* read the last <16B of AAD */
529_get_AAD_rest\num_initial_blocks\operation:
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200530 cmp $0, %r11
531 je _get_AAD_done\num_initial_blocks\operation
532
Junaid Shahid1ecdd372017-12-20 17:08:38 -0800533 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800534 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200535 pxor \XMM2, %xmm\i
536 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800537
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200538_get_AAD_done\num_initial_blocks\operation:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800539 xor %r11, %r11 # initialise the data pointer offset as zero
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200540 # start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800541
542 mov %arg5, %rax # %rax = *Y0
543 movdqu (%rax), \XMM0 # XMM0 = Y0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800544 PSHUFB_XMM %xmm14, \XMM0
545
546.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800547
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500548 MOVADQ ONE(%RIP),\TMP1
549 MOVADQ 0(%arg1),\TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800550.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500551 paddd \TMP1, \XMM0 # INCR Y0
552 MOVADQ \XMM0, %xmm\index
553 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
554 pxor \TMP2, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800555.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500556 lea 0x10(%arg1),%r10
557 mov keysize,%eax
558 shr $2,%eax # 128->4, 192->6, 256->8
559 add $5,%eax # 128->9, 192->11, 256->13
560
561aes_loop_initial_enc\num_initial_blocks:
562 MOVADQ (%r10),\TMP1
563.irpc index, \i_seq
564 AESENC \TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800565.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500566 add $16,%r10
567 sub $1,%eax
568 jnz aes_loop_initial_enc\num_initial_blocks
569
570 MOVADQ (%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800571.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500572 AESENCLAST \TMP1, %xmm\index # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800573.endr
574.irpc index, \i_seq
575 movdqu (%arg3 , %r11, 1), \TMP1
576 pxor \TMP1, %xmm\index
577 movdqu %xmm\index, (%arg2 , %r11, 1)
578 # write back plaintext/ciphertext for num_initial_blocks
579 add $16, %r11
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800580 PSHUFB_XMM %xmm14, %xmm\index
581
582 # prepare plaintext/ciphertext for GHASH computation
583.endr
584.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200585
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800586 # apply GHASH on num_initial_blocks blocks
587
588.if \i == 5
589 pxor %xmm5, %xmm6
590 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
591 pxor %xmm6, %xmm7
592 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
593 pxor %xmm7, %xmm8
594 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
595.elseif \i == 6
596 pxor %xmm6, %xmm7
597 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
598 pxor %xmm7, %xmm8
599 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
600.elseif \i == 7
601 pxor %xmm7, %xmm8
602 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
603.endif
604 cmp $64, %r13
605 jl _initial_blocks_done\num_initial_blocks\operation
606 # no need for precomputed values
607/*
608*
609* Precomputations for HashKey parallel with encryption of first 4 blocks.
610* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
611*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500612 MOVADQ ONE(%RIP),\TMP1
613 paddd \TMP1, \XMM0 # INCR Y0
614 MOVADQ \XMM0, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800615 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
616
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500617 paddd \TMP1, \XMM0 # INCR Y0
618 MOVADQ \XMM0, \XMM2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800619 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
620
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500621 paddd \TMP1, \XMM0 # INCR Y0
622 MOVADQ \XMM0, \XMM3
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800623 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
624
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500625 paddd \TMP1, \XMM0 # INCR Y0
626 MOVADQ \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800627 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
628
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500629 MOVADQ 0(%arg1),\TMP1
630 pxor \TMP1, \XMM1
631 pxor \TMP1, \XMM2
632 pxor \TMP1, \XMM3
633 pxor \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800634 movdqa \TMP3, \TMP5
635 pshufd $78, \TMP3, \TMP1
636 pxor \TMP3, \TMP1
637 movdqa \TMP1, HashKey_k(%rsp)
638 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
639# TMP5 = HashKey^2<<1 (mod poly)
640 movdqa \TMP5, HashKey_2(%rsp)
641# HashKey_2 = HashKey^2<<1 (mod poly)
642 pshufd $78, \TMP5, \TMP1
643 pxor \TMP5, \TMP1
644 movdqa \TMP1, HashKey_2_k(%rsp)
645.irpc index, 1234 # do 4 rounds
646 movaps 0x10*\index(%arg1), \TMP1
647 AESENC \TMP1, \XMM1
648 AESENC \TMP1, \XMM2
649 AESENC \TMP1, \XMM3
650 AESENC \TMP1, \XMM4
651.endr
652 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
653# TMP5 = HashKey^3<<1 (mod poly)
654 movdqa \TMP5, HashKey_3(%rsp)
655 pshufd $78, \TMP5, \TMP1
656 pxor \TMP5, \TMP1
657 movdqa \TMP1, HashKey_3_k(%rsp)
658.irpc index, 56789 # do next 5 rounds
659 movaps 0x10*\index(%arg1), \TMP1
660 AESENC \TMP1, \XMM1
661 AESENC \TMP1, \XMM2
662 AESENC \TMP1, \XMM3
663 AESENC \TMP1, \XMM4
664.endr
665 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
666# TMP5 = HashKey^3<<1 (mod poly)
667 movdqa \TMP5, HashKey_4(%rsp)
668 pshufd $78, \TMP5, \TMP1
669 pxor \TMP5, \TMP1
670 movdqa \TMP1, HashKey_4_k(%rsp)
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500671 lea 0xa0(%arg1),%r10
672 mov keysize,%eax
673 shr $2,%eax # 128->4, 192->6, 256->8
674 sub $4,%eax # 128->0, 192->2, 256->4
675 jz aes_loop_pre_enc_done\num_initial_blocks
676
677aes_loop_pre_enc\num_initial_blocks:
678 MOVADQ (%r10),\TMP2
679.irpc index, 1234
680 AESENC \TMP2, %xmm\index
681.endr
682 add $16,%r10
683 sub $1,%eax
684 jnz aes_loop_pre_enc\num_initial_blocks
685
686aes_loop_pre_enc_done\num_initial_blocks:
687 MOVADQ (%r10), \TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800688 AESENCLAST \TMP2, \XMM1
689 AESENCLAST \TMP2, \XMM2
690 AESENCLAST \TMP2, \XMM3
691 AESENCLAST \TMP2, \XMM4
692 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
693 pxor \TMP1, \XMM1
694 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
695 pxor \TMP1, \XMM2
696 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
697 pxor \TMP1, \XMM3
698 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
699 pxor \TMP1, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400700 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
701 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
702 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
703 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800704
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400705 add $64, %r11
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800706 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400707 pxor \XMMDst, \XMM1
708# combine GHASHed value with the corresponding ciphertext
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800709 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800710 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800711 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
712
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400713_initial_blocks_done\num_initial_blocks\operation:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800714
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400715.endm
716
/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Encrypt 4 counter blocks in parallel while GHASHing the 4 blocks that
# were produced by the previous iteration (in \XMM1-\XMM4 on entry).
# %arg1 = expanded key schedule, %arg2 = dst, %arg3 = src (pointers only,
# not modified); %r11 = current data offset. Clobbers %eax, %r10, %xmm15.
# The AES rounds are interleaved with the Karatsuba GHASH multiplies to
# hide the AESENC/PCLMULQDQ latencies.
# NOTE(review): \operation is not referenced in this macro body — confirm
# it is kept only for symmetry with the INITIAL_BLOCKS_* call sites.

	# save the previous iteration's output for GHASH
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6		# swap hi/lo qwords for Karatsuba
	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# round 0: counter block ^ key[0]
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	# run the key-size dependent extra rounds (none for AES-128)
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

# NOTE(review): label is not \@-suffixed (unlike _esb_loop_\@ below), so
# this macro can be expanded at most once per object file — confirm.
aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
# NOTE(review): hard-codes %xmm1-%xmm4 and therefore assumes the call
# site passes \XMM1-\XMM4 as exactly those registers (true for the one
# invocation in this file) — confirm before adding new call sites.
.irpc index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	# combine the Karatsuba partial products of all 4 blocks
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2			# middle term of the 256-bit product
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# reduced result is in XMM5

	pxor	  \XMM5, \XMM1			# fold GHASH state into first block
.endm
924
/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*
* Differs from the _ENC variant only after the last AES round: here the
* saved ciphertext (not the freshly produced output) is fed back into
* \XMM1-\XMM4 for GHASHing on the next iteration.
* Clobbers %eax, %r10, %xmm15.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# NOTE(review): \operation is not referenced in this macro body — confirm
# it is kept only for symmetry with the INITIAL_BLOCKS_* call sites.

	# save the previous iteration's ciphertext for GHASH
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6		# swap hi/lo qwords for Karatsuba
	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# round 0: counter block ^ key[0]
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	# run the key-size dependent extra rounds (none for AES-128)
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

# NOTE(review): label is not \@-suffixed (unlike _esb_loop_\@ below), so
# this macro can be expanded at most once per object file — confirm.
aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
# NOTE(review): hard-codes %xmm1-%xmm4 and therefore assumes the call
# site passes \XMM1-\XMM4 as exactly those registers (true for the one
# invocation in this file) — confirm before adding new call sites.
.irpc index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	# XOR keystream with ciphertext, write plaintext, and keep the
	# ciphertext in \XMM1-\XMM4 so the next iteration can GHASH it
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	# combine the Karatsuba partial products of all 4 blocks
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2			# middle term of the 256-bit product
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# reduced result is in XMM5

	pxor	  \XMM5, \XMM1			# fold GHASH state into first block
.endm
1136
/* GHASH the last 4 ciphertext blocks.
 * \XMM1-\XMM4 hold the 4 (byte-swapped) blocks; each is multiplied by the
 * matching power of the hash key (HashKey_4 .. HashKey) via Karatsuba, the
 * products are accumulated, and the 256-bit sum is reduced mod
 * x^128 + x^127 + x^126 + x^121 + 1. The reduced GHASH value is left in
 * \XMMDst. \TMP1-\TMP7 and \XMM1-\XMM4 are clobbered.
 */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey_4 (using Karatsuba); copy in TMP6

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2		# swap hi/lo qwords
	pxor	  \XMM1, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba); copy in TMP1

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba); copy in TMP1

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba); copy in TMP1
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2			# packed left shifting << 31
	pslld	  $30, \TMP3			# packed left shifting << 30
	pslld	  $25, \TMP4			# packed left shifting << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst		# reduced result is in XMMDst
.endm
1233
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001234
/* Encryption of a single block
 * \XMM0 holds the plaintext block on entry and the encrypted block on exit.
 * \TMP1 is scratch for the round keys.
 * %arg1 = expanded key schedule; the round count is derived from keysize.
 * uses eax & r10
 */

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0		# round 0: XOR with key[0]
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13 rounds
	lea		16(%arg1), %r10		# get first expanded key address

# \@ makes the label unique per expansion — this macro is used repeatedly
_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0		# final round
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001257/*****************************************************************************
1258* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1259* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1260* const u8 *in, // Ciphertext input
1261* u64 plaintext_len, // Length of data in bytes for decryption.
1262* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1263* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1264* // concatenated with 0x00000001. 16-byte aligned pointer.
1265* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1266* const u8 *aad, // Additional Authentication Data (AAD)
1267* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1268* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1269* // given authentication tag and only return the plaintext if they match.
1270* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1271* // (most likely), 12 or 8.
1272*
1273* Assumptions:
1274*
1275* keys:
1276* keys are pre-expanded and aligned to 16 bytes. we are using the first
1277* set of 11 keys in the data structure void *aes_ctx
1278*
1279* iv:
1280* 0 1 2 3
1281* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1282* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1283* | Salt (From the SA) |
1284* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1285* | Initialization Vector |
1286* | (This is the sequence number from IPSec header) |
1287* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1288* | 0x1 |
1289* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1290*
1291*
1292*
1293* AAD:
1294* AAD padded to 128 bits with 0
1295* for example, assume AAD is a u32 vector
1296*
1297* if AAD is 8 bytes:
1298* AAD[3] = {A0, A1};
1299* padded AAD in xmm register = {A1 A0 0 0}
1300*
1301* 0 1 2 3
1302* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1303* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1304* | SPI (A1) |
1305* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1306* | 32-bit Sequence Number (A0) |
1307* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1308* | 0x0 |
1309* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1310*
1311* AAD Format with 32-bit Sequence Number
1312*
1313* if AAD is 12 bytes:
1314* AAD[3] = {A0, A1, A2};
1315* padded AAD in xmm register = {A2 A1 A0 0}
1316*
1317* 0 1 2 3
1318* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1319* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1320* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1321* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1322* | SPI (A2) |
1323* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1324* | 64-bit Extended Sequence Number {A1,A0} |
1325* | |
1326* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1327* | 0x0 |
1328* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1329*
1330* AAD Format with 64-bit Extended Sequence Number
1331*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001332* poly = x^128 + x^127 + x^126 + x^121 + 1
1333*
1334*****************************************************************************/
ENTRY(aesni_gcm_dec)
	# Register roles for the body:
	#   %r14 = saved %rsp (frame base), %r13 = bytes remaining,
	#   %r12 = scratch, %r11 = data offset into src/dst,
	#   %xmm13 = HashKey<<1 (mod poly), %xmm8 = GHASH accumulator,
	#   %xmm0 = counter block Yi.
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14	# save %rsp; restored verbatim in the epilogue
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp	# local area for HashKey powers
	and	$~63, %rsp		# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13		# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13	# byte-swap HashKey into GHASH bit order


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13		# shift each qword left by 1
	psrlq	$63, %xmm2		# capture the carried-out bits
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2		# carry into the high qword
	psrldq	$8, %xmm1		# bit carried out of the whole 128 bits
	por	%xmm2, %xmm13

	# Reduction: conditionally XOR in POLY if the top bit carried out

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13	# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13	# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13	# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12	# number of initial blocks mod 4, in bytes
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	# Main loop: decrypt/GHASH 4 blocks per iteration
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0	# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Yn)

	# safely read the remaining %r13 (<16) ciphertext bytes into %xmm1
	lea	(%arg3,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12		# %r12 -> mask of %r13 one-bytes
	movdqa	%xmm1, %xmm2
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2		# keep only the real ciphertext bytes
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8		# GHASH the final partial ciphertext block
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

	# output %r13 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)		# write first 8 bytes
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)		# write one byte at a time
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	# GHASH the length block len(A)||len(C) and compute the tag
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
		# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# byte-swap GHASH result back

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0		# tag T = E(K, Y0) ^ GHASH
_return_T_decrypt:
	# write the first min(auth_tag_len, 16) tag bytes, 8/4/2/1 at a time
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$8, %r11
	jl	_T_4_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_4_decrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_123_decrypt:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_decrypt
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_decrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_decrypt:
	mov	%al, (%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp	# restore pre-alignment stack pointer
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001521
1522
1523/*****************************************************************************
1524* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1525* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1526* const u8 *in, // Plaintext input
1527* u64 plaintext_len, // Length of data in bytes for encryption.
1528* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1529* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1530* // concatenated with 0x00000001. 16-byte aligned pointer.
1531* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1532* const u8 *aad, // Additional Authentication Data (AAD)
1533* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1534* u8 *auth_tag, // Authenticated Tag output.
1535* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1536* // 12 or 8.
1537*
1538* Assumptions:
1539*
1540* keys:
1541* keys are pre-expanded and aligned to 16 bytes. we are using the
1542* first set of 11 keys in the data structure void *aes_ctx
1543*
1544*
1545* iv:
1546* 0 1 2 3
1547* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549* | Salt (From the SA) |
1550* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1551* | Initialization Vector |
1552* | (This is the sequence number from IPSec header) |
1553* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554* | 0x1 |
1555* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1556*
1557*
1558*
1559* AAD:
1560* AAD padded to 128 bits with 0
1561* for example, assume AAD is a u32 vector
1562*
1563* if AAD is 8 bytes:
1564* AAD[3] = {A0, A1};
1565* padded AAD in xmm register = {A1 A0 0 0}
1566*
1567* 0 1 2 3
1568* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1569* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570* | SPI (A1) |
1571* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572* | 32-bit Sequence Number (A0) |
1573* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574* | 0x0 |
1575* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1576*
1577* AAD Format with 32-bit Sequence Number
1578*
1579* if AAD is 12 bytes:
1580* AAD[3] = {A0, A1, A2};
1581* padded AAD in xmm register = {A2 A1 A0 0}
1582*
1583* 0 1 2 3
1584* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1585* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586* | SPI (A2) |
1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588* | 64-bit Extended Sequence Number {A1,A0} |
1589* | |
1590* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591* | 0x0 |
1592* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593*
1594* AAD Format with 64-bit Extended Sequence Number
1595*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001596* poly = x^128 + x^127 + x^126 + x^121 + 1
1597***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14		# remember caller %rsp; restored on exit
#
# NOTE: the states of %xmm6..%xmm15 are not saved here;
# all %xmm registers are clobbered by this function.
#
	sub	$VARIABLE_OFFSET, %rsp	# local scratch (HashKey table etc.)
	and	$~63, %rsp		# 64-byte align the scratch area
	mov	%arg6, %r12		# %arg6 = H (hash subkey), byte-swap it
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the HashKey -- needed for GHASH.

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13		# shift the 128-bit HashKey left by one:
	psrlq	$63, %xmm2		# carry bits of each qword ...
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1		# %xmm1 = carry out of the high qword
	por	%xmm2, %xmm13		# fold low-qword carry into the high qword

	# reduce HashKey<<1 modulo the GHASH polynomial

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2	# %xmm2 = poly if a carry came out, else 0
	pxor	%xmm2, %xmm13		# %xmm13 = HashKey<<1 (mod poly)
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13		# %arg4 = plaintext length
	and	$-16, %r13		# %r13 = length rounded down to 16
	mov	%r13, %r12

	# Encrypt the first 0-3 blocks so the main loop works on multiples of 4

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop: encrypt + GHASH four blocks at a time

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the final partial (<16 byte) block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)

	lea	(%arg3,%r11,1), %r10		# %r10 -> remaining plaintext
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# mask that keeps the low %r13 bytes and clears the top 16-%r13 bytes
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output byte order as ciphertext

	# Emit the final %r13 ciphertext bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2 , %r11, 1)	# write 8 bytes in one go
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)	# byte-at-a-time tail store
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert to bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax  = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15	# Encrypt(K, Y0)
	pxor	%xmm8, %xmm0		# tag = E(K,Y0) XOR GHASH
_return_T_encrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$8, %r11
	jl	_T_4_encrypt
_T_8_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)		# copy 8 tag bytes
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_4_encrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)		# copy 4 tag bytes
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_123_encrypt:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_encrypt
	mov	%ax, (%r10)		# copy 2 tag bytes
	cmp	$2, %r11
	je	_return_T_done_encrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_encrypt:
	mov	%al, (%r10)		# copy last tag byte
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)		# full 16-byte tag
_return_T_done_encrypt:
	mov	%r14, %rsp		# undo alignment/scratch adjustment
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001788
Mathias Krause559ad0f2010-11-29 08:35:39 +08001789#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001790
1791
.align 4
_key_expansion_128:
_key_expansion_256a:
	# Derive the next round key from %xmm0 (prev round key) and %xmm1
	# (AESKEYGENASSIST output).  %xmm4 must be zero on entry (the caller,
	# aesni_set_key, clears it once up front); it is used as scratch here.
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast keygen word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0			# xor in prev words, shifted
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# mix in the keygen word
	movaps %xmm0, (TKEYP)			# append round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001806
.align 4
_key_expansion_192a:
	# AES-192 key schedule step: consumes %xmm0/%xmm2 (previous six key
	# words) and %xmm1 (AESKEYGENASSIST output), emits TWO 16-byte round
	# keys.  %xmm4 is assumed zero on entry; %xmm3/%xmm5/%xmm6 are scratch.
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygen word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# %xmm0 = next four key words

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2			# %xmm2 = remaining two key words

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)			# first emitted round key
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)		# second emitted round key
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001831
.align 4
_key_expansion_192b:
	# Companion AES-192 step to _key_expansion_192a; emits ONE round key.
	# Inputs/clobbers mirror _key_expansion_192a (%xmm4 assumed zero).
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygen word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# %xmm0 = next four key words

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2			# update trailing two key words

	movaps %xmm0, (TKEYP)			# append round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001851
.align 4
_key_expansion_256b:
	# AES-256 odd-round step: updates the second key half in %xmm2 using
	# the AESKEYGENASSIST result in %xmm1.  %xmm4 must be zero on entry.
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast keygen word
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2			# %xmm2 = next round key
	movaps %xmm2, (TKEYP)			# append round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001864
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 *
 * Expands the user key into the full encryption key schedule at the start
 * of ctx and the (inverse-mixed) decryption schedule at offset 240, and
 * stores key_len at offset 480.  Always returns 0 (in AREG).
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)		# round 0 key = user key
	lea 0x10(KEYP), TKEYP		# schedule write cursor
	movl %edx, 480(KEYP)		# remember key length
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	# AES-256: second half of the user key seeds the schedule too
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key (8 bytes)
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule at offset 240: first/last round keys
	# are swapped, the middle ones get AESIMC (InvMixColumns) applied.
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001979
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt a single 16-byte block: dst = AES-Enc(ctx, src).
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length selects round count
	movups (INP), STATE		# load input block (unaligned ok)
	call _aesni_enc1
	movups STATE, (OUTP)		# store ciphertext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002003
/*
 * _aesni_enc1: internal ABI
 * Encrypt the single block in STATE with the schedule at KEYP.
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	round count (derived from key length)
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# round 0 key
	mov KEYP, TKEYP
	pxor KEY, STATE			# whitening (round 0)
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128			# 128-bit key: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .Lenc192			# 192-bit key: 12 rounds
	# 256-bit key: two extra rounds before falling through
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# final round (no MixColumns)
	ret
ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002061
/*
 * _aesni_enc4: internal ABI
 * Encrypt four independent blocks in parallel (same key schedule) to
 * exploit the pipelined AESENC unit.
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	round count
 *	STATE1..STATE4:	initial states (input)
 * output:
 *	STATE1..STATE4:	final states (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# round 0 key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# whitening (round 0)
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128			# 128-bit key: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .L4enc192			# 192-bit key: 12 rounds
	# 256-bit key: two extra rounds before falling through
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002170
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt a single 16-byte block: dst = AES-Dec(ctx, src).  The inverse
 * key schedule lives at ctx offset 240 (built by aesni_set_key).
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length selects round count
	add $240, KEYP			# switch to the decryption schedule
	movups (INP), STATE		# load input block (unaligned ok)
	call _aesni_dec1
	movups STATE, (OUTP)		# store plaintext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002195
/*
 * _aesni_dec1: internal ABI
 * Decrypt the single block in STATE with the inverse schedule at KEYP.
 * input:
 *	KEYP:	key struct pointer (decryption schedule)
 *	KLEN:	key length (selects round count)
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# round 0 key
	mov KEYP, TKEYP
	pxor KEY, STATE			# whitening (round 0)
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128			# 128-bit key: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .Ldec192			# 192-bit key: 12 rounds
	# 256-bit key: two extra rounds before falling through
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# final round (no InvMixColumns)
	ret
ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002253
/*
 * _aesni_dec4: internal ABI
 * Decrypt four independent blocks in parallel (same inverse schedule) to
 * exploit the pipelined AESDEC unit.
 * input:
 *	KEYP:	key struct pointer (decryption schedule)
 *	KLEN:	key length (selects round count)
 *	STATE1..STATE4:	initial states (input)
 * output:
 *	STATE1..STATE4:	final states (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# round 0 key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# whitening (round 0)
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128			# 128-bit key: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .L4dec192			# 192-bit key: 12 rounds
	# 256-bit key: two extra rounds before falling through
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002362
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt len bytes (processed in whole 16-byte blocks; a trailing
 * partial block is ignored).  Uses the 4-way path while >= 64 bytes
 * remain, then single blocks.
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# nothing to do for len == 0
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN			# less than one block: done
	jb .Lecb_enc_ret
	cmp $64, LEN			# fewer than four blocks left?
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1		# four-block fast path
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1		# single-block tail loop
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002422
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-decrypt len bytes from src to dst: four blocks per iteration when
 * possible, then single blocks.  A trailing partial block is ignored.
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: fetch stack arguments after saving clobbered registers */
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# len == 0: done
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN		# key length field at ctx+480
	add $240, KEYP			# decryption key schedule lives at ctx+240
	cmp $16, LEN			# less than one full block?
	jb .Lecb_dec_ret
	cmp $64, LEN			# 4-block fast path available?
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1		# load four ciphertext blocks
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4		# decrypt all four in parallel
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	add $64, INP
	add $64, OUTP
	sub $64, LEN
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1		# single-block tail loop
	call _aesni_dec1
	movups STATE1, (OUTP)
	add $16, INP
	add $16, OUTP
	sub $16, LEN
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002483
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC encryption chains each ciphertext block into the next, so it is
 * inherently serial: only a one-block-at-a-time loop exists.  The final
 * ciphertext block is written back to *iv for chaining across calls.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: load stack arguments into the register aliases */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN			# need at least one full block
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE		# chaining value starts as the IV
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# P[i]
	pxor IN, STATE			# P[i] ^ C[i-1]
	call _aesni_enc1		# C[i] = E(P[i] ^ C[i-1])
	movups STATE, (OUTP)
	add $16, INP
	add $16, OUTP
	sub $16, LEN
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)		# save last block as the next IV
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002527
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC decryption parallelizes: P[i] = D(C[i]) ^ C[i-1], so four blocks
 * are decrypted at once on the fast path.  The last ciphertext block is
 * stored back to *iv for chaining.  A trailing partial block is ignored.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: fetch stack arguments; FRAME_OFFSET covers FRAME_BEGIN's push */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN			# need at least one full block
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN		# key length field at ctx+480
	add $240, KEYP			# decryption key schedule at ctx+240
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1		# keep ciphertext: needed for chaining
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* i386 has only 8 XMM regs: reuse IN1/IN2 for blocks 3 and 4 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# P[0] = D(C[0]) ^ prev IV
#ifdef __x86_64__
	pxor IN1, STATE2		# P[i] = D(C[i]) ^ C[i-1]
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# C[3] chains into the next iteration
#else
	pxor IN1, STATE4		# IN1 currently holds C[2]
	movaps IN2, IV			# IN2 holds C[3]: next chaining value
	movups (INP), IN1		# reload C[0]/C[1], clobbered above
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN		# single-block tail loop
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE			# un-chain with previous ciphertext
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext chains the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# hand the chaining value back to the caller
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002620
#ifdef __x86_64__
.pushsection .rodata
.align 16
/* PSHUFB shuffle mask that reverses the byte order of an XMM register
 * (big endian <-> little endian of a 128-bit value) */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002627
/*
 * _aesni_inc_init: internal ABI
 * Set up the registers used by _aesni_inc (CTR-mode counter stepping).
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR (mirrored in a GPR so _aesni_inc
 *		  can detect low-qword carry cheaply)
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# byte-swap big-endian IV into CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in the low qword
	MOVQ_R64_XMM CTR TCTR_LOW	# TCTR_LOW = low qword of CTR
	ret
ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002649
/*
 * _aesni_inc: internal ABI
 * Increase IV by 1; IV is in big endian.
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR			# add 1 to the low qword of CTR
	add $1, TCTR_LOW		# mirror the add in a GPR to see the carry
	jnc .Linc_low			# no carry: low qword did not wrap
	pslldq $8, INC			# carry: move the 1 into the high qword,
	paddq INC, CTR			# propagate it,
	psrldq $8, INC			# and restore INC == 1 in the low qword
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# convert back to big endian
	ret
ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002678
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode (x86-64 only): encrypt successive counter values and XOR the
 * keystream into the input.  Four counter blocks are processed in
 * parallel when len allows; the advanced counter is stored back to *iv.
 * A trailing partial block (len % 16) is not processed.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN			# need at least one full block
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init		# prime CTR/TCTR_LOW/INC/BSWAP_MASK
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1		# grab four consecutive counter values,
	call _aesni_inc			# loading the matching input block while
	movups (INP), IN1		# each increment is in flight
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4		# keystream for all four blocks
	pxor IN1, STATE1		# ciphertext = keystream ^ input
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	add $64, INP
	add $64, OUTP
	sub $64, LEN
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE		# single-block tail loop
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	add $16, INP
	add $16, OUTP
	sub $16, LEN
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# return the advanced counter
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002741
/*
 * _aesni_gf128mul_x_ble: internal ABI
 * Multiply in GF(2^128) for XTS IVs: step the tweak to the next block.
 * The 128-bit value is shifted left one bit (paddq IV, IV shifts each
 * 64-bit lane); the pshufd/psrad pair turns the lanes' carried-out top
 * bits into all-ones dword masks, which pand/pxor use to fold in the
 * reduction constant / inter-lane carry from GF128MUL_MASK.
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2759
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * Process eight consecutive XTS blocks (128 bytes).  Each block is
 * whitened with its tweak, run through AES, and whitened again; the
 * tweak advances via _aesni_gf128mul_x_ble() between blocks.  The dst
 * buffer doubles as scratch space for the tweaks while the AES calls
 * are in flight.  The advanced tweak is stored back to *iv on return.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	/*
	 * Branchlessly pick the 4-block primitive and key schedule:
	 * enc != 0: %r11 = _aesni_enc4, key offset 0;
	 * enc == 0: %r11 = _aesni_dec4, key offset 240 (decryption keys).
	 */
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx
	cmoveq %rax, %r11

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP		# 0 for enc, 240 for dec (selected above)

	/* blocks 0-3: whiten input with its tweak, park the tweak in dst */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11	# retpoline-safe call to _aesni_enc4/_aesni_dec4

	/* finish block 0 (xor parked tweak back in), start block 4 */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	/* finish block 1, start block 5 */
	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	/* finish block 2, start block 6 */
	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	/* finish block 3, start block 7 */
	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)	# hand the advanced tweak back to the caller

	CALL_NOSPEC %r11	# second 4-block AES pass (blocks 4-7)

	/* finish blocks 4-7 with their parked tweaks */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2869
Mathias Krause0d258ef2010-11-27 16:34:46 +08002870#endif