blob: 6bd2c6c95373f58dc8f915f5e50967105bf404d0 [file] [log] [blame]
Huang Ying54b6a1b2009-01-18 16:28:34 +11001/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040012 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
Mathias Krause0d258ef2010-11-27 16:34:46 +080023 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
Huang Ying54b6a1b2009-01-18 16:28:34 +110026 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080033#include <asm/inst.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110034
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can done for either FP or integer values, for FP use
 * movaps (move aligned packed single) or integer use movdqa (move double quad
 * aligned).  It doesn't make a performance difference which instruction is used
 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
 * shorter, so that is the one we'll use for now. (same for unaligned).
 */
#define MOVADQ	movaps		/* 16-byte move, source must be 16-byte aligned */
#define MOVUDQ	movups		/* 16-byte move, no alignment requirement */
45
#ifdef __x86_64__

/*
 * GCM constants.  Every one of these is only ever read (loaded or used as a
 * pshufb/paddd/pxor source), never written, so keep them in .rodata instead
 * of the writable .data section: this lets the kernel map them read-only.
 */
.section	.rodata
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
POLY:   .octa 0xC2000000000000000000000000000001	# GHASH poly (bit-reflected)
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F	# byte-reflection mask
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


/* Offsets (in bytes) of the HashKey powers kept on the aligned stack area. */
#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

/* Register args per the SysV AMD64 ABI; args 7-10 come in on the stack. */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
/* NOTE(review): stack args addressed via %r14 — presumably the GCM entry
 * code saves the original %rsp in %r14; that prologue is outside this chunk,
 * confirm before relying on it. */
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
/* key-length field follows the two 15-round-key schedules (2*15*16 bytes)
 * in the AES context pointed to by arg1 */
#define keysize	2*15*16(%arg1)
#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400105
106
/* XMM register roles for the non-GCM AES routines in this file. */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10	/* reuses BSWAP_MASK's register */

#ifdef __x86_64__
/* 64-bit: one GPR per role. */
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
/* 32-bit: fewer GPRs, so OUTP/UKEYP alias AREG and TKEYP aliases T1. */
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100151
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400152
Mathias Krause559ad0f2010-11-29 08:35:39 +0800153#ifdef __x86_64__
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400154/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
155*
156*
157* Input: A and B (128-bits each, bit-reflected)
158* Output: C = A*B*x mod poly, (i.e. >>1 )
159* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
160* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
161*
162*/
/* All operands are XMM registers; only the listed TMPs are used as scratch. */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba: split each 128-bit operand into 64-bit halves
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = middle term (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform independent shifts
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >> 1
	psrld	  $2,\TMP3		# packed right shift >> 2
	psrld	  $7,\TMP4		# packed right shift >> 7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm
213
214/*
215* if a = number of total plaintext bytes
216* b = floor(a/16)
217* num_initial_blocks = b mod 4
218* encrypt the initial num_initial_blocks blocks and apply ghash on
219* the ciphertext
220* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
221* are clobbered
222* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
223*/
224
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400225
/*
 * Fold the AAD into the GHASH state, CTR-process the first
 * \num_initial_blocks blocks, and — when >= 64 bytes remain (%r13) —
 * precompute HashKey^2..^4 and their Karatsuba XOR-halves on the stack
 * in parallel with running the first 4 full blocks through AES.
 * Clobbers %r10-%r12, %rax/%eax, %xmm14 (holds SHUF_MASK) and the xmm
 * registers selected by \i/\i_seq; \operation uniquifies label names.
 */
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i

	# accumulate the AAD 4 bytes at a time into %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation

	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation

	# aadLen < 16: shift the accumulated bytes down into place
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation

_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   (%arg1),\TMP2	# round-0 key
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index	# round-0 AddRoundKey
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_dec\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	# decrypt: GHASH runs over the *ciphertext* just read into TMP1
	movdqa	   \TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
*/
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1	# round-0 key
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	# decrypt: keep the ciphertext (TMP1) in XMM1-4 for GHASH
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
437
438
439/*
440* if a = number of total plaintext bytes
441* b = floor(a/16)
442* num_initial_blocks = b mod 4
443* encrypt the initial num_initial_blocks blocks and apply ghash on
444* the ciphertext
445* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
446* are clobbered
447* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
448*/
449
450
/*
 * Encrypt-side twin of INITIAL_BLOCKS_DEC: fold the AAD into the GHASH
 * state, CTR-encrypt the first \num_initial_blocks blocks, and — when
 * >= 64 bytes remain (%r13) — precompute HashKey^2..^4 (+ Karatsuba
 * XOR-halves) on the stack in parallel with encrypting the first 4 full
 * blocks.  Unlike the DEC variant, GHASH here runs over the freshly
 * produced ciphertext (%xmm\index), so no reload from TMP1 is needed.
 * Clobbers %r10-%r12, %rax/%eax, %xmm14 and the xmm regs named by \i/\i_seq.
 */
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	# aadLen < 16: shift the accumulated bytes down into place
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   0(%arg1),\TMP2	# round-0 key
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index	# round-0 AddRoundKey
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_enc\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1	# round-0 key
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
655
656/*
657* encrypt 4 blocks at a time
658* ghash the 4 previously encrypted ciphertext blocks
659* arg1, %arg2, %arg3 are used as pointers only, not modified
660* %r11 is the data offset value
661*/
/*
 * CTR-encrypt 4 counter blocks while GHASHing the 4 *previous* ciphertext
 * blocks (XMM1-4 on entry), interleaving one Karatsuba PCLMULQDQ multiply
 * per AES round to hide latency.  Uses HashKey..HashKey_4 (+ _k halves)
 * precomputed on the stack.  Clobbers %r10, %rax/%eax, %xmm15 (SHUF_MASK)
 * and all TMP/XMM macro arguments.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	# save the previous 4 ciphertext blocks for GHASH
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	# round-0 AddRoundKey for all 4 counter blocks
	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1	# fold GHASH result into next round's XMM1
.endm
863
/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*
* Note: the AES-round loop labels carry the \@ suffix so that the macro
* can be expanded more than once without duplicate-label assembler errors
* (same convention as _esb_loop_\@ in ENCRYPT_SINGLE_BLOCK).
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

# Save the incoming ciphertext blocks; XMM5-XMM8 feed the four GHASH
# Karatsuba multiplications while XMM1-XMM4 are reused for the new counters.
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply XMM5 * HashKey_4 using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# round-0 key (whitening)
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply XMM7 * HashKey_2 using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1			# keep ciphertext for next GHASH
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2			# keep ciphertext for next GHASH
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3			# keep ciphertext for next GHASH
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4			# keep ciphertext for next GHASH
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5
	pxor	  \XMM5, \XMM1	# fold GHASH result into next input block
.endm
1075
/* GHASH the last 4 ciphertext blocks.
 * Inputs:  XMM1-XMM4 = the four byte-swapped ciphertext blocks,
 *          HashKey..HashKey_4 (+ their _k halves) precomputed on the stack.
 * Output:  XMMDst = accumulated, reduced GHASH value.
 * Clobbers TMP1-TMP7 and XMM1-XMM4.
 */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey_4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst		# reduced result is in XMMDst
.endm
1172
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001173
/* Encrypt one block in place: \XMM0 = AES-K(\XMM0).
 * The expanded key schedule is read from (%arg1); the round count is
 * derived from the file-level `keysize` variable (128/192/256-bit keys).
 * Clobbers %eax, %r10 and \TMP1.
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	lea	16(%arg1), %r10		# %r10 -> round-1 key
	pxor	(%arg1), \XMM0		# whitening: XOR in round-0 key
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# middle rounds: 128->9, 192->11, 256->13

_esb_round_loop_\@:
	MOVADQ	(%r10),\TMP1
	AESENC	\TMP1,\XMM0		# one full middle round
	add	$16,%r10		# advance to next round key
	sub	$1,%eax
	jnz	_esb_round_loop_\@

	MOVADQ	(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0	# final round (no MixColumns)
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001196/*****************************************************************************
1197* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1198* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1199* const u8 *in, // Ciphertext input
1200* u64 plaintext_len, // Length of data in bytes for decryption.
1201* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1202* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1203* // concatenated with 0x00000001. 16-byte aligned pointer.
1204* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1205* const u8 *aad, // Additional Authentication Data (AAD)
1206* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1207* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1208* // given authentication tag and only return the plaintext if they match.
1209* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1210* // (most likely), 12 or 8.
1211*
1212* Assumptions:
1213*
1214* keys:
1215* keys are pre-expanded and aligned to 16 bytes. we are using the first
1216* set of 11 keys in the data structure void *aes_ctx
1217*
1218* iv:
1219* 0 1 2 3
1220* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1221* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1222* | Salt (From the SA) |
1223* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1224* | Initialization Vector |
1225* | (This is the sequence number from IPSec header) |
1226* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1227* | 0x1 |
1228* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1229*
1230*
1231*
1232* AAD:
1233* AAD padded to 128 bits with 0
1234* for example, assume AAD is a u32 vector
1235*
1236* if AAD is 8 bytes:
1237* AAD[3] = {A0, A1};
1238* padded AAD in xmm register = {A1 A0 0 0}
1239*
1240* 0 1 2 3
1241* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1242* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1243* | SPI (A1) |
1244* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1245* | 32-bit Sequence Number (A0) |
1246* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1247* | 0x0 |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249*
1250* AAD Format with 32-bit Sequence Number
1251*
1252* if AAD is 12 bytes:
1253* AAD[3] = {A0, A1, A2};
1254* padded AAD in xmm register = {A2 A1 A0 0}
1255*
1256* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1261* | SPI (A2) |
1262* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1263* | 64-bit Extended Sequence Number {A1,A0} |
1264* | |
1265* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1266* | 0x0 |
1267* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1268*
1269* AAD Format with 64-bit Extended Sequence Number
1270*
1271* aadLen:
1272* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1273* The code supports 16 too but for other sizes, the code will fail.
1274*
1275* TLen:
1276* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1277* For other sizes, the code will fail.
1278*
1279* poly = x^128 + x^127 + x^126 + x^121 + 1
1280*
1281*****************************************************************************/
ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14		# save %rsp so the 64-byte realign below is undoable
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13		# byte-swap HashKey to bit-reflected form


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13			# 128-bit shift left by 1; %xmm1 = carried-out bit

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2		# conditionally XOR in the poly if a bit carried out
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)		# store HashKey<<1 (mod poly)
	mov	%arg4, %r13	# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13	# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12			# %r12 = (full blocks) mod 4, in units of 16 bytes
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt	# no full 4-block groups remain
	sub	$64, %r13
	je	_four_cipher_left_decrypt	# exactly one 4-block group remains
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0		# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes

	movdqa	%xmm1, %xmm2			# keep raw ciphertext for GHASH
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	          # GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	# emit remaining plaintext one byte at a time from %rax
	mov	%al,  (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	shl	$3, %arg4		  # len(C) in bits (*8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	         # final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		  # %rax = *Y0
	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0		  # T = GHASH XOR E(K, Y0)
_return_T_decrypt:
	mov	arg9, %r10		  # %r10 = authTag
	mov	arg10, %r11		  # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$12, %r11
	je	_T_12_decrypt
_T_8_decrypt:
	# write low 8 bytes of the tag
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_decrypt
_T_12_decrypt:
	# write low 8 bytes, then the next 4 bytes of the tag
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp		  # restore pre-alignment stack pointer
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001457
1458
1459/*****************************************************************************
1460* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1461* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1462* const u8 *in, // Plaintext input
1463* u64 plaintext_len, // Length of data in bytes for encryption.
1464* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1465* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1466* // concatenated with 0x00000001. 16-byte aligned pointer.
1467* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1468* const u8 *aad, // Additional Authentication Data (AAD)
1469* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1470* u8 *auth_tag, // Authenticated Tag output.
1471* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1472* // 12 or 8.
1473*
1474* Assumptions:
1475*
1476* keys:
1477* keys are pre-expanded and aligned to 16 bytes. we are using the
1478* first set of 11 keys in the data structure void *aes_ctx
1479*
1480*
1481* iv:
1482* 0 1 2 3
1483* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1484* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1485* | Salt (From the SA) |
1486* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1487* | Initialization Vector |
1488* | (This is the sequence number from IPSec header) |
1489* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1490* | 0x1 |
1491* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1492*
1493*
1494*
1495* AAD:
1496* AAD padded to 128 bits with 0
1497* for example, assume AAD is a u32 vector
1498*
1499* if AAD is 8 bytes:
1500* AAD[3] = {A0, A1};
1501* padded AAD in xmm register = {A1 A0 0 0}
1502*
1503* 0 1 2 3
1504* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1505* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1506* | SPI (A1) |
1507* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1508* | 32-bit Sequence Number (A0) |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 32-bit Sequence Number
1514*
1515* if AAD is 12 bytes:
1516* AAD[3] = {A0, A1, A2};
1517* padded AAD in xmm register = {A2 A1 A0 0}
1518*
1519* 0 1 2 3
1520* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1521* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1522* | SPI (A2) |
1523* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1524* | 64-bit Extended Sequence Number {A1,A0} |
1525* | |
1526* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1527* | 0x0 |
1528* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1529*
1530* AAD Format with 64-bit Extended Sequence Number
1531*
1532* aadLen:
1533* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1534* The code supports 16 too but for other sizes, the code will fail.
1535*
1536* TLen:
1537* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1538* For other sizes, the code will fail.
1539*
1540* poly = x^128 + x^127 + x^126 + x^121 + 1
1541***************************************************************************/
1542ENTRY(aesni_gcm_enc)
1543 push %r12
1544 push %r13
1545 push %r14
1546 mov %rsp, %r14
1547#
1548# states of %xmm registers %xmm6:%xmm15 not saved
1549# all %xmm registers are clobbered
1550#
1551 sub $VARIABLE_OFFSET, %rsp
1552 and $~63, %rsp
1553 mov %arg6, %r12
1554 movdqu (%r12), %xmm13
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001555 movdqa SHUF_MASK(%rip), %xmm2
1556 PSHUFB_XMM %xmm2, %xmm13
1557
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001558
1559# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1560
1561 movdqa %xmm13, %xmm2
1562 psllq $1, %xmm13
1563 psrlq $63, %xmm2
1564 movdqa %xmm2, %xmm1
1565 pslldq $8, %xmm2
1566 psrldq $8, %xmm1
1567 por %xmm2, %xmm13
1568
1569 # reduce HashKey<<1
1570
1571 pshufd $0x24, %xmm1, %xmm2
1572 pcmpeqd TWOONE(%rip), %xmm2
1573 pand POLY(%rip), %xmm2
1574 pxor %xmm2, %xmm13
1575 movdqa %xmm13, HashKey(%rsp)
1576 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
1577 and $-16, %r13
1578 mov %r13, %r12
1579
1580 # Encrypt first few blocks
1581
1582 and $(3<<4), %r12
1583 jz _initial_num_blocks_is_0_encrypt
1584 cmp $(2<<4), %r12
1585 jb _initial_num_blocks_is_1_encrypt
1586 je _initial_num_blocks_is_2_encrypt
1587_initial_num_blocks_is_3_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001588 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001589%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1590 sub $48, %r13
1591 jmp _initial_blocks_encrypted
1592_initial_num_blocks_is_2_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001593 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001594%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1595 sub $32, %r13
1596 jmp _initial_blocks_encrypted
1597_initial_num_blocks_is_1_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001598 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001599%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1600 sub $16, %r13
1601 jmp _initial_blocks_encrypted
1602_initial_num_blocks_is_0_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001603 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001604%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1605_initial_blocks_encrypted:
1606
1607 # Main loop - Encrypt remaining blocks
1608
1609 cmp $0, %r13
1610 je _zero_cipher_left_encrypt
1611 sub $64, %r13
1612 je _four_cipher_left_encrypt
1613_encrypt_by_4_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001614 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001615%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1616 add $64, %r11
1617 sub $64, %r13
1618 jne _encrypt_by_4_encrypt
1619_four_cipher_left_encrypt:
1620 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1621%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1622_zero_cipher_left_encrypt:
1623 mov %arg4, %r13
1624 and $15, %r13 # %r13 = arg4 (mod 16)
1625 je _multiple_of_16_bytes_encrypt
1626
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001627 # Handle the last <16 Byte block separately
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001628 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001629 movdqa SHUF_MASK(%rip), %xmm10
1630 PSHUFB_XMM %xmm10, %xmm0
1631
Tadeusz Struk60af5202011-03-13 16:56:17 +08001632
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001633 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1634 sub $16, %r11
1635 add %r13, %r11
1636 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1637 lea SHIFT_MASK+16(%rip), %r12
1638 sub %r13, %r12
1639 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1640 # (%r13 is the number of bytes in plaintext mod 16)
1641 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001642 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001643 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1644 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1645 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1646 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001647 movdqa SHUF_MASK(%rip), %xmm10
1648 PSHUFB_XMM %xmm10,%xmm0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001649
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001650 pxor %xmm0, %xmm8
1651 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1652 # GHASH computation for the last <16 byte block
1653 sub %r13, %r11
1654 add $16, %r11
Tadeusz Struk60af5202011-03-13 16:56:17 +08001655
1656 movdqa SHUF_MASK(%rip), %xmm10
1657 PSHUFB_XMM %xmm10, %xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001658
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001659 # shuffle xmm0 back to output as ciphertext
1660
1661 # Output %r13 bytes
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001662 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001663 cmp $8, %r13
1664 jle _less_than_8_bytes_left_encrypt
1665 mov %rax, (%arg2 , %r11, 1)
1666 add $8, %r11
1667 psrldq $8, %xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001668 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001669 sub $8, %r13
1670_less_than_8_bytes_left_encrypt:
1671 mov %al, (%arg2, %r11, 1)
1672 add $1, %r11
1673 shr $8, %rax
1674 sub $1, %r13
1675 jne _less_than_8_bytes_left_encrypt
1676_multiple_of_16_bytes_encrypt:
1677 mov arg8, %r12 # %r12 = addLen (number of bytes)
1678 shl $3, %r12
1679 movd %r12d, %xmm15 # len(A) in %xmm15
1680 shl $3, %arg4 # len(C) in bits (*128)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001681 MOVQ_R64_XMM %arg4, %xmm1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001682 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1683 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1684 pxor %xmm15, %xmm8
1685 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1686 # final GHASH computation
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001687 movdqa SHUF_MASK(%rip), %xmm10
1688 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001689
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001690 mov %arg5, %rax # %rax = *Y0
1691 movdqu (%rax), %xmm0 # %xmm0 = Y0
1692 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1693 pxor %xmm8, %xmm0
1694_return_T_encrypt:
1695 mov arg9, %r10 # %r10 = authTag
1696 mov arg10, %r11 # %r11 = auth_tag_len
1697 cmp $16, %r11
1698 je _T_16_encrypt
1699 cmp $12, %r11
1700 je _T_12_encrypt
1701_T_8_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001702 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001703 mov %rax, (%r10)
1704 jmp _return_T_done_encrypt
1705_T_12_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001706 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001707 mov %rax, (%r10)
1708 psrldq $8, %xmm0
1709 movd %xmm0, %eax
1710 mov %eax, 8(%r10)
1711 jmp _return_T_done_encrypt
1712_T_16_encrypt:
1713 movdqu %xmm0, (%r10)
1714_return_T_done_encrypt:
1715 mov %r14, %rsp
1716 pop %r14
1717 pop %r13
1718 pop %r12
1719 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001720ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001721
Mathias Krause559ad0f2010-11-29 08:35:39 +08001722#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001723
1724
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001725.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001726_key_expansion_128:
1727_key_expansion_256a:
1728 pshufd $0b11111111, %xmm1, %xmm1
1729 shufps $0b00010000, %xmm0, %xmm4
1730 pxor %xmm4, %xmm0
1731 shufps $0b10001100, %xmm0, %xmm4
1732 pxor %xmm4, %xmm0
1733 pxor %xmm1, %xmm0
Mathias Krause0d258ef2010-11-27 16:34:46 +08001734 movaps %xmm0, (TKEYP)
1735 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001736 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001737ENDPROC(_key_expansion_128)
1738ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001739
Mathias Krause0d258ef2010-11-27 16:34:46 +08001740.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001741_key_expansion_192a:
1742 pshufd $0b01010101, %xmm1, %xmm1
1743 shufps $0b00010000, %xmm0, %xmm4
1744 pxor %xmm4, %xmm0
1745 shufps $0b10001100, %xmm0, %xmm4
1746 pxor %xmm4, %xmm0
1747 pxor %xmm1, %xmm0
1748
1749 movaps %xmm2, %xmm5
1750 movaps %xmm2, %xmm6
1751 pslldq $4, %xmm5
1752 pshufd $0b11111111, %xmm0, %xmm3
1753 pxor %xmm3, %xmm2
1754 pxor %xmm5, %xmm2
1755
1756 movaps %xmm0, %xmm1
1757 shufps $0b01000100, %xmm0, %xmm6
Mathias Krause0d258ef2010-11-27 16:34:46 +08001758 movaps %xmm6, (TKEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001759 shufps $0b01001110, %xmm2, %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001760 movaps %xmm1, 0x10(TKEYP)
1761 add $0x20, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001762 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001763ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001764
Mathias Krause0d258ef2010-11-27 16:34:46 +08001765.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001766_key_expansion_192b:
1767 pshufd $0b01010101, %xmm1, %xmm1
1768 shufps $0b00010000, %xmm0, %xmm4
1769 pxor %xmm4, %xmm0
1770 shufps $0b10001100, %xmm0, %xmm4
1771 pxor %xmm4, %xmm0
1772 pxor %xmm1, %xmm0
1773
1774 movaps %xmm2, %xmm5
1775 pslldq $4, %xmm5
1776 pshufd $0b11111111, %xmm0, %xmm3
1777 pxor %xmm3, %xmm2
1778 pxor %xmm5, %xmm2
1779
Mathias Krause0d258ef2010-11-27 16:34:46 +08001780 movaps %xmm0, (TKEYP)
1781 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001782 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001783ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001784
Mathias Krause0d258ef2010-11-27 16:34:46 +08001785.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001786_key_expansion_256b:
1787 pshufd $0b10101010, %xmm1, %xmm1
1788 shufps $0b00010000, %xmm2, %xmm4
1789 pxor %xmm4, %xmm2
1790 shufps $0b10001100, %xmm2, %xmm4
1791 pxor %xmm4, %xmm2
1792 pxor %xmm1, %xmm2
Mathias Krause0d258ef2010-11-27 16:34:46 +08001793 movaps %xmm2, (TKEYP)
1794 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001795 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001796ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001797
1798/*
1799 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1800 * unsigned int key_len)
1801 */
1802ENTRY(aesni_set_key)
Mathias Krause0d258ef2010-11-27 16:34:46 +08001803#ifndef __x86_64__
1804 pushl KEYP
1805 movl 8(%esp), KEYP # ctx
1806 movl 12(%esp), UKEYP # in_key
1807 movl 16(%esp), %edx # key_len
1808#endif
1809 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1810 movaps %xmm0, (KEYP)
1811 lea 0x10(KEYP), TKEYP # key addr
1812 movl %edx, 480(KEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001813 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1814 cmp $24, %dl
1815 jb .Lenc_key128
1816 je .Lenc_key192
Mathias Krause0d258ef2010-11-27 16:34:46 +08001817 movups 0x10(UKEYP), %xmm2 # other user key
1818 movaps %xmm2, (TKEYP)
1819 add $0x10, TKEYP
Huang Yingb369e522009-11-23 19:54:06 +08001820 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001821 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001822 AESKEYGENASSIST 0x1 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001823 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001824 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001825 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001826 AESKEYGENASSIST 0x2 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001827 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001828 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001829 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001830 AESKEYGENASSIST 0x4 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001831 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001832 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001833 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001834 AESKEYGENASSIST 0x8 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001835 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001836 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001837 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001838 AESKEYGENASSIST 0x10 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001839 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001840 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001841 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001842 AESKEYGENASSIST 0x20 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001843 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001844 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001845 call _key_expansion_256a
1846 jmp .Ldec_key
1847.Lenc_key192:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001848 movq 0x10(UKEYP), %xmm2 # other user key
Huang Yingb369e522009-11-23 19:54:06 +08001849 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001850 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001851 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001852 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001853 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001854 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001855 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001856 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001857 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001858 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001859 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001860 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001861 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001862 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001863 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001864 call _key_expansion_192b
1865 jmp .Ldec_key
1866.Lenc_key128:
Huang Yingb369e522009-11-23 19:54:06 +08001867 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001868 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001869 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001870 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001871 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001872 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001873 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001874 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001875 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001876 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001877 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001878 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001879 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001880 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001881 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001882 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001883 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
Huang Ying54b6a1b2009-01-18 16:28:34 +11001884 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001885 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
Huang Ying54b6a1b2009-01-18 16:28:34 +11001886 call _key_expansion_128
1887.Ldec_key:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001888 sub $0x10, TKEYP
1889 movaps (KEYP), %xmm0
1890 movaps (TKEYP), %xmm1
1891 movaps %xmm0, 240(TKEYP)
1892 movaps %xmm1, 240(KEYP)
1893 add $0x10, KEYP
1894 lea 240-16(TKEYP), UKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001895.align 4
1896.Ldec_key_loop:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001897 movaps (KEYP), %xmm0
Huang Yingb369e522009-11-23 19:54:06 +08001898 AESIMC %xmm0 %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001899 movaps %xmm1, (UKEYP)
1900 add $0x10, KEYP
1901 sub $0x10, UKEYP
1902 cmp TKEYP, KEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001903 jb .Ldec_key_loop
Mathias Krause0d258ef2010-11-27 16:34:46 +08001904 xor AREG, AREG
1905#ifndef __x86_64__
1906 popl KEYP
1907#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001908 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001909ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001910
1911/*
1912 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1913 */
1914ENTRY(aesni_enc)
Mathias Krause0d258ef2010-11-27 16:34:46 +08001915#ifndef __x86_64__
1916 pushl KEYP
1917 pushl KLEN
1918 movl 12(%esp), KEYP
1919 movl 16(%esp), OUTP
1920 movl 20(%esp), INP
1921#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001922 movl 480(KEYP), KLEN # key length
1923 movups (INP), STATE # input
1924 call _aesni_enc1
1925 movups STATE, (OUTP) # output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001926#ifndef __x86_64__
1927 popl KLEN
1928 popl KEYP
1929#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001930 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001931ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001932
1933/*
1934 * _aesni_enc1: internal ABI
1935 * input:
1936 * KEYP: key struct pointer
1937 * KLEN: round count
1938 * STATE: initial state (input)
1939 * output:
1940 * STATE: finial state (output)
1941 * changed:
1942 * KEY
1943 * TKEYP (T1)
1944 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08001945.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001946_aesni_enc1:
1947 movaps (KEYP), KEY # key
1948 mov KEYP, TKEYP
1949 pxor KEY, STATE # round 0
1950 add $0x30, TKEYP
1951 cmp $24, KLEN
1952 jb .Lenc128
1953 lea 0x20(TKEYP), TKEYP
1954 je .Lenc192
1955 add $0x20, TKEYP
1956 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001957 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001958 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001959 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001960.align 4
1961.Lenc192:
1962 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001963 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001964 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001965 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001966.align 4
1967.Lenc128:
1968 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001969 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001970 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001971 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001972 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001973 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001974 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001975 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001976 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001977 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001978 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001979 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001980 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001981 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001982 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001983 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001984 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001985 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001986 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001987 AESENCLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001988 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001989ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001990
1991/*
1992 * _aesni_enc4: internal ABI
1993 * input:
1994 * KEYP: key struct pointer
1995 * KLEN: round count
1996 * STATE1: initial state (input)
1997 * STATE2
1998 * STATE3
1999 * STATE4
2000 * output:
2001 * STATE1: finial state (output)
2002 * STATE2
2003 * STATE3
2004 * STATE4
2005 * changed:
2006 * KEY
2007 * TKEYP (T1)
2008 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002009.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002010_aesni_enc4:
2011 movaps (KEYP), KEY # key
2012 mov KEYP, TKEYP
2013 pxor KEY, STATE1 # round 0
2014 pxor KEY, STATE2
2015 pxor KEY, STATE3
2016 pxor KEY, STATE4
2017 add $0x30, TKEYP
2018 cmp $24, KLEN
2019 jb .L4enc128
2020 lea 0x20(TKEYP), TKEYP
2021 je .L4enc192
2022 add $0x20, TKEYP
2023 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002024 AESENC KEY STATE1
2025 AESENC KEY STATE2
2026 AESENC KEY STATE3
2027 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002028 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002029 AESENC KEY STATE1
2030 AESENC KEY STATE2
2031 AESENC KEY STATE3
2032 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002033#.align 4
2034.L4enc192:
2035 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002036 AESENC KEY STATE1
2037 AESENC KEY STATE2
2038 AESENC KEY STATE3
2039 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002040 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002041 AESENC KEY STATE1
2042 AESENC KEY STATE2
2043 AESENC KEY STATE3
2044 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002045#.align 4
2046.L4enc128:
2047 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002048 AESENC KEY STATE1
2049 AESENC KEY STATE2
2050 AESENC KEY STATE3
2051 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002052 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002053 AESENC KEY STATE1
2054 AESENC KEY STATE2
2055 AESENC KEY STATE3
2056 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002057 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002058 AESENC KEY STATE1
2059 AESENC KEY STATE2
2060 AESENC KEY STATE3
2061 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002062 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002063 AESENC KEY STATE1
2064 AESENC KEY STATE2
2065 AESENC KEY STATE3
2066 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002067 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002068 AESENC KEY STATE1
2069 AESENC KEY STATE2
2070 AESENC KEY STATE3
2071 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002072 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002073 AESENC KEY STATE1
2074 AESENC KEY STATE2
2075 AESENC KEY STATE3
2076 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002077 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002078 AESENC KEY STATE1
2079 AESENC KEY STATE2
2080 AESENC KEY STATE3
2081 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002082 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002083 AESENC KEY STATE1
2084 AESENC KEY STATE2
2085 AESENC KEY STATE3
2086 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002087 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002088 AESENC KEY STATE1
2089 AESENC KEY STATE2
2090 AESENC KEY STATE3
2091 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002092 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002093 AESENCLAST KEY STATE1 # last round
2094 AESENCLAST KEY STATE2
2095 AESENCLAST KEY STATE3
2096 AESENCLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002097 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002098ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002099
2100/*
2101 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2102 */
2103ENTRY(aesni_dec)
Mathias Krause0d258ef2010-11-27 16:34:46 +08002104#ifndef __x86_64__
2105 pushl KEYP
2106 pushl KLEN
2107 movl 12(%esp), KEYP
2108 movl 16(%esp), OUTP
2109 movl 20(%esp), INP
2110#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002111 mov 480(KEYP), KLEN # key length
2112 add $240, KEYP
2113 movups (INP), STATE # input
2114 call _aesni_dec1
2115 movups STATE, (OUTP) #output
Mathias Krause0d258ef2010-11-27 16:34:46 +08002116#ifndef __x86_64__
2117 popl KLEN
2118 popl KEYP
2119#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002120 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002121ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002122
2123/*
2124 * _aesni_dec1: internal ABI
2125 * input:
2126 * KEYP: key struct pointer
2127 * KLEN: key length
2128 * STATE: initial state (input)
2129 * output:
2130 * STATE: finial state (output)
2131 * changed:
2132 * KEY
2133 * TKEYP (T1)
2134 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002135.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002136_aesni_dec1:
2137 movaps (KEYP), KEY # key
2138 mov KEYP, TKEYP
2139 pxor KEY, STATE # round 0
2140 add $0x30, TKEYP
2141 cmp $24, KLEN
2142 jb .Ldec128
2143 lea 0x20(TKEYP), TKEYP
2144 je .Ldec192
2145 add $0x20, TKEYP
2146 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002147 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002148 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002149 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002150.align 4
2151.Ldec192:
2152 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002153 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002154 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002155 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002156.align 4
2157.Ldec128:
2158 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002159 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002160 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002161 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002162 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002163 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002164 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002165 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002166 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002167 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002168 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002169 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002170 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002171 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002172 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002173 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002174 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002175 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002176 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002177 AESDECLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002178 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002179ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002180
2181/*
2182 * _aesni_dec4: internal ABI
2183 * input:
2184 * KEYP: key struct pointer
2185 * KLEN: key length
2186 * STATE1: initial state (input)
2187 * STATE2
2188 * STATE3
2189 * STATE4
2190 * output:
2191 * STATE1: finial state (output)
2192 * STATE2
2193 * STATE3
2194 * STATE4
2195 * changed:
2196 * KEY
2197 * TKEYP (T1)
2198 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002199.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002200_aesni_dec4:
2201 movaps (KEYP), KEY # key
2202 mov KEYP, TKEYP
2203 pxor KEY, STATE1 # round 0
2204 pxor KEY, STATE2
2205 pxor KEY, STATE3
2206 pxor KEY, STATE4
2207 add $0x30, TKEYP
2208 cmp $24, KLEN
2209 jb .L4dec128
2210 lea 0x20(TKEYP), TKEYP
2211 je .L4dec192
2212 add $0x20, TKEYP
2213 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002214 AESDEC KEY STATE1
2215 AESDEC KEY STATE2
2216 AESDEC KEY STATE3
2217 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002218 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002219 AESDEC KEY STATE1
2220 AESDEC KEY STATE2
2221 AESDEC KEY STATE3
2222 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002223.align 4
2224.L4dec192:
2225 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002226 AESDEC KEY STATE1
2227 AESDEC KEY STATE2
2228 AESDEC KEY STATE3
2229 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002230 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002231 AESDEC KEY STATE1
2232 AESDEC KEY STATE2
2233 AESDEC KEY STATE3
2234 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002235.align 4
2236.L4dec128:
2237 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002238 AESDEC KEY STATE1
2239 AESDEC KEY STATE2
2240 AESDEC KEY STATE3
2241 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002242 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002243 AESDEC KEY STATE1
2244 AESDEC KEY STATE2
2245 AESDEC KEY STATE3
2246 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002247 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002248 AESDEC KEY STATE1
2249 AESDEC KEY STATE2
2250 AESDEC KEY STATE3
2251 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002252 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002253 AESDEC KEY STATE1
2254 AESDEC KEY STATE2
2255 AESDEC KEY STATE3
2256 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002257 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002258 AESDEC KEY STATE1
2259 AESDEC KEY STATE2
2260 AESDEC KEY STATE3
2261 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002262 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002263 AESDEC KEY STATE1
2264 AESDEC KEY STATE2
2265 AESDEC KEY STATE3
2266 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002267 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002268 AESDEC KEY STATE1
2269 AESDEC KEY STATE2
2270 AESDEC KEY STATE3
2271 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002272 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002273 AESDEC KEY STATE1
2274 AESDEC KEY STATE2
2275 AESDEC KEY STATE3
2276 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002277 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002278 AESDEC KEY STATE1
2279 AESDEC KEY STATE2
2280 AESDEC KEY STATE3
2281 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002282 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002283 AESDECLAST KEY STATE1 # last round
2284 AESDECLAST KEY STATE2
2285 AESDECLAST KEY STATE3
2286 AESDECLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002287 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002288ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002289
2290/*
2291 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2292 * size_t len)
2293 */
2294ENTRY(aesni_ecb_enc)
Mathias Krause0d258ef2010-11-27 16:34:46 +08002295#ifndef __x86_64__
2296 pushl LEN
2297 pushl KEYP
2298 pushl KLEN
2299 movl 16(%esp), KEYP
2300 movl 20(%esp), OUTP
2301 movl 24(%esp), INP
2302 movl 28(%esp), LEN
2303#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002304 test LEN, LEN # check length
2305 jz .Lecb_enc_ret
2306 mov 480(KEYP), KLEN
2307 cmp $16, LEN
2308 jb .Lecb_enc_ret
2309 cmp $64, LEN
2310 jb .Lecb_enc_loop1
2311.align 4
2312.Lecb_enc_loop4:
2313 movups (INP), STATE1
2314 movups 0x10(INP), STATE2
2315 movups 0x20(INP), STATE3
2316 movups 0x30(INP), STATE4
2317 call _aesni_enc4
2318 movups STATE1, (OUTP)
2319 movups STATE2, 0x10(OUTP)
2320 movups STATE3, 0x20(OUTP)
2321 movups STATE4, 0x30(OUTP)
2322 sub $64, LEN
2323 add $64, INP
2324 add $64, OUTP
2325 cmp $64, LEN
2326 jge .Lecb_enc_loop4
2327 cmp $16, LEN
2328 jb .Lecb_enc_ret
2329.align 4
2330.Lecb_enc_loop1:
2331 movups (INP), STATE1
2332 call _aesni_enc1
2333 movups STATE1, (OUTP)
2334 sub $16, LEN
2335 add $16, INP
2336 add $16, OUTP
2337 cmp $16, LEN
2338 jge .Lecb_enc_loop1
2339.Lecb_enc_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002340#ifndef __x86_64__
2341 popl KLEN
2342 popl KEYP
2343 popl LEN
2344#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002345 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002346ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002347
2348/*
2349 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2350 * size_t len);
2351 */
2352ENTRY(aesni_ecb_dec)
Mathias Krause0d258ef2010-11-27 16:34:46 +08002353#ifndef __x86_64__
2354 pushl LEN
2355 pushl KEYP
2356 pushl KLEN
2357 movl 16(%esp), KEYP
2358 movl 20(%esp), OUTP
2359 movl 24(%esp), INP
2360 movl 28(%esp), LEN
2361#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002362 test LEN, LEN
2363 jz .Lecb_dec_ret
2364 mov 480(KEYP), KLEN
2365 add $240, KEYP
2366 cmp $16, LEN
2367 jb .Lecb_dec_ret
2368 cmp $64, LEN
2369 jb .Lecb_dec_loop1
2370.align 4
2371.Lecb_dec_loop4:
2372 movups (INP), STATE1
2373 movups 0x10(INP), STATE2
2374 movups 0x20(INP), STATE3
2375 movups 0x30(INP), STATE4
2376 call _aesni_dec4
2377 movups STATE1, (OUTP)
2378 movups STATE2, 0x10(OUTP)
2379 movups STATE3, 0x20(OUTP)
2380 movups STATE4, 0x30(OUTP)
2381 sub $64, LEN
2382 add $64, INP
2383 add $64, OUTP
2384 cmp $64, LEN
2385 jge .Lecb_dec_loop4
2386 cmp $16, LEN
2387 jb .Lecb_dec_ret
2388.align 4
2389.Lecb_dec_loop1:
2390 movups (INP), STATE1
2391 call _aesni_dec1
2392 movups STATE1, (OUTP)
2393 sub $16, LEN
2394 add $16, INP
2395 add $16, OUTP
2396 cmp $16, LEN
2397 jge .Lecb_dec_loop1
2398.Lecb_dec_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002399#ifndef __x86_64__
2400 popl KLEN
2401 popl KEYP
2402 popl LEN
2403#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002404 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002405ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002406
2407/*
2408 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2409 * size_t len, u8 *iv)
2410 */
2411ENTRY(aesni_cbc_enc)
Mathias Krause0d258ef2010-11-27 16:34:46 +08002412#ifndef __x86_64__
2413 pushl IVP
2414 pushl LEN
2415 pushl KEYP
2416 pushl KLEN
2417 movl 20(%esp), KEYP
2418 movl 24(%esp), OUTP
2419 movl 28(%esp), INP
2420 movl 32(%esp), LEN
2421 movl 36(%esp), IVP
2422#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002423 cmp $16, LEN
2424 jb .Lcbc_enc_ret
2425 mov 480(KEYP), KLEN
2426 movups (IVP), STATE # load iv as initial state
2427.align 4
2428.Lcbc_enc_loop:
2429 movups (INP), IN # load input
2430 pxor IN, STATE
2431 call _aesni_enc1
2432 movups STATE, (OUTP) # store output
2433 sub $16, LEN
2434 add $16, INP
2435 add $16, OUTP
2436 cmp $16, LEN
2437 jge .Lcbc_enc_loop
2438 movups STATE, (IVP)
2439.Lcbc_enc_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002440#ifndef __x86_64__
2441 popl KLEN
2442 popl KEYP
2443 popl LEN
2444 popl IVP
2445#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002446 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002447ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002448
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-mode decryption: each output block is AES-decrypt(ciphertext)
 * XORed with the previous ciphertext block (the IV for the first one).
 * Only whole 16-byte blocks are processed; a trailing partial block is
 * left untouched.  On return *iv holds the last ciphertext block so
 * consecutive calls can be chained.
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	/* i386 cdecl: save clobbered regs, load args from the stack */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN		# less than one block: nothing to do
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN	# key length field of struct crypto_aes_ctx
	add $240, KEYP		# skip the 240-byte encryption schedule;
				# KEYP now points at the decryption round keys
	movups (IVP), IV	# iv may be unaligned
	cmp $64, LEN
	jb .Lcbc_dec_loop1	# fewer than 4 blocks: scalar tail loop
.align 4
.Lcbc_dec_loop4:
	/*
	 * Decrypt 4 blocks per iteration.  The ciphertext blocks are
	 * kept around: each one is the XOR chain value for the block
	 * that follows it.
	 */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* i386 has only 8 XMM regs: reuse IN1/IN2 for blocks 2 and 3 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# plaintext0 ^= previous chain value
#ifdef __x86_64__
	pxor IN1, STATE2	# plaintext1 ^= ciphertext0
	pxor IN2, STATE3	# plaintext2 ^= ciphertext1
	pxor IN3, STATE4	# plaintext3 ^= ciphertext2
	movaps IN4, IV		# ciphertext3 chains into the next group
#else
	pxor IN1, STATE4	# IN1 currently holds ciphertext2
	movaps IN2, IV		# IN2 holds ciphertext3 = next chain value
	movups (INP), IN1	# reload ciphertext0/1 (regs were reused)
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	/* 1..3 remaining blocks, one per iteration */
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext chains into the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)	# write back the chaining value
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002539
#ifdef __x86_64__
.align 16
.Lbswap_mask:
	/* PSHUFB control mask: reverses byte order (big <-> little endian) */
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2544
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# byte-swap: big-endian IV -> LE counter
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in the low qword
	MOVQ_R64_XMM CTR TCTR_LOW	# mirror CTR's low qword in a GPR
					# so _aesni_inc can detect carry
	ret
ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002566
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR		# 64-bit add on CTR's low qword only
	add $1, TCTR_LOW	# mirror the add in a GPR to see the carry
	jnc .Linc_low		# no carry out of the low qword: done
	pslldq $8, INC		# shift the 1 into the high qword
	paddq INC, CTR		# propagate the carry into the high qword
	psrldq $8, INC		# restore INC == 1 in the low qword
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# convert back to a big-endian IV
	ret
ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002595
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode: encrypt successive big-endian counter blocks derived from
 * *iv and XOR the keystream into the input.  Whole 16-byte blocks only;
 * the incremented counter is stored back to *iv on return.
 * x86_64 only (inside #ifdef __x86_64__).
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret	# less than one block: nothing to do
	mov 480(KEYP), KLEN	# key length from struct crypto_aes_ctx
	movups (IVP), IV	# counter block, may be unaligned
	call _aesni_inc_init	# set up CTR/TCTR_LOW/INC/BSWAP_MASK
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* four consecutive counter values, encrypted as one 4-block batch */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1	# ciphertext = keystream ^ plaintext
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	/* 1..3 remaining blocks, one at a time */
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)	# store the next counter value for chaining
.Lctr_enc_just_ret:
	ret
ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002656
/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 *	(multiplication by the polynomial x in little-endian block
 *	order: shift the 128-bit value left by one; the psrad $31
 *	broadcast of each qword's sign bit, masked with 0x87/0x01,
 *	folds the carries back in — 0x87 being the XTS reduction
 *	polynomial, 0x01 the low->high qword carry)
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2674
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * Process 8 consecutive XTS blocks (128 bytes).  The per-block tweaks
 * T, T*x, T*x^2, ... are derived from *iv; each block is computed as
 * E(P ^ T) ^ T (D(...) for decryption).  While a 4-block AES batch is
 * in flight its tweaks are parked in the output buffer, then read back
 * for the final XOR.  The tweak for the next 8-block group is written
 * to *iv.  x86_64 only.
 */
ENTRY(aesni_xts_crypt8)
	cmpb $0, %cl		# enc flag (4th arg) selects the direction
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx	# enc == 0: key offset 240 (decryption keys)
	cmoveq %rax, %r11	#           and use _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV	# initial tweak T

	mov 480(KEYP), KLEN	# key length from struct crypto_aes_ctx
	addq %rcx, KEYP		# +0 for encrypt, +240 for decrypt

	/* blocks 0-3: XOR tweak into input, park each tweak in dst */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	call *%r11		# _aesni_enc4 or _aesni_dec4
				# NOTE(review): plain indirect call; kernels
				# with retpoline would need CALL_NOSPEC here
				# - confirm against the target kernel config

	/* finish blocks 0-3, interleaved with the setup of blocks 4-7 */
	movdqu 0x00(OUTP), INC	# retrieve parked tweak
	pxor INC, STATE1	# final XOR: ciphertext0 ^= T
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)	# tweak for the caller's next 8-block group

	call *%r11		# second 4-block batch (blocks 4-7)

	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	ret
ENDPROC(aesni_xts_crypt8)
2782
Mathias Krause0d258ef2010-11-27 16:34:46 +08002783#endif