blob: d528fde219d2d2bc189eacdd8ff2a38beb4438d4 [file] [log] [blame]
Huang Ying54b6a1b2009-01-18 16:28:34 +11001/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040012 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
Mathias Krause0d258ef2010-11-27 16:34:46 +080023 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
Huang Ying54b6a1b2009-01-18 16:28:34 +110026 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080033#include <asm/inst.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110034
Mathias Krause559ad0f2010-11-29 08:35:39 +080035#ifdef __x86_64__
.data
# GHASH reduction polynomial x^128 + x^127 + x^126 + x^121 + 1, bit-reflected.
POLY:   .octa 0xC2000000000000000000000000000001
# Constant used when reducing HashKey<<1 (mod poly) in the dec/enc entry code.
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F
# (the tail-block code addresses ALL_F relative to SHIFT_MASK:
#  movdqu ALL_F-SHIFT_MASK(%r12), ... — see _zero_cipher_left_*)

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F  # pshufb mask: byte-reverse a 16-byte block
MASK1:      .octa 0x0000000000000000ffffffffffffffff  # select low 64 bits
MASK2:      .octa 0xffffffffffffffff0000000000000000  # select high 64 bits
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100  # identity pshufb mask; indexed to shift partial blocks
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff  # all-ones; masked via SHIFT_MASK offset math
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001  # counter increment for Y_i (big-endian domain)
# NOTE(review): value has 31 hex digits (high nibble zero) — matches upstream; confirm intended.
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
# Tags used to select decrypt/encrypt variants when expanding the shared macros.
dec:        .octa 0x1
enc:        .octa 0x2
55
Huang Ying54b6a1b2009-01-18 16:28:34 +110056.text
57
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040058
# Stack layout for the GCM entry points: the prologue pushes %r12,%r13,%r14
# (3 * 8 bytes), saves %rsp in %r14, then carves out VARIABLE_OFFSET bytes
# below for the precomputed HashKey powers, addressed relative to %rsp.
#define STACK_OFFSET    8*3
#define HashKey         16*0    // store HashKey <<1 mod poly here
#define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
                                // bits of HashKey <<1 mod poly here
                                //(for Karatsuba purposes)
#define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8    // total scratch area reserved on the stack

# System V AMD64 argument registers (args 1-6), plus stack-passed args 7-10
# reached through %r14, which holds the pre-adjustment %rsp saved in the
# prologue (so STACK_OFFSET skips the three pushed callee-saved registers,
# +8 skips the return address).
#define arg1    rdi
#define arg2    rsi
#define arg3    rdx
#define arg4    rcx
#define arg5    r8
#define arg6    r9
#define arg7    STACK_OFFSET+8(%r14)
#define arg8    STACK_OFFSET+16(%r14)
#define arg9    STACK_OFFSET+24(%r14)
#define arg10   STACK_OFFSET+32(%r14)
Mathias Krause559ad0f2010-11-29 08:35:39 +080088#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040089
90
# Register aliases for the plain AES (non-GCM) routines in this file.
# STATE1..STATE4 hold up to four AES blocks processed in parallel;
# IN1..IN4 are the matching input blocks.
#define STATE1  %xmm0
#define STATE2  %xmm4
#define STATE3  %xmm5
#define STATE4  %xmm6
#define STATE   STATE1
#define IN1     %xmm1
#define IN2     %xmm7
#define IN3     %xmm8
#define IN4     %xmm9
#define IN      IN1
#define KEY     %xmm2
#define IV      %xmm3

# CTR-mode helpers (64-bit only: %xmm10-%xmm12 don't exist on i386).
#define BSWAP_MASK %xmm10
#define CTR     %xmm11
#define INC     %xmm12

#ifdef __x86_64__
# 64-bit general-purpose aliases (System V argument registers).
#define AREG    %rax
#define KEYP    %rdi
#define OUTP    %rsi
#define UKEYP   OUTP
#define INP     %rdx
#define LEN     %rcx
#define IVP     %r8
#define KLEN    %r9d
#define T1      %r10
#define TKEYP   T1
#define T2      %r11
#define TCTR_LOW T2
#else
# 32-bit equivalents; note OUTP shares AREG and TKEYP shares T1,
# so those pairs must never be live at the same time.
#define AREG    %eax
#define KEYP    %edi
#define OUTP    AREG
#define UKEYP   OUTP
#define INP     %edx
#define LEN     %esi
#define IVP     %ebp
#define KLEN    %ebx
#define T1      %ecx
#define TKEYP   T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100133
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400134
Mathias Krause559ad0f2010-11-29 08:35:39 +0800135#ifdef __x86_64__
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
# GHASH_MUL: one GF(2^128) multiply, GH = GH * HK mod poly, using Karatsuba
# (3 PCLMULQDQs) followed by the two-phase shift-based reduction.
# In:  GH = bit-reflected operand A; HK = HashKey<<1 mod poly (operand B)
# Out: GH = A*B*x mod poly
# Clobbers: TMP1-TMP5 (all xmm)
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        movdqa    \GH, \TMP1
        pshufd    $78, \GH, \TMP2       # swap qwords of GH
        pshufd    $78, \HK, \TMP3       # swap qwords of HK
        pxor      \GH, \TMP2            # TMP2 = a1+a0
        pxor      \HK, \TMP3            # TMP3 = b1+b0
        PCLMULQDQ 0x11, \HK, \TMP1      # TMP1 = a1*b1
        PCLMULQDQ 0x00, \HK, \GH        # GH = a0*b0
        PCLMULQDQ 0x00, \TMP3, \TMP2    # TMP2 = (a0+a1)*(b1+b0)
        pxor      \GH, \TMP2
        pxor      \TMP1, \TMP2          # TMP2 = middle Karatsuba term
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \GH
        pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

        movdqa    \GH, \TMP2
        movdqa    \GH, \TMP3
        movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        pslld     $31, \TMP2            # packed left shift <<31
        pslld     $30, \TMP3            # packed left shift <<30
        pslld     $25, \TMP4            # packed left shift <<25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift TMP5 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
        pxor      \TMP2, \GH

        # second phase of the reduction

        movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        movdqa    \GH,\TMP3
        movdqa    \GH,\TMP4
        psrld     $1,\TMP2              # packed right shift >>1
        psrld     $2,\TMP3              # packed right shift >>2
        psrld     $7,\TMP4              # packed right shift >>7
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \GH
        pxor      \TMP1, \GH            # result is in GH
.endm
195
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
205*/
206
# INITIAL_BLOCKS:
# if a = number of total plaintext bytes, b = floor(a/16), then
# num_initial_blocks = b mod 4. Reads the AAD, encrypts/decrypts the initial
# num_initial_blocks blocks, GHASHes the ciphertext, and (when >= 64 bytes
# remain in %r13) precomputes HashKey^2..^4 and their Karatsuba halves in
# parallel with encrypting the first 4 counter blocks.
# %r10, %r11, %r12, %rax, %xmm5-%xmm9 are clobbered;
# arg1, %arg2, %arg3, %r14 are used as pointers only, not modified.
# \i / \i_seq select which xmm registers carry the initial blocks;
# \operation is the literal token `enc` or `dec`.
.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation

        mov     arg7, %r10              # %r10 = AAD
        mov     arg8, %r12              # %r12 = aadLen
        mov     %r12, %r11
        pxor    %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
        # shift 4 bytes of AAD at a time into the top of %xmm\i
        movd    (%r10), \TMP1
        pslldq  $12, \TMP1
        psrldq  $4, %xmm\i
        pxor    \TMP1, %xmm\i
        add     $4, %r10
        sub     $4, %r12
        jne     _get_AAD_loop\num_initial_blocks\operation
        cmp     $16, %r11
        je      _get_AAD_loop2_done\num_initial_blocks\operation
        mov     $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
        # right-align AAD shorter than 16 bytes (zero-pad the top)
        psrldq  $4, %xmm\i
        sub     $4, %r12
        cmp     %r11, %r12
        jne     _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
        pshufb  SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
        xor     %r11, %r11              # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks

        mov     %arg5, %rax             # %rax = *Y0
        movdqu  (%rax), \XMM0           # XMM0 = Y0
        pshufb  SHUF_MASK(%rip), \XMM0
.if \i_seq != 0
.irpc index, \i_seq
        paddd   ONE(%rip), \XMM0        # INCR Y0
        movdqa  \XMM0, %xmm\index
        pshufb  SHUF_MASK(%rip), %xmm\index     # perform a 16 byte swap
.endr
.irpc index, \i_seq
        pxor    16*0(%arg1), %xmm\index # whitening: XOR with round key 0
.endr
.irpc index, \i_seq
        # NOTE(review): 0x10(%rdi) — sibling rounds use %arg1, which is
        # #defined to rdi, so this is the same register; %arg1 would be
        # more consistent.
        movaps  0x10(%rdi), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 1
.endr
.irpc index, \i_seq
        movaps  0x20(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 2
.endr
.irpc index, \i_seq
        movaps  0x30(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 3
.endr
.irpc index, \i_seq
        movaps  0x40(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 4
.endr
.irpc index, \i_seq
        movaps  0x50(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 5
.endr
.irpc index, \i_seq
        movaps  0x60(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 6
.endr
.irpc index, \i_seq
        movaps  0x70(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 7
.endr
.irpc index, \i_seq
        movaps  0x80(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 8
.endr
.irpc index, \i_seq
        movaps  0x90(%arg1), \TMP1
        AESENC  \TMP1, %xmm\index       # Round 9
.endr
.irpc index, \i_seq
        movaps  0xa0(%arg1), \TMP1
        AESENCLAST \TMP1, %xmm\index    # Round 10
.endr
.irpc index, \i_seq
        movdqu  (%arg3 , %r11, 1), \TMP1
        pxor    \TMP1, %xmm\index
        movdqu  %xmm\index, (%arg2 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add     $16, %r11
.if \operation == dec
        movdqa  \TMP1, %xmm\index       # GHASH input is the ciphertext we read
.endif
        pshufb  SHUF_MASK(%rip), %xmm\index
        # prepare plaintext/ciphertext for GHASH computation
.endr
.endif
        GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks

        # chain the GHASH of the initial blocks; \i selects how many were done
.if \i == 5
        pxor    %xmm5, %xmm6
        GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor    %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor    %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor    %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor    %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor    %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
        cmp     $64, %r13
        jl      _initial_blocks_done\num_initial_blocks\operation
        # no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
        paddd   ONE(%rip), \XMM0        # INCR Y0
        movdqa  \XMM0, \XMM1
        pshufb  SHUF_MASK(%rip), \XMM1  # perform a 16 byte swap
        paddd   ONE(%rip), \XMM0        # INCR Y0
        movdqa  \XMM0, \XMM2
        pshufb  SHUF_MASK(%rip), \XMM2  # perform a 16 byte swap
        paddd   ONE(%rip), \XMM0        # INCR Y0
        movdqa  \XMM0, \XMM3
        pshufb  SHUF_MASK(%rip), \XMM3  # perform a 16 byte swap
        paddd   ONE(%rip), \XMM0        # INCR Y0
        movdqa  \XMM0, \XMM4
        pshufb  SHUF_MASK(%rip), \XMM4  # perform a 16 byte swap
        pxor    16*0(%arg1), \XMM1      # round-key-0 whitening of all 4 blocks
        pxor    16*0(%arg1), \XMM2
        pxor    16*0(%arg1), \XMM3
        pxor    16*0(%arg1), \XMM4
        movdqa  \TMP3, \TMP5
        pshufd  $78, \TMP3, \TMP1
        pxor    \TMP3, \TMP1
        movdqa  \TMP1, HashKey_k(%rsp)
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
        movdqa  \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd  $78, \TMP5, \TMP1
        pxor    \TMP5, \TMP1
        movdqa  \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps  0x10*\index(%arg1), \TMP1
        AESENC  \TMP1, \XMM1
        AESENC  \TMP1, \XMM2
        AESENC  \TMP1, \XMM3
        AESENC  \TMP1, \XMM4
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
        movdqa  \TMP5, HashKey_3(%rsp)
        pshufd  $78, \TMP5, \TMP1
        pxor    \TMP5, \TMP1
        movdqa  \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps  0x10*\index(%arg1), \TMP1
        AESENC  \TMP1, \XMM1
        AESENC  \TMP1, \XMM2
        AESENC  \TMP1, \XMM3
        AESENC  \TMP1, \XMM4
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
        movdqa  \TMP5, HashKey_4(%rsp)
        pshufd  $78, \TMP5, \TMP1
        pxor    \TMP5, \TMP1
        movdqa  \TMP1, HashKey_4_k(%rsp)
        movaps  0xa0(%arg1), \TMP2
        AESENCLAST \TMP2, \XMM1         # Round 10 for all 4 blocks
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu  16*0(%arg3 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM1
.if \operation == dec
        movdqu  \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqa  \TMP1, \XMM1            # keep ciphertext for GHASH
.endif
        movdqu  16*1(%arg3 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM2
.if \operation == dec
        movdqu  \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqa  \TMP1, \XMM2
.endif
        movdqu  16*2(%arg3 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM3
.if \operation == dec
        movdqu  \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqa  \TMP1, \XMM3
.endif
        movdqu  16*3(%arg3 , %r11 , 1), \TMP1
        pxor    \TMP1, \XMM4
.if \operation == dec
        movdqu  \XMM4, 16*3(%arg2 , %r11 , 1)
        movdqa  \TMP1, \XMM4
.else
        movdqu  \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqu  \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqu  \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqu  \XMM4, 16*3(%arg2 , %r11 , 1)
.endif
        add     $64, %r11
        pshufb  SHUF_MASK(%rip), \XMM1  # perform a 16 byte swap
        pxor    \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
        pshufb  SHUF_MASK(%rip), \XMM2  # perform a 16 byte swap
        pshufb  SHUF_MASK(%rip), \XMM3  # perform a 16 byte swap
        pshufb  SHUF_MASK(%rip), \XMM4  # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
.endm
424
425/*
426* encrypt 4 blocks at a time
427* ghash the 4 previously encrypted ciphertext blocks
428* arg1, %arg2, %arg3 are used as pointers only, not modified
429* %r11 is the data offset value
430*/
# GHASH_4_ENCRYPT_4_PARALLEL:
# encrypt 4 blocks at a time while GHASHing the 4 previously
# encrypted ciphertext blocks (held in XMM1-XMM4 on entry), interleaving
# AESENC rounds with the PCLMULQDQ Karatsuba multiplies against
# HashKey^4..HashKey (highest power multiplies the oldest block).
# arg1, %arg2, %arg3 are used as pointers only, not modified;
# %r11 is the data offset value. New GHASH state is folded into XMM1.
.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa    \XMM1, \XMM5          # stash the 4 previous ciphertext blocks
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        # multiply XMM5 (oldest block) * HashKey_4 using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4    # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM4
        pshufb    SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5    # XMM5 = a0*b0
        pshufb    SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
        pxor      (%arg1), \XMM1        # round-key-0 whitening
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqa    HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6    # TMP6 = (a1+a0)*(b1+b0)
        movaps    0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movaps    0x20(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 2
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1 * b1
        movaps    0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 3
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6    # XMM6 = a0*b0
        movaps    0x40(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 4
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 5
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqa    HashKey_2(%rsp ), \TMP5

        # Multiply XMM7 * HashKey_2 using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x60(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 6
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7    # XMM7 = a0*b0
        movaps    0x70(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 7
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 8
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 9
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8    # XMM8 = a0*b0
        movaps    0xa0(%arg1), \TMP3
        AESENCLAST \TMP3, \XMM1         # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa    HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM1          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
        movdqa    \TMP3, \XMM1          # next GHASH input is the ciphertext
.endif
        movdqu    16(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM2          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM2, 16(%arg2,%r11,1)       # Write to plaintext buffer
        movdqa    \TMP3, \XMM2
.endif
        movdqu    32(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM3          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM3, 32(%arg2,%r11,1)       # Write to plaintext buffer
        movdqa    \TMP3, \XMM3
.endif
        movdqu    48(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM4          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM4, 48(%arg2,%r11,1)       # Write to plaintext buffer
        movdqa    \TMP3, \XMM4
.else
        movdqu    \XMM1, (%arg2,%r11,1)         # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg2,%r11,1)       # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg2,%r11,1)       # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg2,%r11,1)       # Write to the ciphertext buffer
.endif
        pshufb    SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1          # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2            # packed left shift << 31
        pslld     $30, \TMP3            # packed left shift << 30
        pslld     $25, \TMP4            # packed left shift << 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift T5 1 DW
        pslldq    $12, \TMP2            # left shift T2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2           # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2             # packed right shift >>1
        psrld     $2, \TMP3             # packed right shift >>2
        psrld     $7, \TMP4             # packed right shift >>7
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5          # result is in XMM5

        pxor      \XMM5, \XMM1          # fold new GHASH state into XMM1
.endm
631
632/* GHASH the last 4 ciphertext blocks. */
# GHASH_LAST_4: GHASH the last 4 ciphertext blocks (XMM1-XMM4) against
# HashKey^4..HashKey with Karatsuba, accumulate, then reduce.
# Reduced result is left in XMMDst; TMP1-TMP7 and XMM1-XMM4 are clobbered.
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply XMM1 * HashKey_4 (using Karatsuba)

        movdqa    \XMM1, \TMP6
        pshufd    $78, \XMM1, \TMP2
        pxor      \XMM1, \TMP2
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP6    # TMP6 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM1    # XMM1 = a0*b0
        movdqa    HashKey_4_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqa    \XMM1, \XMMDst
        movdqa    \TMP2, \XMM1          # result in TMP6, XMMDst, XMM1

        # Multiply XMM2 * HashKey_3 (using Karatsuba)

        movdqa    \XMM2, \TMP1
        pshufd    $78, \XMM2, \TMP2
        pxor      \XMM2, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM2    # XMM2 = a0*b0
        movdqa    HashKey_3_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM2, \XMMDst
        pxor      \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM3 * HashKey_2 (using Karatsuba)

        movdqa    \XMM3, \TMP1
        pshufd    $78, \XMM3, \TMP2
        pxor      \XMM3, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM3    # XMM3 = a0*b0
        movdqa    HashKey_2_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM3, \XMMDst
        pxor      \TMP2, \XMM1          # results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM4 * HashKey (using Karatsuba)
        movdqa    \XMM4, \TMP1
        pshufd    $78, \XMM4, \TMP2
        pxor      \XMM4, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM4    # XMM4 = a0*b0
        movdqa    HashKey_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM4, \XMMDst
        pxor      \XMM1, \TMP2
        pxor      \TMP6, \TMP2
        pxor      \XMMDst, \TMP2
        # middle section of the temp results combined as in karatsuba algorithm
        movdqa    \TMP2, \TMP4
        pslldq    $8, \TMP4             # left shift TMP4 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP4, \XMMDst
        pxor      \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
        # first phase of the reduction
        movdqa    \XMMDst, \TMP2
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld     $31, \TMP2            # packed left shifting << 31
        pslld     $30, \TMP3            # packed left shifting << 30
        pslld     $25, \TMP4            # packed left shifting << 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP7
        psrldq    $4, \TMP7             # right shift TMP7 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
        pxor      \TMP2, \XMMDst

        # second phase of the reduction
        movdqa    \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
        psrld     $1, \TMP2             # packed right shift >> 1
        psrld     $2, \TMP3             # packed right shift >> 2
        psrld     $7, \TMP4             # packed right shift >> 7
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        pxor      \TMP7, \TMP2
        pxor      \TMP2, \XMMDst
        pxor      \TMP6, \XMMDst        # reduced result is in XMMDst
.endm
728
729/* Encryption of a single block done*/
/* Encryption of a single block done */
# ENCRYPT_SINGLE_BLOCK: AES-encrypt the block in XMM0 in place using the
# expanded key schedule at (%arg1). TMP1 is clobbered.
# NOTE: performs exactly 10 rounds (round keys 0x00-0xa0), i.e. this path
# hard-codes AES-128; it does not consult a key-length field.
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

        pxor       (%arg1), \XMM0       # whitening: XOR round key 0
        movaps     16(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     32(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     48(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     64(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     80(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     96(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     112(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     128(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     144(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     160(%arg1), \TMP1
        AESENCLAST \TMP1, \XMM0         # last (10th) round
.endm
754
755
756/*****************************************************************************
757* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
758* u8 *out, // Plaintext output. Encrypt in-place is allowed.
759* const u8 *in, // Ciphertext input
760* u64 plaintext_len, // Length of data in bytes for decryption.
761* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
762* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
763* // concatenated with 0x00000001. 16-byte aligned pointer.
764* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
765* const u8 *aad, // Additional Authentication Data (AAD)
766* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
767* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
768* // given authentication tag and only return the plaintext if they match.
769* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
770* // (most likely), 12 or 8.
771*
772* Assumptions:
773*
774* keys:
775* keys are pre-expanded and aligned to 16 bytes. we are using the first
776* set of 11 keys in the data structure void *aes_ctx
777*
778* iv:
779* 0 1 2 3
780* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
781* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
782* | Salt (From the SA) |
783* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
784* | Initialization Vector |
785* | (This is the sequence number from IPSec header) |
786* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
787* | 0x1 |
788* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
789*
790*
791*
792* AAD:
793* AAD padded to 128 bits with 0
794* for example, assume AAD is a u32 vector
795*
796* if AAD is 8 bytes:
797* AAD[3] = {A0, A1};
798* padded AAD in xmm register = {A1 A0 0 0}
799*
800* 0 1 2 3
801* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
802* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
803* | SPI (A1) |
804* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
805* | 32-bit Sequence Number (A0) |
806* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
807* | 0x0 |
808* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
809*
810* AAD Format with 32-bit Sequence Number
811*
812* if AAD is 12 bytes:
813* AAD[3] = {A0, A1, A2};
814* padded AAD in xmm register = {A2 A1 A0 0}
815*
816* 0 1 2 3
817* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
818* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
819* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
820* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
821* | SPI (A2) |
822* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
823* | 64-bit Extended Sequence Number {A1,A0} |
824* | |
825* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
826* | 0x0 |
827* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
828*
829* AAD Format with 64-bit Extended Sequence Number
830*
831* aadLen:
832* from the definition of the spec, aadLen can only be 8 or 12 bytes.
833* The code supports 16 too but for other sizes, the code will fail.
834*
835* TLen:
836* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
837* For other sizes, the code will fail.
838*
839* poly = x^128 + x^127 + x^126 + x^121 + 1
840*
841*****************************************************************************/
842
# void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in, u64 plaintext_len,
#                    u8 *iv, u8 *hash_subkey, const u8 *aad, u64 aad_len,
#                    u8 *auth_tag, u64 auth_tag_len)
# See the block comment above for the full parameter description.
# Callee-saved %r12-%r14 are preserved; %r14 keeps the original %rsp so the
# stack can be 64-byte aligned for the HashKey scratch area.
ENTRY(aesni_gcm_dec)
        push    %r12
        push    %r13
        push    %r14
        mov     %rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp              # align rsp to 64 bytes
        mov     %arg6, %r12
        movdqu  (%r12), %xmm13          # %xmm13 = HashKey
        pshufb  SHUF_MASK(%rip), %xmm13

# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

        movdqa  %xmm13, %xmm2
        psllq   $1, %xmm13
        psrlq   $63, %xmm2
        movdqa  %xmm2, %xmm1
        pslldq  $8, %xmm2
        psrldq  $8, %xmm1
        por     %xmm2, %xmm13

        # Reduction

        pshufd  $0x24, %xmm1, %xmm2
        pcmpeqd TWOONE(%rip), %xmm2
        pand    POLY(%rip), %xmm2
        pxor    %xmm2, %xmm13           # %xmm13 holds the HashKey<<1 (mod poly)


        # Decrypt first few blocks

        movdqa  %xmm13, HashKey(%rsp)   # store HashKey<<1 (mod poly)
        mov     %arg4, %r13             # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13              # %r13 = %r13 - (%r13 mod 16)
        mov     %r13, %r12
        and     $(3<<4), %r12           # %r12 = 16 * (num_blocks mod 4)
        jz      _initial_num_blocks_is_0_decrypt
        cmp     $(2<<4), %r12
        jb      _initial_num_blocks_is_1_decrypt
        je      _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
        INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
        sub     $48, %r13
        jmp     _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
        INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
        sub     $32, %r13
        jmp     _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
        INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
        sub     $16, %r13
        jmp     _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
        INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
        cmp     $0, %r13
        je      _zero_cipher_left_decrypt
        sub     $64, %r13
        je      _four_cipher_left_decrypt
_decrypt_by_4:
        # main loop: 4 blocks per iteration, GHASH interleaved with AES rounds
        GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
        add     $64, %r11
        sub     $64, %r13
        jne     _decrypt_by_4
_four_cipher_left_decrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
        mov     %arg4, %r13
        and     $15, %r13               # %r13 = arg4 (mod 16)
        je      _multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately

        paddd   ONE(%rip), %xmm0        # increment CNT to get Yn
        pshufb  SHUF_MASK(%rip), %xmm0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1       # E(K, Yn)
        sub     $16, %r11
        add     %r13, %r11
        movdqu  (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
        movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
        pshufb  %xmm2, %xmm1            # right shift 16-%r13 bytes
        movdqa  %xmm1, %xmm2
        pxor    %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand    %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
        pand    %xmm1, %xmm2
        pshufb  SHUF_MASK(%rip),%xmm2
        pxor    %xmm2, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        sub     %r13, %r11
        add     $16, %r11

        # output %r13 bytes
        movq    %xmm0, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left_decrypt
        mov     %rax, (%arg2 , %r11, 1)
        add     $8, %r11
        psrldq  $8, %xmm0
        movq    %xmm0, %rax
        sub     $8, %r13
_less_than_8_bytes_left_decrypt:
        # byte-at-a-time tail store
        mov     %al, (%arg2, %r11, 1)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
        mov     arg8, %r12              # %r12 = aadLen (number of bytes)
        shl     $3, %r12                # convert into number of bits
        movd    %r12d, %xmm15           # len(A) in %xmm15
        shl     $3, %arg4               # len(C) in bits (*128)
        movq    %arg4, %xmm1
        pslldq  $8, %xmm15              # %xmm15 = len(A)||0x0000000000000000
        pxor    %xmm1, %xmm15           # %xmm15 = len(A)||len(C)
        pxor    %xmm15, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        pshufb  SHUF_MASK(%rip), %xmm8
        mov     %arg5, %rax             # %rax = *Y0
        movdqu  (%rax), %xmm0           # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1       # E(K, Y0)
        pxor    %xmm8, %xmm0            # tag T = GHASH XOR E(K, Y0)
_return_T_decrypt:
        mov     arg9, %r10              # %r10 = authTag
        mov     arg10, %r11             # %r11 = auth_tag_len
        cmp     $16, %r11
        je      _T_16_decrypt
        cmp     $12, %r11
        je      _T_12_decrypt
_T_8_decrypt:
        # any other length falls through to the 8-byte tag store
        movq    %xmm0, %rax
        mov     %rax, (%r10)
        jmp     _return_T_done_decrypt
_T_12_decrypt:
        movq    %xmm0, %rax
        mov     %rax, (%r10)
        psrldq  $8, %xmm0
        movd    %xmm0, %eax
        mov     %eax, 8(%r10)
        jmp     _return_T_done_decrypt
_T_16_decrypt:
        movdqu  %xmm0, (%r10)
_return_T_done_decrypt:
        mov     %r14, %rsp              # restore pre-alignment stack pointer
        pop     %r14
        pop     %r13
        pop     %r12
        ret
1008
1009
1010/*****************************************************************************
1011* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1012* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1013* const u8 *in, // Plaintext input
1014* u64 plaintext_len, // Length of data in bytes for encryption.
1015* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1016* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1017* // concatenated with 0x00000001. 16-byte aligned pointer.
1018* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1019* const u8 *aad, // Additional Authentication Data (AAD)
1020* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1021* u8 *auth_tag, // Authenticated Tag output.
1022* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1023* // 12 or 8.
1024*
1025* Assumptions:
1026*
1027* keys:
1028* keys are pre-expanded and aligned to 16 bytes. we are using the
1029* first set of 11 keys in the data structure void *aes_ctx
1030*
1031*
1032* iv:
1033* 0 1 2 3
1034* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1035* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1036* | Salt (From the SA) |
1037* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1038* | Initialization Vector |
1039* | (This is the sequence number from IPSec header) |
1040* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1041* | 0x1 |
1042* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1043*
1044*
1045*
1046* AAD:
1047* AAD padded to 128 bits with 0
1048* for example, assume AAD is a u32 vector
1049*
1050* if AAD is 8 bytes:
1051* AAD[3] = {A0, A1};
1052* padded AAD in xmm register = {A1 A0 0 0}
1053*
1054* 0 1 2 3
1055* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1056* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1057* | SPI (A1) |
1058* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1059* | 32-bit Sequence Number (A0) |
1060* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1061* | 0x0 |
1062* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1063*
1064* AAD Format with 32-bit Sequence Number
1065*
1066* if AAD is 12 bytes:
1067* AAD[3] = {A0, A1, A2};
1068* padded AAD in xmm register = {A2 A1 A0 0}
1069*
1070* 0 1 2 3
1071* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1072* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1073* | SPI (A2) |
1074* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1075* | 64-bit Extended Sequence Number {A1,A0} |
1076* | |
1077* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1078* | 0x0 |
1079* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1080*
1081* AAD Format with 64-bit Extended Sequence Number
1082*
1083* aadLen:
1084* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1085* The code supports 16 too but for other sizes, the code will fail.
1086*
1087* TLen:
1088* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1089* For other sizes, the code will fail.
1090*
1091* poly = x^128 + x^127 + x^126 + x^121 + 1
1092***************************************************************************/
# aesni_gcm_enc: AES-GCM encrypt + authenticate; C prototype and data
# layouts are documented in the comment block above.
# Register roles (as used below): %r13 = bytes of full blocks remaining,
# %r11 = running data offset, %xmm13 = HashKey<<1 (mod poly),
# %xmm8 = GHASH accumulator, %xmm0 = counter block Yn.
# %r12-%r14 are saved/restored; the stack is aligned down to 64 bytes
# for scratch storage and restored from %r14 on exit.
1093ENTRY(aesni_gcm_enc)
1094	push	%r12
1095	push	%r13
1096	push	%r14
1097	mov	%rsp, %r14
1098#
1099# states of %xmm registers %xmm6:%xmm15 not saved
1100# all %xmm registers are clobbered
1101#
1102	sub	$VARIABLE_OFFSET, %rsp
1103	and	$~63, %rsp
1104	mov	%arg6, %r12
1105	movdqu	(%r12), %xmm13
1106	pshufb	SHUF_MASK(%rip), %xmm13
1107
1108# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1109
1110	movdqa	%xmm13, %xmm2
1111	psllq	$1, %xmm13
1112	psrlq	$63, %xmm2
1113	movdqa	%xmm2, %xmm1
1114	pslldq	$8, %xmm2
1115	psrldq	$8, %xmm1
1116	por	%xmm2, %xmm13
1117
1118	# reduce HashKey<<1
1119
1120	pshufd	$0x24, %xmm1, %xmm2
1121	pcmpeqd TWOONE(%rip), %xmm2
1122	pand	POLY(%rip), %xmm2
1123	pxor	%xmm2, %xmm13
1124	movdqa	%xmm13, HashKey(%rsp)
1125	mov	%arg4, %r13	# %xmm13 holds HashKey<<1 (mod poly)
1126	and	$-16, %r13	# %r13 = length of full 16-byte blocks
1127	mov	%r13, %r12
1128
1129        # Encrypt first few blocks
1130        # (0-3 blocks, so the main loop below always works on multiples of 4)
1131
1132	and	$(3<<4), %r12
1133	jz	_initial_num_blocks_is_0_encrypt
1134	cmp	$(2<<4), %r12
1135	jb	_initial_num_blocks_is_1_encrypt
1136	je	_initial_num_blocks_is_2_encrypt
1137_initial_num_blocks_is_3_encrypt:
1138	INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1139%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1140	sub	$48, %r13
1141	jmp	_initial_blocks_encrypted
1142_initial_num_blocks_is_2_encrypt:
1143	INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1144%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1145	sub	$32, %r13
1146	jmp	_initial_blocks_encrypted
1147_initial_num_blocks_is_1_encrypt:
1148	INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1149%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1150	sub	$16, %r13
1151	jmp	_initial_blocks_encrypted
1152_initial_num_blocks_is_0_encrypt:
1153	INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1154%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1155_initial_blocks_encrypted:
1156
1157        # Main loop - Encrypt remaining blocks
1158
1159	cmp	$0, %r13
1160	je	_zero_cipher_left_encrypt
1161	sub	$64, %r13
1162	je	_four_cipher_left_encrypt
1163_encrypt_by_4_encrypt:
1164	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1165%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1166	add	$64, %r11
1167	sub	$64, %r13
1168	jne	_encrypt_by_4_encrypt
1169_four_cipher_left_encrypt:
1170	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1171%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1172_zero_cipher_left_encrypt:
1173	mov	%arg4, %r13
1174	and	$15, %r13			# %r13 = arg4 (mod 16)
1175	je	_multiple_of_16_bytes_encrypt
1176
1177         # Handle the last <16 Byte block separately
1178	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1179	pshufb SHUF_MASK(%rip), %xmm0
1180	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1181	sub $16, %r11
1182	add %r13, %r11
1183	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
1184	lea SHIFT_MASK+16(%rip), %r12
1185	sub %r13, %r12
1186	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1187	# (%r13 is the number of bytes in plaintext mod 16)
1188	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1189	pshufb	%xmm2, %xmm1            # shift right 16-r13 byte
1190	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1191	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1192	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1193	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1194
1195	pshufb	SHUF_MASK(%rip),%xmm0
1196	pxor	%xmm0, %xmm8
1197	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1198	# GHASH computation for the last <16 byte block
1199	sub	%r13, %r11
1200	add	$16, %r11
1201	pshufb	SHUF_MASK(%rip), %xmm0
1202	# shuffle xmm0 back to output as ciphertext
1203
1204        # Output %r13 bytes (byte-at-a-time tail; avoids writing past the buffer)
1205	movq	%xmm0, %rax
1206	cmp	$8, %r13
1207	jle	_less_than_8_bytes_left_encrypt
1208	mov	%rax, (%arg2 , %r11, 1)
1209	add	$8, %r11
1210	psrldq	$8, %xmm0
1211	movq	%xmm0, %rax
1212	sub	$8, %r13
1213_less_than_8_bytes_left_encrypt:
1214	mov	%al,  (%arg2, %r11, 1)
1215	add	$1, %r11
1216	shr	$8, %rax
1217	sub	$1, %r13
1218	jne	_less_than_8_bytes_left_encrypt
1219_multiple_of_16_bytes_encrypt:
1220	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1221	shl	$3, %r12
1222	movd	%r12d, %xmm15       # len(A) in %xmm15
1223	shl	$3, %arg4               # len(C) in bits (*128)
1224	movq	%arg4, %xmm1
1225	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1226	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1227	pxor	%xmm15, %xmm8
1228	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1229	# final GHASH computation
1230
1231	pshufb	SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
1232	mov	%arg5, %rax		       # %rax  = *Y0
1233	movdqu	(%rax), %xmm0                  # %xmm0 = Y0
1234	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1235	pxor	%xmm8, %xmm0
1236_return_T_encrypt:
1237	mov	arg9, %r10                     # %r10 = authTag
1238	mov	arg10, %r11                    # %r11 = auth_tag_len
1239	cmp	$16, %r11
1240	je	_T_16_encrypt
1241	cmp	$12, %r11
1242	je	_T_12_encrypt
1243_T_8_encrypt:
1244	movq	%xmm0, %rax
1245	mov	%rax, (%r10)
1246	jmp	_return_T_done_encrypt
1247_T_12_encrypt:
1248	movq	%xmm0, %rax
1249	mov	%rax, (%r10)
1250	psrldq	$8, %xmm0
1251	movd	%xmm0, %eax
1252	mov	%eax, 8(%r10)
1253	jmp	_return_T_done_encrypt
1254_T_16_encrypt:
1255	movdqu	%xmm0, (%r10)
1256_return_T_done_encrypt:
1257	mov	%r14, %rsp
1258	pop	%r14
1259	pop	%r13
1260	pop	%r12
1261	ret
Mathias Krause559ad0f2010-11-29 08:35:39 +08001261#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001262
1263
# _key_expansion_128 / _key_expansion_256a: internal helper, called from
# aesni_set_key.  Folds the previous round key in %xmm0 with the
# AESKEYGENASSIST result in %xmm1 (the shufps pair propagates the key
# words left-to-right); assumes %xmm4 == 0 on entry (see caller comment).
# Stores the new round key at (TKEYP) and advances TKEYP by 0x10.
# Clobbers %xmm0, %xmm1, %xmm4.
Huang Ying54b6a1b2009-01-18 16:28:34 +11001264_key_expansion_128:
1265_key_expansion_256a:
1266	pshufd $0b11111111, %xmm1, %xmm1
1267	shufps $0b00010000, %xmm0, %xmm4
1268	pxor %xmm4, %xmm0
1269	shufps $0b10001100, %xmm0, %xmm4
1270	pxor %xmm4, %xmm0
1271	pxor %xmm1, %xmm0
Mathias Krause0d258ef2010-11-27 16:34:46 +08001272	movaps %xmm0, (TKEYP)
1273	add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001274	ret
1275
# _key_expansion_192a: internal helper for AES-192 key expansion
# (alternates with _key_expansion_192b in aesni_set_key).  Inputs:
# %xmm0/%xmm2 = previous key material, %xmm1 = AESKEYGENASSIST result;
# %xmm4 assumed 0 on entry.  Emits 0x20 bytes of round-key material at
# (TKEYP) and advances TKEYP by 0x20; updated key halves remain in
# %xmm0/%xmm2 for the next call.  Clobbers %xmm1, %xmm3, %xmm5, %xmm6.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001276.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001277_key_expansion_192a:
1278	pshufd $0b01010101, %xmm1, %xmm1
1279	shufps $0b00010000, %xmm0, %xmm4
1280	pxor %xmm4, %xmm0
1281	shufps $0b10001100, %xmm0, %xmm4
1282	pxor %xmm4, %xmm0
1283	pxor %xmm1, %xmm0
1284
1285	movaps %xmm2, %xmm5
1286	movaps %xmm2, %xmm6
1287	pslldq $4, %xmm5
1288	pshufd $0b11111111, %xmm0, %xmm3
1289	pxor %xmm3, %xmm2
1290	pxor %xmm5, %xmm2
1291
1292	movaps %xmm0, %xmm1
1293	shufps $0b01000100, %xmm0, %xmm6
Mathias Krause0d258ef2010-11-27 16:34:46 +08001294	movaps %xmm6, (TKEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001295	shufps $0b01001110, %xmm2, %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001296	movaps %xmm1, 0x10(TKEYP)
1297	add $0x20, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001298	ret
1299
# _key_expansion_192b: internal helper for AES-192 key expansion
# (alternates with _key_expansion_192a in aesni_set_key).  Same input
# convention as _key_expansion_192a (%xmm0/%xmm2 previous key material,
# %xmm1 from AESKEYGENASSIST, %xmm4 assumed 0), but stores only 0x10
# bytes at (TKEYP) and advances TKEYP by 0x10.
# Clobbers %xmm1, %xmm3, %xmm5.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001300.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001301_key_expansion_192b:
1302	pshufd $0b01010101, %xmm1, %xmm1
1303	shufps $0b00010000, %xmm0, %xmm4
1304	pxor %xmm4, %xmm0
1305	shufps $0b10001100, %xmm0, %xmm4
1306	pxor %xmm4, %xmm0
1307	pxor %xmm1, %xmm0
1308
1309	movaps %xmm2, %xmm5
1310	pslldq $4, %xmm5
1311	pshufd $0b11111111, %xmm0, %xmm3
1312	pxor %xmm3, %xmm2
1313	pxor %xmm5, %xmm2
1314
Mathias Krause0d258ef2010-11-27 16:34:46 +08001315	movaps %xmm0, (TKEYP)
1316	add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001317	ret
1318
# _key_expansion_256b: internal helper for AES-256 key expansion
# (paired with _key_expansion_256a in aesni_set_key).  Folds %xmm2 (the
# second previous round key) with the AESKEYGENASSIST result in %xmm1;
# %xmm4 assumed 0 on entry.  Stores the new round key at (TKEYP) and
# advances TKEYP by 0x10.  Clobbers %xmm1, %xmm2, %xmm4.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001319.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001320_key_expansion_256b:
1321	pshufd $0b10101010, %xmm1, %xmm1
1322	shufps $0b00010000, %xmm2, %xmm4
1323	pxor %xmm4, %xmm2
1324	shufps $0b10001100, %xmm2, %xmm4
1325	pxor %xmm4, %xmm2
1326	pxor %xmm1, %xmm2
Mathias Krause0d258ef2010-11-27 16:34:46 +08001327	movaps %xmm2, (TKEYP)
1328	add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001329	ret
1330
1331/*
1332 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1333 * unsigned int key_len)
1334 */
# aesni_set_key: expand the user key into the context's round-key
# schedule.  Key size is dispatched on %dl (key_len): <24 -> 128-bit,
# ==24 -> 192-bit, else 256-bit.  After the encryption schedule is
# built, the .Ldec_key loop derives the decryption schedule by running
# AESIMC over the middle round keys in reverse order.  Returns 0 in
# AREG.  On 32-bit builds (#ifndef __x86_64__) arguments are loaded
# from the stack and KEYP is preserved.
1335ENTRY(aesni_set_key)
Mathias Krause0d258ef2010-11-27 16:34:46 +08001336#ifndef __x86_64__
1337	pushl KEYP
1338	movl 8(%esp), KEYP		# ctx
1339	movl 12(%esp), UKEYP		# in_key
1340	movl 16(%esp), %edx		# key_len
1341#endif
1342	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1343	movaps %xmm0, (KEYP)
1344	lea 0x10(KEYP), TKEYP	# key addr
1345	movl %edx, 480(KEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001346	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1347	cmp $24, %dl
1348	jb .Lenc_key128
1349	je .Lenc_key192
Mathias Krause0d258ef2010-11-27 16:34:46 +08001350	movups 0x10(UKEYP), %xmm2	# other user key
1351	movaps %xmm2, (TKEYP)
1352	add $0x10, TKEYP
Huang Yingb369e522009-11-23 19:54:06 +08001353	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001354	call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001355	AESKEYGENASSIST 0x1 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001356	call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001357	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001358	call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001359	AESKEYGENASSIST 0x2 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001360	call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001361	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001362	call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001363	AESKEYGENASSIST 0x4 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001364	call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001365	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001366	call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001367	AESKEYGENASSIST 0x8 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001368	call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001369	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001370	call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001371	AESKEYGENASSIST 0x10 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001372	call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001373	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001374	call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001375	AESKEYGENASSIST 0x20 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001376	call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001377	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001378	call _key_expansion_256a
1379	jmp .Ldec_key
1380.Lenc_key192:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001381	movq 0x10(UKEYP), %xmm2		# other user key
Huang Yingb369e522009-11-23 19:54:06 +08001382	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001383	call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001384	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001385	call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001386	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001387	call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001388	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001389	call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001390	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001391	call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001392	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001393	call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001394	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001395	call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001396	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001397	call _key_expansion_192b
1398	jmp .Ldec_key
1399.Lenc_key128:
Huang Yingb369e522009-11-23 19:54:06 +08001400	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001401	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001402	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001403	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001404	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001405	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001406	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001407	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001408	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001409	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001410	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001411	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001412	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001413	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001414	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001415	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001416	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
Huang Ying54b6a1b2009-01-18 16:28:34 +11001417	call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001418	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
Huang Ying54b6a1b2009-01-18 16:28:34 +11001419	call _key_expansion_128
1420.Ldec_key:
# Build the decryption schedule at offset 240: first/last round keys are
# swapped as-is, the middle keys get the InvMixColumns transform (AESIMC)
# and are written in reverse order.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001421	sub $0x10, TKEYP
1422	movaps (KEYP), %xmm0
1423	movaps (TKEYP), %xmm1
1424	movaps %xmm0, 240(TKEYP)
1425	movaps %xmm1, 240(KEYP)
1426	add $0x10, KEYP
1427	lea 240-16(TKEYP), UKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001428.align 4
1429.Ldec_key_loop:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001430	movaps (KEYP), %xmm0
Huang Yingb369e522009-11-23 19:54:06 +08001431	AESIMC %xmm0 %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001432	movaps %xmm1, (UKEYP)
1433	add $0x10, KEYP
1434	sub $0x10, UKEYP
1435	cmp TKEYP, KEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001436	jb .Ldec_key_loop
Mathias Krause0d258ef2010-11-27 16:34:46 +08001437	xor AREG, AREG		# return 0 (success)
1438#ifndef __x86_64__
1439	popl KEYP
1440#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001441	ret
1442
1443/*
1444 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1445 */
# aesni_enc: encrypt a single 16-byte block (C prototype in the comment
# above).  Loads the round count from ctx offset 480, runs the block
# through _aesni_enc1, and stores the result.  Unaligned src/dst are
# allowed (movups).  32-bit builds load args from the stack and
# preserve KEYP/KLEN.
1446ENTRY(aesni_enc)
Mathias Krause0d258ef2010-11-27 16:34:46 +08001447#ifndef __x86_64__
1448	pushl KEYP
1449	pushl KLEN
1450	movl 12(%esp), KEYP
1451	movl 16(%esp), OUTP
1452	movl 20(%esp), INP
1453#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001454	movl 480(KEYP), KLEN		# key length
1455	movups (INP), STATE		# input
1456	call _aesni_enc1
1457	movups STATE, (OUTP)	# output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001458#ifndef __x86_64__
1459	popl KLEN
1460	popl KEYP
1461#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001462	ret
1463
1464/*
1465 * _aesni_enc1: internal ABI
1466 * input:
1467 * KEYP: key struct pointer
1468 * KLEN: round count
1469 * STATE: initial state (input)
1470 * output:
1471 * STATE: finial state (output)
1472 * changed:
1473 * KEY
1474 * TKEYP (T1)
1475 */
# _aesni_enc1 (internal ABI documented in the comment above): TKEYP is
# biased past KEYP (+0x30, then +0x20/+0x20 more per key size) so that
# the final round key always sits at 0x70(TKEYP); 256-bit entry falls
# through .Lenc192 into .Lenc128, running 14/12/10 rounds respectively.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001476.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001477_aesni_enc1:
1478	movaps (KEYP), KEY		# key
1479	mov KEYP, TKEYP
1480	pxor KEY, STATE		# round 0
1481	add $0x30, TKEYP
1482	cmp $24, KLEN
1483	jb .Lenc128
1484	lea 0x20(TKEYP), TKEYP
1485	je .Lenc192
1486	add $0x20, TKEYP
1487	movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001488	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001489	movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001490	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001491.align 4
1492.Lenc192:
1493	movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001494	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001495	movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001496	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001497.align 4
1498.Lenc128:
1499	movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001500	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001501	movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001502	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001503	movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001504	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001505	movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001506	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001507	movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001508	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001509	movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001510	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001511	movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001512	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001513	movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001514	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001515	movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001516	AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001517	movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001518	AESENCLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001519	ret
1520
1521/*
1522 * _aesni_enc4: internal ABI
1523 * input:
1524 * KEYP: key struct pointer
1525 * KLEN: round count
1526 * STATE1: initial state (input)
1527 * STATE2
1528 * STATE3
1529 * STATE4
1530 * output:
1531 * STATE1: finial state (output)
1532 * STATE2
1533 * STATE3
1534 * STATE4
1535 * changed:
1536 * KEY
1537 * TKEYP (T1)
1538 */
# _aesni_enc4 (internal ABI documented in the comment above): same
# key-size dispatch and TKEYP biasing as _aesni_enc1, but runs four
# independent blocks (STATE1..STATE4) through each round back-to-back,
# interleaving the AESENC instructions to hide their latency.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001539.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001540_aesni_enc4:
1541	movaps (KEYP), KEY		# key
1542	mov KEYP, TKEYP
1543	pxor KEY, STATE1		# round 0
1544	pxor KEY, STATE2
1545	pxor KEY, STATE3
1546	pxor KEY, STATE4
1547	add $0x30, TKEYP
1548	cmp $24, KLEN
1549	jb .L4enc128
1550	lea 0x20(TKEYP), TKEYP
1551	je .L4enc192
1552	add $0x20, TKEYP
1553	movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001554	AESENC KEY STATE1
1555	AESENC KEY STATE2
1556	AESENC KEY STATE3
1557	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001558	movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001559	AESENC KEY STATE1
1560	AESENC KEY STATE2
1561	AESENC KEY STATE3
1562	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001563#.align 4
1564.L4enc192:
1565	movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001566	AESENC KEY STATE1
1567	AESENC KEY STATE2
1568	AESENC KEY STATE3
1569	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001570	movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001571	AESENC KEY STATE1
1572	AESENC KEY STATE2
1573	AESENC KEY STATE3
1574	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001575#.align 4
1576.L4enc128:
1577	movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001578	AESENC KEY STATE1
1579	AESENC KEY STATE2
1580	AESENC KEY STATE3
1581	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001582	movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001583	AESENC KEY STATE1
1584	AESENC KEY STATE2
1585	AESENC KEY STATE3
1586	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001587	movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001588	AESENC KEY STATE1
1589	AESENC KEY STATE2
1590	AESENC KEY STATE3
1591	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001592	movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001593	AESENC KEY STATE1
1594	AESENC KEY STATE2
1595	AESENC KEY STATE3
1596	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001597	movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001598	AESENC KEY STATE1
1599	AESENC KEY STATE2
1600	AESENC KEY STATE3
1601	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001602	movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001603	AESENC KEY STATE1
1604	AESENC KEY STATE2
1605	AESENC KEY STATE3
1606	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001607	movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001608	AESENC KEY STATE1
1609	AESENC KEY STATE2
1610	AESENC KEY STATE3
1611	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001612	movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001613	AESENC KEY STATE1
1614	AESENC KEY STATE2
1615	AESENC KEY STATE3
1616	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001617	movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001618	AESENC KEY STATE1
1619	AESENC KEY STATE2
1620	AESENC KEY STATE3
1621	AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001622	movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001623	AESENCLAST KEY STATE1	# last round
1624	AESENCLAST KEY STATE2
1625	AESENCLAST KEY STATE3
1626	AESENCLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001627	ret
1628
1629/*
1630 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1631 */
# aesni_dec: decrypt a single 16-byte block (C prototype in the comment
# above).  Loads the round count from ctx offset 480, advances KEYP by
# 240 to the decryption key schedule (built by aesni_set_key), runs the
# block through _aesni_dec1, and stores the result.  32-bit builds load
# args from the stack and preserve KEYP/KLEN.
1632ENTRY(aesni_dec)
Mathias Krause0d258ef2010-11-27 16:34:46 +08001633#ifndef __x86_64__
1634	pushl KEYP
1635	pushl KLEN
1636	movl 12(%esp), KEYP
1637	movl 16(%esp), OUTP
1638	movl 20(%esp), INP
1639#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001640	mov 480(KEYP), KLEN		# key length
1641	add $240, KEYP
1642	movups (INP), STATE		# input
1643	call _aesni_dec1
1644	movups STATE, (OUTP)	#output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001645#ifndef __x86_64__
1646	popl KLEN
1647	popl KEYP
1648#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001649	ret
1650
1651/*
1652 * _aesni_dec1: internal ABI
1653 * input:
1654 * KEYP: key struct pointer
1655 * KLEN: key length
1656 * STATE: initial state (input)
1657 * output:
1658 * STATE: finial state (output)
1659 * changed:
1660 * KEY
1661 * TKEYP (T1)
1662 */
# _aesni_dec1 (internal ABI documented in the comment above): mirror of
# _aesni_enc1 using AESDEC/AESDECLAST on the decryption key schedule.
# TKEYP is biased so the final round key is always at 0x70(TKEYP);
# 256-bit entry falls through .Ldec192 into .Ldec128.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001663.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001664_aesni_dec1:
1665	movaps (KEYP), KEY		# key
1666	mov KEYP, TKEYP
1667	pxor KEY, STATE		# round 0
1668	add $0x30, TKEYP
1669	cmp $24, KLEN
1670	jb .Ldec128
1671	lea 0x20(TKEYP), TKEYP
1672	je .Ldec192
1673	add $0x20, TKEYP
1674	movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001675	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001676	movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001677	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001678.align 4
1679.Ldec192:
1680	movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001681	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001682	movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001683	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001684.align 4
1685.Ldec128:
1686	movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001687	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001688	movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001689	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001690	movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001691	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001692	movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001693	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001694	movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001695	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001696	movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001697	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001698	movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001699	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001700	movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001701	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001702	movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001703	AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001704	movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001705	AESDECLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001706	ret
1707
1708/*
1709 * _aesni_dec4: internal ABI
1710 * input:
1711 * KEYP: key struct pointer
1712 * KLEN: key length
1713 * STATE1: initial state (input)
1714 * STATE2
1715 * STATE3
1716 * STATE4
1717 * output:
1718 * STATE1: finial state (output)
1719 * STATE2
1720 * STATE3
1721 * STATE4
1722 * changed:
1723 * KEY
1724 * TKEYP (T1)
1725 */
# _aesni_dec4 (internal ABI documented in the comment above): mirror of
# _aesni_enc4 using AESDEC/AESDECLAST — four blocks (STATE1..STATE4)
# per round, interleaved to hide instruction latency; same key-size
# dispatch and TKEYP biasing as _aesni_dec1.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001726.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001727_aesni_dec4:
1728	movaps (KEYP), KEY		# key
1729	mov KEYP, TKEYP
1730	pxor KEY, STATE1		# round 0
1731	pxor KEY, STATE2
1732	pxor KEY, STATE3
1733	pxor KEY, STATE4
1734	add $0x30, TKEYP
1735	cmp $24, KLEN
1736	jb .L4dec128
1737	lea 0x20(TKEYP), TKEYP
1738	je .L4dec192
1739	add $0x20, TKEYP
1740	movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001741	AESDEC KEY STATE1
1742	AESDEC KEY STATE2
1743	AESDEC KEY STATE3
1744	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001745	movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001746	AESDEC KEY STATE1
1747	AESDEC KEY STATE2
1748	AESDEC KEY STATE3
1749	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001750.align 4
1751.L4dec192:
1752	movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001753	AESDEC KEY STATE1
1754	AESDEC KEY STATE2
1755	AESDEC KEY STATE3
1756	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001757	movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001758	AESDEC KEY STATE1
1759	AESDEC KEY STATE2
1760	AESDEC KEY STATE3
1761	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001762.align 4
1763.L4dec128:
1764	movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001765	AESDEC KEY STATE1
1766	AESDEC KEY STATE2
1767	AESDEC KEY STATE3
1768	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001769	movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001770	AESDEC KEY STATE1
1771	AESDEC KEY STATE2
1772	AESDEC KEY STATE3
1773	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001774	movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001775	AESDEC KEY STATE1
1776	AESDEC KEY STATE2
1777	AESDEC KEY STATE3
1778	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001779	movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001780	AESDEC KEY STATE1
1781	AESDEC KEY STATE2
1782	AESDEC KEY STATE3
1783	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001784	movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001785	AESDEC KEY STATE1
1786	AESDEC KEY STATE2
1787	AESDEC KEY STATE3
1788	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001789	movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001790	AESDEC KEY STATE1
1791	AESDEC KEY STATE2
1792	AESDEC KEY STATE3
1793	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001794	movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001795	AESDEC KEY STATE1
1796	AESDEC KEY STATE2
1797	AESDEC KEY STATE3
1798	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001799	movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001800	AESDEC KEY STATE1
1801	AESDEC KEY STATE2
1802	AESDEC KEY STATE3
1803	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001804	movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001805	AESDEC KEY STATE1
1806	AESDEC KEY STATE2
1807	AESDEC KEY STATE3
1808	AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001809	movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001810	AESDECLAST KEY STATE1	# last round
1811	AESDECLAST KEY STATE2
1812	AESDECLAST KEY STATE3
1813	AESDECLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001814	ret
1815
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt len bytes from src to dst; len is processed in whole
 * 16-byte AES blocks (any trailing partial block is ignored).
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	# i386 passes arguments on the stack: save the registers this
	# function uses as argument homes, then load the C arguments
	# (offsets account for the three pushes + return address).
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN	# key length lives at byte 480 of the ctx
	cmp $16, LEN
	jb .Lecb_enc_ret	# less than one block: nothing to do
	cmp $64, LEN
	jb .Lecb_enc_loop1	# fewer than 4 blocks: single-block loop
.align 4
.Lecb_enc_loop4:
	# fast path: encrypt 4 blocks per iteration via _aesni_dec4's
	# encrypt counterpart
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	# tail: one block at a time
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret
1872
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt len bytes from src to dst in whole 16-byte blocks.
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	# i386: spill argument-home registers and load stack arguments
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN	# key length at ctx offset 480
	add $240, KEYP		# decryption round keys start at ctx + 240
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	# fast path: 4 blocks per iteration
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	# tail: one block at a time
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret
1930
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt len bytes from src to dst.  Each block depends on the
 * previous ciphertext block, so this is inherently serial (no 4-way
 * path).  The final ciphertext block is written back through iv so
 * the caller can chain subsequent requests.
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	# i386: spill argument-home registers and load stack arguments
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret	# less than one block: leave iv untouched
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# CBC: plaintext XOR previous ciphertext
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)	# export last ciphertext block as next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
1971
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes from src to dst.  Unlike CBC encryption,
 * decryption parallelizes: four blocks are decrypted at once and only
 * the final XOR uses the chaining values.  The last ciphertext block
 * is written back through iv for request chaining.
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	# i386: spill argument-home registers and load stack arguments
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret	# < 1 block: do not touch *iv
	mov 480(KEYP), KLEN
	add $240, KEYP		# decryption round keys at ctx + 240
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	# keep the ciphertext around: it is needed again after
	# decryption as the chaining value for the following block
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	# i386 has only 8 xmm registers, so IN1/IN2 are reused for
	# blocks 3/4; blocks 1/2 are re-read from memory below
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# block 1 chains from the incoming iv
#ifdef __x86_64__
	pxor IN1, STATE2	# blocks 2-4 chain from ciphertext 1-3
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# ciphertext 4 chains into next iteration
#else
	pxor (INP), STATE2	# re-read ciphertext 1/2 (IN1/IN2 reused)
	pxor 0x10(INP), STATE3
	pxor IN1, STATE4
	movaps IN2, IV
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	# tail: one block at a time
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE		# undo the CBC chaining XOR
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext chains the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)	# export final chaining value
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
Huang Ying12387a42010-03-10 18:28:55 +08002059
Mathias Krause0d258ef2010-11-27 16:34:46 +08002060#ifdef __x86_64__
Huang Ying12387a42010-03-10 18:28:55 +08002061.align 16
2062.Lbswap_mask:
2063 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2064
2065/*
2066 * _aesni_inc_init: internal ABI
2067 * setup registers used by _aesni_inc
2068 * input:
2069 * IV
2070 * output:
2071 * CTR: == IV, in little endian
2072 * TCTR_LOW: == lower qword of CTR
2073 * INC: == 1, in little endian
2074 * BSWAP_MASK == endian swapping mask
2075 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002076.align 4
Huang Ying12387a42010-03-10 18:28:55 +08002077_aesni_inc_init:
2078 movaps .Lbswap_mask, BSWAP_MASK
2079 movaps IV, CTR
2080 PSHUFB_XMM BSWAP_MASK CTR
2081 mov $1, TCTR_LOW
Huang Ying32cbd7d2010-03-13 16:28:42 +08002082 MOVQ_R64_XMM TCTR_LOW INC
2083 MOVQ_R64_XMM CTR TCTR_LOW
Huang Ying12387a42010-03-10 18:28:55 +08002084 ret
2085
/*
 * _aesni_inc: internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR		# bump low qword of the 128-bit counter
	add $1, TCTR_LOW	# same bump in the GPR mirror to get CF
	jnc .Linc_low		# no carry: high qword unchanged
	# low qword wrapped: propagate the carry into the high qword by
	# shifting INC's 1 up, adding it, then restoring INC
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# convert back to big endian
	ret
2113
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode encrypt/decrypt len bytes from src to dst (CTR is its own
 * inverse).  64-bit only (this block sits inside #ifdef __x86_64__),
 * so all arguments arrive in registers and no stack shuffling is
 * needed.  The incremented counter is stored back through iv.
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret	# < 1 block: leave *iv untouched
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init	# set up CTR/INC/BSWAP_MASK/TCTR_LOW
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	# generate four consecutive counter blocks, encrypt them in
	# parallel, then XOR with the input to produce the output
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	# tail: one counter block at a time
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)	# export the advanced counter
.Lctr_enc_just_ret:
	ret
Mathias Krause0d258ef2010-11-27 16:34:46 +08002173#endif