/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
28
29#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080030#include <asm/inst.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110031
.data

/*
 * 128-bit constants for the GCM code below.
 *
 * Several of these labels are used directly as memory operands of legacy
 * SSE instructions (pshufb, paddd, pand, pcmpeqd), which require a 16-byte
 * aligned address.  Every entry is exactly 16 bytes, so aligning the first
 * one keeps all of them aligned.
 */
.align 16
POLY:		.octa 0xC2000000000000000000000000000001
TWOONE:		.octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F
# (the tail handling loads 16 unaligned bytes that straddle
# SHIFT_MASK/ALL_F and ALL_F/ZERO to build per-length masks)

SHUF_MASK:	.octa 0x000102030405060708090A0B0C0D0E0F
MASK1:		.octa 0x0000000000000000ffffffffffffffff
MASK2:		.octa 0xffffffffffffffff0000000000000000
SHIFT_MASK:	.octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:		.octa 0xffffffffffffffffffffffffffffffff
ZERO:		.octa 0x00000000000000000000000000000000
ONE:		.octa 0x00000000000000000000000000000001
/*
 * NOTE(review): the F_MIN_MASK literal has only 31 hex digits, so the top
 * nibble assembles as zero.  It is not referenced in the visible part of
 * the file; value kept as-is -- TODO confirm the intended constant.
 */
F_MIN_MASK:	.octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
/* Symbols compared by the ".if \operation == dec" tests in the macros. */
dec:		.octa 0x1
enc:		.octa 0x2


.text


/*
 * STACK_OFFSET: bytes pushed by the GCM entry code (three 8-byte registers)
 * before it copies %rsp into %r14; used to locate the stack-passed arguments.
 *
 * HashKey*: byte offsets from the (re-aligned) %rsp into the scratch area
 * of VARIABLE_OFFSET bytes that the entry code reserves on the stack.
 */
#define	STACK_OFFSET	8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

/*
 * Argument aliases for the GCM entry points.
 *
 * arg1..arg6 name the integer argument registers WITHOUT the '%' prefix,
 * so they are written as %arg1 ... %arg6 in the code.
 * arg7..arg10 are complete memory operands (written bare, e.g. "mov arg7,
 * %r10"): they address the caller's stack through %r14, which the entry
 * code sets to %rsp after pushing STACK_OFFSET bytes; the extra 8 bytes
 * skip the return address.
 */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)


/*
 * Register aliases.  None of them is referenced by the GCM macros in this
 * part of the file; presumably they serve the other AES-NI routines
 * further down -- TODO confirm.
 */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3
#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define KEYP	%rdi
#define OUTP	%rsi
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2


/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* Operands:
*   GH          in/out  multiplicand; overwritten with the reduced product
*   HK          in      multiplier; only read
*   TMP1..TMP5  scratch xmm registers; clobbered
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba carry-less multiply: three 64x64 products
	movdqa \GH, \TMP1
	pshufd $78, \GH, \TMP2
	pshufd $78, \HK, \TMP3
	pxor \GH, \TMP2			# TMP2 = a1+a0
	pxor \HK, \TMP3			# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor \GH, \TMP2
	pxor \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
	movdqa \TMP2, \TMP3
	pslldq $8, \TMP3		# left shift TMP3 2 DWs
	psrldq $8, \TMP2		# right shift TMP2 2 DWs
	pxor \TMP3, \GH
	pxor \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa \GH, \TMP2
	movdqa \GH, \TMP3
	movdqa \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld $31, \TMP2		# packed left shift <<31
	pslld $30, \TMP3		# packed left shift <<30
	pslld $25, \TMP4		# packed left shift <<25
	pxor \TMP3, \TMP2		# xor the shifted versions
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP5
	psrldq $4, \TMP5		# right shift TMP5 1 DW
	pslldq $12, \TMP2		# left shift TMP2 3 DWs
	pxor \TMP2, \GH

	# second phase of the reduction

	movdqa \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa \GH,\TMP3
	movdqa \GH,\TMP4
	psrld $1,\TMP2			# packed right shift >>1
	psrld $2,\TMP3			# packed right shift >>2
	psrld $7,\TMP4			# packed right shift >>7
	pxor \TMP3,\TMP2		# xor the shifted versions
	pxor \TMP4,\TMP2
	pxor \TMP5, \TMP2
	pxor \TMP2, \GH
	pxor \TMP1, \GH			# fold in the high half; result is in GH
.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Operands (as used by the code below):
*   num_initial_blocks, operation  only used to make the local labels unique;
*                operation (enc/dec) also selects which data is hashed
*   TMP3         in: HashKey<<1 (mod poly); used as the GHASH multiplier
*   XMM0         counter block, loaded from arg5 and kept byte-reflected;
*                left at the last counter value used
*   XMM1..XMM4   when %r13 >= 64: the next four blocks, byte-reflected for
*                GHASH, with the running hash already xored into XMM1
*   XMMDst       running GHASH value; the hash chain below is hard-wired to
*                end in %xmm8, which is also what the visible caller passes
*   i            number of the xmm register that receives the AAD
*   i_seq        digits naming the xmm registers of the initial blocks
*                (e.g. 678); 0 means there are none
*   %r13         in: number of bytes in whole blocks still to process
*   %r11         out: byte offset of the data consumed so far
*
* The AAD is read 4 bytes at a time and the loop only terminates when the
* length reaches exactly zero, so aadLen must be a non-zero multiple of 4
* no larger than 16.
*/

.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation

	mov arg7, %r10			# %r10 = AAD
	mov arg8, %r12			# %r12 = aadLen
	mov %r12, %r11
	pxor %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd (%r10), \TMP1		# shift the next AAD dword in from the top
	pslldq $12, \TMP1
	psrldq $4, %xmm\i
	pxor \TMP1, %xmm\i
	add $4, %r10
	sub $4, %r12
	jne _get_AAD_loop\num_initial_blocks\operation
	cmp $16, %r11
	je _get_AAD_loop2_done\num_initial_blocks\operation
	mov $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq $4, %xmm\i		# move a short AAD down to the low bytes
	sub $4, %r12
	cmp %r11, %r12
	jne _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	pshufb SHUF_MASK(%rip), %xmm\i	# byte-reflect the AAD data
	xor %r11, %r11	# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov %arg5, %rax			# %rax = *Y0
	movdqu (%rax), \XMM0		# XMM0 = Y0
	pshufb SHUF_MASK(%rip), \XMM0
.if \i_seq != 0
.irpc index, \i_seq
	paddd ONE(%rip), \XMM0		# INCR Y0
	movdqa \XMM0, %xmm\index
	pshufb SHUF_MASK(%rip), %xmm\index	# perform a 16 byte swap
.endr
.irpc index, \i_seq
	pxor 16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
	movaps 0x10(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC \TMP1, %xmm\index	# Round 9
.endr
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index	# Round 10
.endr
.irpc index, \i_seq
	movdqu (%arg3, %r11, 1), \TMP1
	pxor \TMP1, %xmm\index
	movdqu %xmm\index, (%arg2, %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add $16, %r11
.if \operation == dec
	movdqa \TMP1, %xmm\index	# decrypt: hash the input ciphertext
.endif
	pshufb SHUF_MASK(%rip), %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor %xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor %xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp $64, %r13
	jb _initial_blocks_done\num_initial_blocks\operation
	# %r13 is a byte count, hence the unsigned compare
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	paddd ONE(%rip), \XMM0		# INCR Y0
	movdqa \XMM0, \XMM1
	pshufb SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
	paddd ONE(%rip), \XMM0		# INCR Y0
	movdqa \XMM0, \XMM2
	pshufb SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
	paddd ONE(%rip), \XMM0		# INCR Y0
	movdqa \XMM0, \XMM3
	pshufb SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
	paddd ONE(%rip), \XMM0		# INCR Y0
	movdqa \XMM0, \XMM4
	pshufb SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
	pxor 16*0(%arg1), \XMM1
	pxor 16*0(%arg1), \XMM2
	pxor 16*0(%arg1), \XMM3
	pxor 16*0(%arg1), \XMM4
	movdqa \TMP3, \TMP5
	pshufd $78, \TMP3, \TMP1
	pxor \TMP3, \TMP1
	movdqa \TMP1, HashKey_k(%rsp)
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd $78, \TMP5, \TMP1
	pxor \TMP5, \TMP1
	movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
.endr
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa \TMP5, HashKey_3(%rsp)
	pshufd $78, \TMP5, \TMP1
	pxor \TMP5, \TMP1
	movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
.endr
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa \TMP5, HashKey_4(%rsp)
	pshufd $78, \TMP5, \TMP1
	pxor \TMP5, \TMP1
	movdqa \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu 16*0(%arg3, %r11, 1), \TMP1
	pxor \TMP1, \XMM1
.if \operation == dec
	movdqu \XMM1, 16*0(%arg2, %r11, 1)
	movdqa \TMP1, \XMM1
.endif
	movdqu 16*1(%arg3, %r11, 1), \TMP1
	pxor \TMP1, \XMM2
.if \operation == dec
	movdqu \XMM2, 16*1(%arg2, %r11, 1)
	movdqa \TMP1, \XMM2
.endif
	movdqu 16*2(%arg3, %r11, 1), \TMP1
	pxor \TMP1, \XMM3
.if \operation == dec
	movdqu \XMM3, 16*2(%arg2, %r11, 1)
	movdqa \TMP1, \XMM3
.endif
	movdqu 16*3(%arg3, %r11, 1), \TMP1
	pxor \TMP1, \XMM4
.if \operation == dec
	movdqu \XMM4, 16*3(%arg2, %r11, 1)
	movdqa \TMP1, \XMM4
.else
	movdqu \XMM1, 16*0(%arg2, %r11, 1)
	movdqu \XMM2, 16*1(%arg2, %r11, 1)
	movdqu \XMM3, 16*2(%arg2, %r11, 1)
	movdqu \XMM4, 16*3(%arg2, %r11, 1)
.endif
	add $64, %r11
	pshufb SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
	pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	pshufb SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
	pshufb SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
	pshufb SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*
* Operands (as used by the code below):
*   XMM0         counter (byte-reflected form); advanced by 4
*   XMM1..XMM4   in: the previous four blocks, prepared for GHASH
*                out: the four blocks just processed, prepared for GHASH,
*                with the new running hash xored into XMM1
*   XMM5..XMM8   scratch copies of the incoming XMM1..XMM4
*   TMP1..TMP6   scratch
*   operation    enc or dec; for dec the input ciphertext is what gets hashed
* HashKey .. HashKey_4 and HashKey_k .. HashKey_4_k are read from the stack
* area at %rsp.  The macro does not advance %r11; the caller does.
* The AES rounds and the carry-less multiplies are interleaved, so the
* instruction order is deliberate.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa \XMM1, \XMM5
	movdqa \XMM2, \XMM6
	movdqa \XMM3, \XMM7
	movdqa \XMM4, \XMM8

	# multiply XMM5 * HashKey^4 using karatsuba

	movdqa \XMM5, \TMP4
	pshufd $78, \XMM5, \TMP6
	pxor \XMM5, \TMP6
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa \XMM0, \XMM1
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa \XMM0, \XMM2
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa \XMM0, \XMM3
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa \XMM0, \XMM4
	pshufb SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	pshufb SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
	pshufb SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
	pshufb SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
	pxor (%arg1), \XMM1
	pxor (%arg1), \XMM2
	pxor (%arg1), \XMM3
	pxor (%arg1), \XMM4
	movdqa HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC \TMP1, \XMM1		# Round 1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC \TMP1, \XMM1		# Round 2
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4

	# multiply XMM6 * HashKey^3 using karatsuba

	movdqa \XMM6, \TMP1
	pshufd $78, \XMM6, \TMP2
	pxor \XMM6, \TMP2
	movdqa HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 3
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 4
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 5
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor \XMM6, \XMM5
	pxor \TMP2, \TMP6
	movdqa \XMM7, \TMP1
	pshufd $78, \XMM7, \TMP2
	pxor \XMM7, \TMP2
	movdqa HashKey_2(%rsp), \TMP5

	# Multiply XMM7 * HashKey^2 using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 6
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 7
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 8
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor \XMM7, \XMM5
	pxor \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa \XMM8, \TMP1
	pshufd $78, \XMM8, \TMP2
	pxor \XMM8, \TMP2
	movdqa HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 9
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	movaps 0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1		# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu (%arg3,%r11,1), \TMP3
	pxor \TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu \XMM1, (%arg2,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM1
.endif
	movdqu 16(%arg3,%r11,1), \TMP3
	pxor \TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM2
.endif
	movdqu 32(%arg3,%r11,1), \TMP3
	pxor \TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM3
.endif
	movdqu 48(%arg3,%r11,1), \TMP3
	pxor \TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM4
.else
	movdqu \XMM1, (%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
.endif
	pshufb SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
	pshufb SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
	pshufb SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
	pshufb SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap

	pxor \TMP4, \TMP1
	pxor \XMM8, \XMM5
	pxor \TMP6, \TMP2
	pxor \TMP1, \TMP2
	pxor \XMM5, \TMP2
	movdqa \TMP2, \TMP3
	pslldq $8, \TMP3		# left shift TMP3 2 DWs
	psrldq $8, \TMP2		# right shift TMP2 2 DWs
	pxor \TMP3, \XMM5
	pxor \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa \XMM5, \TMP2
	movdqa \XMM5, \TMP3
	movdqa \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld $31, \TMP2		# packed left shift << 31
	pslld $30, \TMP3		# packed left shift << 30
	pslld $25, \TMP4		# packed left shift << 25
	pxor \TMP3, \TMP2		# xor the shifted versions
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP5
	psrldq $4, \TMP5		# right shift TMP5 1 DW
	pslldq $12, \TMP2		# left shift TMP2 3 DWs
	pxor \TMP2, \XMM5

	# second phase of reduction

	movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa \XMM5,\TMP3
	movdqa \XMM5,\TMP4
	psrld $1, \TMP2			# packed right shift >>1
	psrld $2, \TMP3			# packed right shift >>2
	psrld $7, \TMP4			# packed right shift >>7
	pxor \TMP3,\TMP2		# xor the shifted versions
	pxor \TMP4,\TMP2
	pxor \TMP5, \TMP2
	pxor \TMP2, \XMM5
	pxor \TMP1, \XMM5		# reduced result is in XMM5

	pxor \XMM5, \XMM1		# fold the running hash into XMM1
.endm

/*
* GHASH the last 4 ciphertext blocks.
*
* Operands (as used by the code below):
*   XMM1..XMM4   in: the four blocks to hash (XMM1 already carries the
*                running hash); all four are overwritten
*   XMMDst       out: the reduced GHASH result
*   TMP1..TMP7   scratch
* HashKey .. HashKey_4 and HashKey_k .. HashKey_4_k are read from the stack
* area at %rsp.  The four Karatsuba products are accumulated unreduced in
* TMP6 (high), XMMDst (low) and XMM1 (middle), then reduced once.
*/
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey^4 (using Karatsuba)

	movdqa \XMM1, \TMP6
	pshufd $78, \XMM1, \TMP2
	pxor \XMM1, \TMP2
	movdqa HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa \XMM1, \XMMDst
	movdqa \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey^3 (using Karatsuba)

	movdqa \XMM2, \TMP1
	pshufd $78, \XMM2, \TMP2
	pxor \XMM2, \TMP2
	movdqa HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor \TMP1, \TMP6
	pxor \XMM2, \XMMDst
	pxor \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey^2 (using Karatsuba)

	movdqa \XMM3, \TMP1
	pshufd $78, \XMM3, \TMP2
	pxor \XMM3, \TMP2
	movdqa HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor \TMP1, \TMP6
	pxor \XMM3, \XMMDst
	pxor \TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa \XMM4, \TMP1
	pshufd $78, \XMM4, \TMP2
	pxor \XMM4, \TMP2
	movdqa HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor \TMP1, \TMP6
	pxor \XMM4, \XMMDst
	pxor \XMM1, \TMP2
	pxor \TMP6, \TMP2
	pxor \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa \TMP2, \TMP4
	pslldq $8, \TMP4		# left shift TMP4 2 DWs
	psrldq $8, \TMP2		# right shift TMP2 2 DWs
	pxor \TMP4, \XMMDst
	pxor \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa \XMMDst, \TMP2
	movdqa \XMMDst, \TMP3
	movdqa \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld $31, \TMP2		# packed left shift << 31
	pslld $30, \TMP3		# packed left shift << 30
	pslld $25, \TMP4		# packed left shift << 25
	pxor \TMP3, \TMP2		# xor the shifted versions
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP7
	psrldq $4, \TMP7		# right shift TMP7 1 DW
	pslldq $12, \TMP2		# left shift TMP2 3 DWs
	pxor \TMP2, \XMMDst

	# second phase of the reduction
	movdqa \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa \XMMDst, \TMP3
	movdqa \XMMDst, \TMP4
	psrld $1, \TMP2			# packed right shift >> 1
	psrld $2, \TMP3			# packed right shift >> 2
	psrld $7, \TMP4			# packed right shift >> 7
	pxor \TMP3, \TMP2		# xor the shifted versions
	pxor \TMP4, \TMP2
	pxor \TMP7, \TMP2
	pxor \TMP2, \XMMDst
	pxor \TMP6, \XMMDst		# reduced result is in XMMDst
.endm

/*
* Encrypt a single block.
*
*   XMM0   in/out: the block; replaced by its encryption
*   TMP1   scratch xmm register, clobbered
*
* Uses the 11 round keys stored at offsets 0x00..0xa0 from arg1 (ten AES
* rounds).  The round keys are loaded with movaps, so arg1 has to be
* 16-byte aligned.
*/
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor (%arg1), \XMM0		# round 0: xor with the first round key
.irpc index, 123456789 # rounds 1..9, round key at 0x10*index
	movaps 0x10*\index(%arg1), \TMP1
	AESENC \TMP1, \XMM0
.endr
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, \XMM0		# round 10
.endm


/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[2] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                               AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
* Register use inside the function:
*       %r14   = %rsp after the three pushes; base for arg7..arg10
*       %rsp   = 64-byte aligned scratch area holding the HashKey powers
*       %r13   = bytes in whole blocks still to process, later len mod 16
*       %r11   = byte offset into the in/out buffers
*       %xmm0  = counter block, %xmm8 = running GHASH value,
*       %xmm13 = HashKey<<1 (mod poly)
*       %r12, %r13 and %r14 are saved and restored; arg4 (%rcx) is modified.
*
*****************************************************************************/

ENTRY(aesni_gcm_dec)
	push %r12
	push %r13
	push %r14
	mov %rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub $VARIABLE_OFFSET, %rsp
	and $~63, %rsp			# align rsp to 64 bytes
	mov %arg6, %r12
	movdqu (%r12), %xmm13		# %xmm13 = HashKey
	pshufb SHUF_MASK(%rip), %xmm13

# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa %xmm13, %xmm2
	psllq $1, %xmm13
	psrlq $63, %xmm2
	movdqa %xmm2, %xmm1
	pslldq $8, %xmm2
	psrldq $8, %xmm1
	por %xmm2, %xmm13

	# Reduction

	pshufd $0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand POLY(%rip), %xmm2
	pxor %xmm2, %xmm13	# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa %xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov %arg4, %r13	# save the number of bytes of plaintext/ciphertext
	and $-16, %r13			# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	and $(3<<4), %r12		# %r12 = 16 * (number of blocks mod 4)
	jz _initial_num_blocks_is_0_decrypt
	cmp $(2<<4), %r12
	jb _initial_num_blocks_is_1_decrypt
	je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub $48, %r13
	jmp _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub $32, %r13
	jmp _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub $16, %r13
	jmp _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	# %r13 is now zero or a multiple of 64; in the latter case
	# INITIAL_BLOCKS has already decrypted the first group of four
	test %r13, %r13
	jz _zero_cipher_left_decrypt
	sub $64, %r13
	je _four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add $64, %r11
	sub $64, %r13
	jne _decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov %arg4, %r13
	and $15, %r13			# %r13 = arg4 (mod 16)
	je _multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd ONE(%rip), %xmm0		# increment CNT to get Yn
	pshufb SHUF_MASK(%rip), %xmm0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub $16, %r11
	add %r13, %r11
	# NOTE(review): the load below fetches the 16 bytes that END at the
	# end of the input, so for messages shorter than 16 bytes it reads
	# before the start of the buffer -- confirm callers can tolerate it.
	movdqu (%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu (%r12), %xmm2		# get the appropriate shuffle mask
	pshufb %xmm2, %xmm1		# right shift 16-%r13 bytes
	movdqa %xmm1, %xmm2
	pxor %xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm2
	pshufb SHUF_MASK(%rip),%xmm2
	pxor %xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub %r13, %r11
	add $16, %r11			# %r11 = offset of the partial block

	# output %r13 bytes
	movq %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_decrypt
	mov %rax, (%arg2, %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	movq %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_decrypt:
	mov %al, (%arg2, %r11, 1)	# store the remaining bytes one by one
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov arg8, %r12		# %r12 = aadLen (number of bytes)
	shl $3, %r12		# convert into number of bits
	movd %r12d, %xmm15	# len(A) in %xmm15
	shl $3, %arg4		# len(C) in bits (bytes * 8)
	movq %arg4, %xmm1
	pslldq $8, %xmm15	# %xmm15 = len(A)||0x0000000000000000
	pxor %xmm1, %xmm15	# %xmm15 = len(A)||len(C)
	pxor %xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	pshufb SHUF_MASK(%rip), %xmm8
	mov %arg5, %rax			# %rax = *Y0
	movdqu (%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor %xmm8, %xmm0		# %xmm0 = authentication tag
_return_T_decrypt:
	mov arg9, %r10			# %r10 = authTag
	mov arg10, %r11			# %r11 = auth_tag_len
	cmp $16, %r11
	je _T_16_decrypt
	cmp $12, %r11
	je _T_12_decrypt
_T_8_decrypt:
	# any length other than 16 or 12 is handled as 8
	movq %xmm0, %rax
	mov %rax, (%r10)
	jmp _return_T_done_decrypt
_T_12_decrypt:
	movq %xmm0, %rax
	mov %rax, (%r10)
	psrldq $8, %xmm0
	movd %xmm0, %eax
	mov %eax, 8(%r10)
	jmp _return_T_done_decrypt
_T_16_decrypt:
	movdqu %xmm0, (%r10)
_return_T_done_decrypt:
	mov %r14, %rsp
	pop %r14
	pop %r13
	pop %r12
	ret


988/*****************************************************************************
989* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
990* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
991* const u8 *in, // Plaintext input
992* u64 plaintext_len, // Length of data in bytes for encryption.
993* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
994* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
995* // concatenated with 0x00000001. 16-byte aligned pointer.
996* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
997* const u8 *aad, // Additional Authentication Data (AAD)
998* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
999* u8 *auth_tag, // Authenticated Tag output.
1000* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1001* // 12 or 8.
1002*
1003* Assumptions:
1004*
1005* keys:
1006* keys are pre-expanded and aligned to 16 bytes. we are using the
1007* first set of 11 keys in the data structure void *aes_ctx
1008*
1009*
1010* iv:
1011* 0 1 2 3
1012* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1013* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1014* | Salt (From the SA) |
1015* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1016* | Initialization Vector |
1017* | (This is the sequence number from IPSec header) |
1018* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1019* | 0x1 |
1020* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1021*
1022*
1023*
1024* AAD:
1025* AAD padded to 128 bits with 0
1026* for example, assume AAD is a u32 vector
1027*
1028* if AAD is 8 bytes:
1029* AAD[3] = {A0, A1};
1030* padded AAD in xmm register = {A1 A0 0 0}
1031*
1032* 0 1 2 3
1033* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1034* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1035* | SPI (A1) |
1036* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1037* | 32-bit Sequence Number (A0) |
1038* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1039* | 0x0 |
1040* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1041*
1042* AAD Format with 32-bit Sequence Number
1043*
1044* if AAD is 12 bytes:
1045* AAD[3] = {A0, A1, A2};
1046* padded AAD in xmm register = {A2 A1 A0 0}
1047*
1048* 0 1 2 3
1049* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1050* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1051* | SPI (A2) |
1052* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1053* | 64-bit Extended Sequence Number {A1,A0} |
1054* | |
1055* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1056* | 0x0 |
1057* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1058*
1059* AAD Format with 64-bit Extended Sequence Number
1060*
1061* aadLen:
1062* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1063* The code supports 16 too but for other sizes, the code will fail.
1064*
1065* TLen:
1066* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1067* For other sizes, the code will fail.
1068*
1069* poly = x^128 + x^127 + x^126 + x^121 + 1
1070***************************************************************************/
1071ENTRY(aesni_gcm_enc)
# Prologue: save %r12-%r14 (all three are overwritten below) and keep the
# incoming stack pointer in %r14 so the aligned scratch area can be dropped
# again at _return_T_done_encrypt.
1072 push %r12
1073 push %r13
1074 push %r14
1075 mov %rsp, %r14
1076#
1077# states of %xmm registers %xmm6:%xmm15 not saved
1078# all %xmm registers are clobbered
1079#
1080 sub $VARIABLE_OFFSET, %rsp
1081 and $~63, %rsp # round the scratch area down to a 64-byte boundary
1082 mov %arg6, %r12
1083 movdqu (%r12), %xmm13 # %xmm13 = hash subkey H
1084 pshufb SHUF_MASK(%rip), %xmm13 # byte-reverse H
1085
1086# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1087
1088 movdqa %xmm13, %xmm2
1089 psllq $1, %xmm13
1090 psrlq $63, %xmm2
1091 movdqa %xmm2, %xmm1
1092 pslldq $8, %xmm2
1093 psrldq $8, %xmm1
1094 por %xmm2, %xmm13
1095
1096 # reduce HashKey<<1
1097
1098 pshufd $0x24, %xmm1, %xmm2
1099 pcmpeqd TWOONE(%rip), %xmm2
1100 pand POLY(%rip), %xmm2
1101 pxor %xmm2, %xmm13
1102 movdqa %xmm13, HashKey(%rsp)
1103 mov %arg4, %r13 # %r13 = plaintext_len; %xmm13 holds HashKey<<1 (mod poly)
1104 and $-16, %r13 # %r13 = length of the whole 16-byte blocks only
1105 mov %r13, %r12
1106
1107 # Encrypt first few blocks
1108 # (%r12 = 16 * (number of whole blocks mod 4) selects the variant below)
1109 and $(3<<4), %r12
1110 jz _initial_num_blocks_is_0_encrypt
1111 cmp $(2<<4), %r12
1112 jb _initial_num_blocks_is_1_encrypt
1113 je _initial_num_blocks_is_2_encrypt
1114_initial_num_blocks_is_3_encrypt:
1115 INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1116%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1117 sub $48, %r13
1118 jmp _initial_blocks_encrypted
1119_initial_num_blocks_is_2_encrypt:
1120 INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1121%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1122 sub $32, %r13
1123 jmp _initial_blocks_encrypted
1124_initial_num_blocks_is_1_encrypt:
1125 INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1126%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1127 sub $16, %r13
1128 jmp _initial_blocks_encrypted
1129_initial_num_blocks_is_0_encrypt:
1130 INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1131%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1132_initial_blocks_encrypted:
1133
1134 # Main loop - Encrypt remaining blocks
1135 # (%r13 = remaining whole-block bytes, a multiple of 64 at this point)
1136 cmp $0, %r13
1137 je _zero_cipher_left_encrypt
1138 sub $64, %r13
1139 je _four_cipher_left_encrypt
1140_encrypt_by_4_encrypt:
1141 GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1142%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1143 add $64, %r11
1144 sub $64, %r13
1145 jne _encrypt_by_4_encrypt
1146_four_cipher_left_encrypt:
1147 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1148%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1149_zero_cipher_left_encrypt:
1150 mov %arg4, %r13
1151 and $15, %r13 # %r13 = arg4 (mod 16)
1152 je _multiple_of_16_bytes_encrypt
1153
1154 # Handle the last <16 Byte block separately
1155 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1156 pshufb SHUF_MASK(%rip), %xmm0
1157 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
 # NOTE(review): %r11 is presumably the running byte offset into in/out
 # maintained by INITIAL_BLOCKS and the main loop - confirm. If so, the
 # 16-byte load below covers the 16 bytes that END at the last input byte,
 # which starts before `in` whenever plaintext_len < 16 - verify that
 # callers guarantee this is safe.
1158 sub $16, %r11
1159 add %r13, %r11
1160 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1161 lea SHIFT_MASK+16(%rip), %r12
1162 sub %r13, %r12
1163 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1164 # (%r13 is the number of bytes in plaintext mod 16)
1165 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1166 pshufb %xmm2, %xmm1 # shift right 16-r13 byte
1167 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1168 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1169 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1170 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1171
1172 pshufb SHUF_MASK(%rip),%xmm0
1173 pxor %xmm0, %xmm8
1174 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1175 # GHASH computation for the last <16 byte block
1176 sub %r13, %r11 # undo the offset adjustment made before the tail load
1177 add $16, %r11
1178 pshufb SHUF_MASK(%rip), %xmm0
1179 # shuffle xmm0 back to output as ciphertext
1180
1181 # Output %r13 bytes
 # (one 8-byte store when more than 8 bytes remain, then byte by byte)
1182 movq %xmm0, %rax
1183 cmp $8, %r13
1184 jle _less_than_8_bytes_left_encrypt
1185 mov %rax, (%arg2 , %r11, 1)
1186 add $8, %r11
1187 psrldq $8, %xmm0
1188 movq %xmm0, %rax
1189 sub $8, %r13
1190_less_than_8_bytes_left_encrypt:
1191 mov %al, (%arg2, %r11, 1)
1192 add $1, %r11
1193 shr $8, %rax
1194 sub $1, %r13
1195 jne _less_than_8_bytes_left_encrypt
1196_multiple_of_16_bytes_encrypt:
1197 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1198 shl $3, %r12 # convert into number of bits
1199 movd %r12d, %xmm15 # len(A) in %xmm15 (low 32 bits of the bit count only)
1200 shl $3, %arg4 # len(C) in bits (bytes * 8)
1201 movq %arg4, %xmm1
1202 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1203 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1204 pxor %xmm15, %xmm8
1205 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1206 # final GHASH computation
1207
1208 pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap
1209 mov %arg5, %rax # %rax = *Y0
1210 movdqu (%rax), %xmm0 # %xmm0 = Y0
1211 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1212 pxor %xmm8, %xmm0 # tag T = GHASH result XOR Encrypt(K, Y0)
1213_return_T_encrypt:
 # Store the tag: 16 or 12 bytes when auth_tag_len says so; any other
 # value falls through to the 8-byte store.
1214 mov arg9, %r10 # %r10 = authTag
1215 mov arg10, %r11 # %r11 = auth_tag_len
1216 cmp $16, %r11
1217 je _T_16_encrypt
1218 cmp $12, %r11
1219 je _T_12_encrypt
1220_T_8_encrypt:
1221 movq %xmm0, %rax
1222 mov %rax, (%r10)
1223 jmp _return_T_done_encrypt
1224_T_12_encrypt:
1225 movq %xmm0, %rax
1226 mov %rax, (%r10)
1227 psrldq $8, %xmm0
1228 movd %xmm0, %eax
1229 mov %eax, 8(%r10)
1230 jmp _return_T_done_encrypt
1231_T_16_encrypt:
1232 movdqu %xmm0, (%r10)
1233_return_T_done_encrypt:
 # Epilogue: drop the scratch area and restore the saved registers.
1234 mov %r14, %rsp
1235 pop %r14
1236 pop %r13
1237 pop %r12
1238 ret
1239
1240
1241
/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 * Derive one round key in %xmm0 and append it to the key schedule.
 * input:
 *	%xmm0:	round key to update
 *	%xmm1:	AESKEYGENASSIST result for this round
 *	%xmm4:	scratch whose dword 0 must be zero (aesni_set_key clears it;
 *		both shuffles below keep dword 0 zero)
 *	%rcx:	address of the slot that receives the new round key
 * output:
 *	%xmm0:	new round key, also stored at the old (%rcx)
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
_key_expansion_128:
_key_expansion_256a:
	# turn the four dwords of %xmm0 into their running (prefix) XOR
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0xff, %xmm1, %xmm1	# broadcast dword 3 of the assist value
	pxor %xmm1, %xmm0
	movaps %xmm0, (%rcx)		# needs a 16-byte aligned schedule
	add $16, %rcx
	ret
1253
/*
 * _key_expansion_192a: internal ABI
 * Advance the 192-bit key state (%xmm0 plus the low qword of %xmm2) by one
 * step and store two 16-byte round keys.
 * input:
 *	%xmm0:	low 128 bits of the key state
 *	%xmm2:	remaining key state
 *	%xmm1:	AESKEYGENASSIST result for this round
 *	%xmm4:	scratch whose dword 0 must be zero
 *	%rcx:	address of the next free round-key slot
 * output:
 *	(%rcx):	  low qword of the old %xmm2, then low qword of the new %xmm0
 *	16(%rcx): high qword of the new %xmm0, then low qword of the new %xmm2
 *	%xmm0, %xmm2: updated key state
 *	%rcx:	advanced by 32
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
_key_expansion_192a:
	movaps %xmm2, %xmm6		# keep the old %xmm2 for the first store
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5		# old %xmm2 shifted up by one dword

	# turn the four dwords of %xmm0 into their running (prefix) XOR
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0

	pshufd $0xff, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	shufps $0x44, %xmm0, %xmm6
	movaps %xmm6, (%rcx)
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm2, %xmm1
	movaps %xmm1, 16(%rcx)
	add $32, %rcx
	ret
1276
/*
 * _key_expansion_192b: internal ABI
 * Advance the 192-bit key state (%xmm0 plus %xmm2) by one step and store
 * one 16-byte round key.
 * input:
 *	%xmm0:	low 128 bits of the key state
 *	%xmm2:	remaining key state
 *	%xmm1:	AESKEYGENASSIST result for this round
 *	%xmm4:	scratch whose dword 0 must be zero
 *	%rcx:	address of the next free round-key slot
 * output:
 *	(%rcx):	the new %xmm0
 *	%xmm0, %xmm2: updated key state
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
_key_expansion_192b:
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5		# old %xmm2 shifted up by one dword

	# turn the four dwords of %xmm0 into their running (prefix) XOR
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0
	movaps %xmm0, (%rcx)

	pshufd $0xff, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	add $16, %rcx
	ret
1294
/*
 * _key_expansion_256b: internal ABI
 * Derive one round key in %xmm2 and append it to the key schedule.
 * input:
 *	%xmm2:	round key to update
 *	%xmm1:	AESKEYGENASSIST result for this step
 *	%xmm4:	scratch whose dword 0 must be zero
 *	%rcx:	address of the slot that receives the new round key
 * output:
 *	%xmm2:	new round key, also stored at the old (%rcx)
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
_key_expansion_256b:
	# turn the four dwords of %xmm2 into their running (prefix) XOR
	shufps $0x10, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0x8c, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pshufd $0xaa, %xmm1, %xmm1	# broadcast dword 2 of the assist value
	pxor %xmm1, %xmm2
	movaps %xmm2, (%rcx)
	add $16, %rcx
	ret
1305
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expand the user key into the encryption round keys (stored from offset 0
 * of ctx) and the matching decryption round keys (stored from offset 240),
 * and record key_len at offset 480.
 *
 * input:
 *	%rdi:	ctx; written with movaps, so it must be 16-byte aligned
 *	%rsi:	in_key; read with unaligned loads
 *	%edx:	key_len in bytes; below 24 selects the 128-bit schedule,
 *		exactly 24 the 192-bit one, anything else the 256-bit one.
 *		The value is not validated here.
 * output:
 *	%rax:	always 0
 * changed:
 *	%rcx, %rsi, %rdi, %xmm0-%xmm6
 */
ENTRY(aesni_set_key)
	movups (%rsi), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (%rdi)
	lea 0x10(%rdi), %rcx		# key addr
	movl %edx, 480(%rdi)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %edx			# compare the full 32-bit length that was just stored
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(%rsi), %xmm2	# other user key
	movaps %xmm2, (%rcx)
	add $0x10, %rcx
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(%rsi), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule at ctx + 240: the encryption round keys
	# in reverse order. The first and last keys are copied unchanged; every
	# key in between goes through AESIMC.
	sub $0x10, %rcx			# %rcx = last encryption round key
	movaps (%rdi), %xmm0
	movaps (%rcx), %xmm1
	movaps %xmm0, 240(%rcx)		# first enc key -> last dec slot
	movaps %xmm1, 240(%rdi)		# last enc key -> first dec slot
	add $0x10, %rdi			# %rdi walks the enc keys upwards
	lea 240-16(%rcx), %rsi		# %rsi walks the dec slots downwards
.align 4
.Ldec_key_loop:
	movaps (%rdi), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (%rsi)
	add $0x10, %rdi
	sub $0x10, %rsi
	cmp %rcx, %rdi
	jb .Ldec_key_loop
	xor %eax, %eax			# return 0
	ret
1408
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src and store the result at dst.
 * Neither src nor dst needs to be aligned (movups is used for both).
 */
ENTRY(aesni_enc)
	movups (INP), STATE		# input block
	movl 480(KEYP), KLEN		# key length, kept at offset 480 of ctx
	call _aesni_enc1
	movups STATE, (OUTP)		# output block
	ret
1418
/*
 * _aesni_enc1:		internal ABI
 * Encrypt one block held in STATE.
 * input:
 *	KEYP:		pointer to the first encryption round key
 *	KLEN:		key length in bytes (compared against 24 to pick the
 *			128-, 192- or 256-bit round sequence)
 *	STATE:		block to encrypt
 * output:
 *	STATE:		encrypted block
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so that the last ten round keys sit at the same offsets
 * (-0x20 .. 0x70) for every key size; longer keys just run the extra rounds
 * at -0x60 .. -0x30 first.
 */
_aesni_enc1:
	lea 0x30(KEYP), TKEYP		# bias for 128-bit keys
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
1474
/*
 * _aesni_enc4:	internal ABI
 * Encrypt four independent blocks with the same key schedule.
 * input:
 *	KEYP:		pointer to the first encryption round key
 *	KLEN:		key length in bytes (compared against 24 to pick the
 *			128-, 192- or 256-bit round sequence)
 *	STATE1..STATE4:	blocks to encrypt
 * output:
 *	STATE1..STATE4:	encrypted blocks
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Each round key is loaded once and applied to all four states. Unlike
 * _aesni_enc1, the .L4enc192/.L4enc128 entry points carry no .align
 * padding (the directives are commented out in the original).
 */
_aesni_enc4:
	lea 0x30(KEYP), TKEYP		# bias for 128-bit keys
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
1581
/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src and store the result at dst.
 * Neither src nor dst needs to be aligned (movups is used for both).
 */
ENTRY(aesni_dec)
	movups (INP), STATE		# input block
	movl 480(KEYP), KLEN		# key length, kept at offset 480 of ctx
	add $240, KEYP			# switch to the decryption round keys
	call _aesni_dec1
	movups STATE, (OUTP)		# output block
	ret
1592
/*
 * _aesni_dec1:		internal ABI
 * Decrypt one block held in STATE.
 * input:
 *	KEYP:		pointer to the first decryption round key (the
 *			callers in this file pass ctx + 240)
 *	KLEN:		key length in bytes (compared against 24 to pick the
 *			128-, 192- or 256-bit round sequence)
 *	STATE:		block to decrypt
 * output:
 *	STATE:		decrypted block
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so that the last ten round keys sit at the same offsets
 * (-0x20 .. 0x70) for every key size; longer keys just run the extra rounds
 * at -0x60 .. -0x30 first.
 */
_aesni_dec1:
	lea 0x30(KEYP), TKEYP		# bias for 128-bit keys
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
1648
/*
 * _aesni_dec4:	internal ABI
 * Decrypt four independent blocks with the same key schedule.
 * input:
 *	KEYP:		pointer to the first decryption round key (the
 *			callers in this file pass ctx + 240)
 *	KLEN:		key length in bytes (compared against 24 to pick the
 *			128-, 192- or 256-bit round sequence)
 *	STATE1..STATE4:	blocks to decrypt
 * output:
 *	STATE1..STATE4:	decrypted blocks
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Each round key is loaded once and applied to all four states.
 */
_aesni_dec4:
	lea 0x30(KEYP), TKEYP		# bias for 128-bit keys
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
1755
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt the whole 16-byte blocks of src into dst: four blocks per
 * iteration while at least 64 bytes remain, then one block at a time.
 * A trailing partial block (len mod 16 bytes) is neither read nor written.
 * INP, OUTP and LEN are advanced/consumed; KLEN is loaded from ctx.
 *
 * len is unsigned, so every length test uses the unsigned conditions
 * (jb/jae).
 */
ENTRY(aesni_ecb_enc)
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN	# key length, kept at offset 480 of ctx
	cmp $16, LEN
	jb .Lecb_enc_ret	# not even one whole block
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	# NOTE(review): _aesni_enc1 works on STATE, so this relies on STATE1
	# and STATE naming the same register - confirm against the macro
	# definitions at the top of the file.
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1
.Lecb_enc_ret:
	ret
1798
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt the whole 16-byte blocks of src into dst: four blocks per
 * iteration while at least 64 bytes remain, then one block at a time.
 * A trailing partial block (len mod 16 bytes) is neither read nor written.
 * KEYP, INP, OUTP and LEN are advanced/consumed; KLEN is loaded from ctx.
 *
 * len is unsigned, so every length test uses the unsigned conditions
 * (jb/jae).
 */
ENTRY(aesni_ecb_dec)
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN	# key length, kept at offset 480 of ctx
	add $240, KEYP		# switch to the decryption round keys
	cmp $16, LEN
	jb .Lecb_dec_ret	# not even one whole block
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	# NOTE(review): _aesni_dec1 works on STATE, so this relies on STATE1
	# and STATE naming the same register - confirm against the macro
	# definitions at the top of the file.
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1
.Lecb_dec_ret:
	ret
1842
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt len bytes from src to dst.  STATE carries the chaining
 * value: it starts as the 16 bytes at iv, is XORed with each input
 * block, encrypted with _aesni_enc1 and stored as the output block.
 * After the last whole block the final chaining value is written back
 * to iv.
 *
 * Data is read through INP and written through OUTP.  The mapping of
 * those register aliases to the C arguments is defined outside this
 * block; dst/src above assume OUTP is dst and INP is src, as the names
 * suggest.
 *
 * Only whole blocks are processed.  When len < 16 the function returns
 * immediately and iv is not touched.
 *
 * LEN is an unsigned size, so every length comparison uses the unsigned
 * condition codes (jb / jae).
 */
ENTRY(aesni_cbc_enc)
	cmp $16, LEN
	jb .Lcbc_enc_ret		# less than one block: nothing to do
	mov 480(KEYP), KLEN
	movups (IVP), STATE		# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# load input
	pxor IN, STATE			# chain: input ^ previous output (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)		# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop		# unsigned: at least 16 bytes remain
	movups STATE, (IVP)		# hand the chaining value back
.Lcbc_enc_ret:
	ret
1866
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes from src to dst.  Each input block is decrypted
 * (_aesni_dec4 / _aesni_dec1) and XORed with the previous input block,
 * or with the 16 bytes at iv for the first one.  A copy of every input
 * block is kept in IN / IN1..IN4 because it is still needed after the
 * decryption, both for that XOR and as the next chaining value.  The
 * last input block processed is written back to iv.
 *
 * Data is read through INP and written through OUTP.  The mapping of
 * those register aliases to the C arguments is defined outside this
 * block; dst/src above assume OUTP is dst and INP is src, as the names
 * suggest.
 *
 * Only whole blocks are processed.  When len < 16 the function returns
 * immediately and iv is not touched.
 *
 * KLEN is loaded from offset 480 of the context and KEYP is advanced by
 * 240 bytes (presumably to the decryption key schedule of struct
 * crypto_aes_ctx - confirm against the C definition).
 *
 * LEN is an unsigned size, so every length comparison uses the unsigned
 * condition codes (jb / jae).
 */
ENTRY(aesni_cbc_dec)
	cmp $16, LEN
	jb .Lcbc_dec_just_ret		# less than one block: leave iv alone
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1		# fewer than four blocks: go one by one
.align 4
.Lcbc_dec_loop4:			# four blocks per iteration
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
	call _aesni_dec4
	pxor IV, STATE1			# block 0 chains from the running IV
	pxor IN1, STATE2		# blocks 1..3 chain from the prior input
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# last input block is the next IV
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4		# unsigned: at least 64 bytes remain
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:			# remaining whole blocks, one at a time
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this input block is the next IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1		# unsigned: at least 16 bytes remain
.Lcbc_dec_ret:
	movups IV, (IVP)		# hand the chaining value back
.Lcbc_dec_just_ret:
	ret
Huang Ying12387a42010-03-10 18:28:55 +08001923
/*
 * Byte shuffle control used with PSHUFB_XMM: selects source bytes
 * 15, 14, ..., 0 in that order, i.e. reverses all 16 bytes of an XMM
 * register.  Kept 16-byte aligned because it is loaded with movaps.
 */
.align 16
.Lbswap_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
1927
/*
 * _aesni_inc_init:		internal ABI
 *	Prepare the register state that _aesni_inc relies on.
 * input:
 *	IV:		counter block, big endian
 * output:
 *	BSWAP_MASK:	byte-reversal mask loaded from .Lbswap_mask
 *	CTR:		byte-reversed copy of IV (little endian)
 *	INC:		the constant 1 in its low qword
 *	TCTR_LOW:	low qword of CTR
 * note:
 *	TCTR_LOW doubles as the scratch register used to build INC, so
 *	INC has to be set up before TCTR_LOW receives its final value.
 */
_aesni_inc_init:
	mov $1, TCTR_LOW		# build INC = 1 via the scalar register
	MOVQ_R64_XMM TCTR_LOW INC
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# CTR = IV with its bytes reversed
	MOVQ_R64_XMM CTR TCTR_LOW	# scalar shadow of the low counter qword
	ret
1947
/*
 * _aesni_inc:			internal ABI
 *	Advance the big-endian counter block in IV by one.
 * input (as left by _aesni_inc_init or a previous _aesni_inc):
 *	CTR:		current counter, little endian
 *	TCTR_LOW:	low qword of CTR
 *	INC:		the constant 1 in its low qword
 *	BSWAP_MASK:	byte-reversal mask
 * output:
 *	IV:		counter + 1, big endian
 * changed:
 *	CTR:		counter + 1, little endian
 *	TCTR_LOW:	low qword of the new CTR
 * preserved:
 *	INC is shifted temporarily on the carry path and restored before
 *	returning.
 */
_aesni_inc:
	add $1, TCTR_LOW		# scalar shadow; CF = low qword wrapped
	paddq INC, CTR			# low qword + 1 (SSE op, CF stays intact)
	jnc .Linc_low
	pslldq $8, INC			# carry: move the 1 into the high qword,
	paddq INC, CTR			# propagate it into CTR,
	psrldq $8, INC			# and put INC back
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big endian for the caller
	ret
1974
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode processing of len bytes from src to dst.  For every 16-byte
 * block the current counter block (IV) is copied into a STATE register,
 * the counter is advanced with _aesni_inc, the copy is encrypted
 * (_aesni_enc4 / _aesni_enc1) and the result is XORed with the input
 * block.  The first block therefore uses the counter exactly as loaded
 * from iv, and the counter following the last one used is written back
 * to iv.
 *
 * Data is read through INP and written through OUTP.  The mapping of
 * those register aliases to the C arguments is defined outside this
 * block; dst/src above assume OUTP is dst and INP is src, as the names
 * suggest.
 *
 * Only whole blocks are processed.  When len < 16 the function returns
 * immediately and iv is not touched.
 *
 * LEN is an unsigned size, so every length comparison uses the unsigned
 * condition codes (jb / jae).
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret		# less than one block: leave iv alone
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init		# set up CTR, TCTR_LOW, INC, BSWAP_MASK
	cmp $64, LEN
	jb .Lctr_enc_loop1		# fewer than four blocks: go one by one
.align 4
.Lctr_enc_loop4:			# four blocks per iteration
	movaps IV, STATE1		# counter for block 0
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2		# counter for block 1
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3		# counter for block 2
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4		# counter for block 3
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4		# STATE1..4 = encrypted counters
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4		# unsigned: at least 64 bytes remain
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:			# remaining whole blocks, one at a time
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1		# unsigned: at least 16 bytes remain
.Lctr_enc_ret:
	movups IV, (IVP)		# hand the next counter back
.Lctr_enc_just_ret:
	ret