/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is an optimized implementation of AES128/192/256 CTR mode. It
 * requires the Intel(R) AES-NI and AVX instruction sets.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
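 *
 * "By8" means eight counter blocks are processed per loop iteration.
 * As a reference for what the assembly below computes, a rough scalar
 * C sketch of CTR mode follows (illustrative pseudocode only; the
 * helpers aes_encrypt_block(), xor_block() and increment_be128() are
 * hypothetical names, not APIs used by this file):
 *
 *	for (i = 0; i < num_bytes / 16; i++) {
 *		u8 keystream[16];
 *
 *		aes_encrypt_block(keys, counter, keystream);
 *		xor_block(out + 16 * i, in + 16 * i, keystream);
 *		increment_be128(counter);
 *	}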
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#define CONCAT(a,b)	a##b
#define VMOVDQ		vmovdqu
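/*
 * VMOVDQ is used for the in/out data buffers, which are not assumed to
 * be 16-byte aligned; the expanded key schedule is loaded with vmovdqa
 * and therefore must be 16-byte aligned.
 */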

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey3		%xmm11
#define xkey6		%xmm12
#define xkey9		%xmm13
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

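/*
 * %xmm11..%xmm13 are double-booked: for 128-bit keys they hold the
 * resident round keys 3/6/9, for 192/256-bit keys they hold round keys
 * 4/8/12. Only one set of names is live for a given key length. xkeyA
 * and xkeyB are scratch: they carry the remaining round keys and, in
 * the final XOR stage, the input blocks read back from p_in.
 */
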
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
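/*
 * The five defines above are the System V AMD64 argument registers
 * (%rdi, %rsi, %rdx, %rcx, %r8), matching the C prototypes at the
 * bottom of this file.
 */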

#define tmp		%r10
#define DDQ(i)		CONCAT(ddq_add_,i)
#define XMM(i)		CONCAT(%xmm, i)
#define DDQ_DATA	0
#define XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

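/*
 * vpshufb mask that reverses the byte order of a 128-bit lane,
 * converting the big-endian CTR block to little-endian for the counter
 * arithmetic below (and back again before encryption).
 */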
byteswap_const:
        .octa 0x000102030405060708090A0B0C0D0E0F
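/*
 * 128-bit counter increments 1..8. Note that vpaddd adds per 32-bit
 * lane, so a carry out of the counter's low dword is not propagated;
 * later kernels handle that overflow case explicitly.
 */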
ddq_add_1:
        .octa 0x00000000000000000000000000000001
ddq_add_2:
        .octa 0x00000000000000000000000000000002
ddq_add_3:
        .octa 0x00000000000000000000000000000003
ddq_add_4:
        .octa 0x00000000000000000000000000000004
ddq_add_5:
        .octa 0x00000000000000000000000000000005
ddq_add_6:
        .octa 0x00000000000000000000000000000006
ddq_add_7:
        .octa 0x00000000000000000000000000000007
ddq_add_8:
        .octa 0x00000000000000000000000000000008

.text

/* set var_ddq_add to the ddq_add_<n> constant */

.macro setddq n
        var_ddq_add = DDQ(\n)
.endm

/* set var_xdata to xmm register <n> */
.macro setxdata n
        var_xdata = XMM(\n)
.endm

/* bind the numeric 'id' to the symbol family selected by 'name' */

.macro club name, id
.altmacro
        .if \name == DDQ_DATA
                setddq %\id
        .elseif \name == XDATA
                setxdata %\id
        .endif
.noaltmacro
.endm
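
/*
 * For example, "club XDATA, 3" sets var_xdata = %xmm3 and
 * "club DDQ_DATA, 3" sets var_ddq_add = ddq_add_3; .altmacro is needed
 * so that %\id expands to the numeric value of id.
 */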

/*
 * do_aes num_in_par load_keys key_len
 *	num_in_par: number of blocks handled in parallel (1 to 8)
 *	load_keys: if nonzero, (re)load the round keys that are
 *		otherwise kept resident in xkey0/xkey4/xkey8/xkey12
 *	key_len: KEY_128/KEY_192/KEY_256
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len
        .set by, \b
        .set load_keys, \k
        .set klen, \key_len

        .if (load_keys)
                vmovdqa 0*16(p_keys), xkey0
        .endif

        vpshufb xbyteswap, xcounter, xdata0

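        /*
         * Build counter blocks 1..by-1: add the per-block increment to
         * the little-endian counter, then byte-swap each block back to
         * big-endian form for encryption.
         */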
        .set i, 1
        .rept (by - 1)
                club DDQ_DATA, i
                club XDATA, i
                vpaddd var_ddq_add(%rip), xcounter, var_xdata
                vpshufb xbyteswap, var_xdata, var_xdata
                .set i, (i +1)
        .endr

        vmovdqa 1*16(p_keys), xkeyA

        vpxor xkey0, xdata0, xdata0
        club DDQ_DATA, by
        vpaddd var_ddq_add(%rip), xcounter, xcounter
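        /* advance the saved counter by 'by' blocks for the next call */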
182
183 .set i, 1
184 .rept (by - 1)
185 club XDATA, i
186 vpxor xkey0, var_xdata, var_xdata
187 .set i, (i +1)
188 .endr
189
190 vmovdqa 2*16(p_keys), xkeyB
191
192 .set i, 0
193 .rept by
194 club XDATA, i
195 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
196 .set i, (i +1)
197 .endr
198
        .if (klen == KEY_128)
                .if (load_keys)
                        vmovdqa 3*16(p_keys), xkey4
                .endif
        .else
                vmovdqa 3*16(p_keys), xkeyA
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                vaesenc xkeyB, var_xdata, var_xdata  /* key 2 */
                .set i, (i +1)
        .endr

        add $(16*by), p_in
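        /*
         * p_in is advanced up front; the input blocks are read back
         * below at negative offsets (i*16 - 16*by)(p_in).
         */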

        .if (klen == KEY_128)
                vmovdqa 4*16(p_keys), xkeyB
        .else
                .if (load_keys)
                        vmovdqa 4*16(p_keys), xkey4
                .endif
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                /* key 3 */
                .if (klen == KEY_128)
                        vaesenc xkey4, var_xdata, var_xdata
                .else
                        vaesenc xkeyA, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        vmovdqa 5*16(p_keys), xkeyA

        .set i, 0
        .rept by
                club XDATA, i
                /* key 4 */
                .if (klen == KEY_128)
                        vaesenc xkeyB, var_xdata, var_xdata
                .else
                        vaesenc xkey4, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen == KEY_128)
                .if (load_keys)
                        vmovdqa 6*16(p_keys), xkey8
                .endif
        .else
                vmovdqa 6*16(p_keys), xkeyB
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                vaesenc xkeyA, var_xdata, var_xdata  /* key 5 */
                .set i, (i +1)
        .endr

        vmovdqa 7*16(p_keys), xkeyA

        .set i, 0
        .rept by
                club XDATA, i
                /* key 6 */
                .if (klen == KEY_128)
                        vaesenc xkey8, var_xdata, var_xdata
                .else
                        vaesenc xkeyB, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen == KEY_128)
                vmovdqa 8*16(p_keys), xkeyB
        .else
                .if (load_keys)
                        vmovdqa 8*16(p_keys), xkey8
                .endif
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                vaesenc xkeyA, var_xdata, var_xdata  /* key 7 */
                .set i, (i +1)
        .endr

        .if (klen == KEY_128)
                .if (load_keys)
                        vmovdqa 9*16(p_keys), xkey12
                .endif
        .else
                vmovdqa 9*16(p_keys), xkeyA
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                /* key 8 */
                .if (klen == KEY_128)
                        vaesenc xkeyB, var_xdata, var_xdata
                .else
                        vaesenc xkey8, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        vmovdqa 10*16(p_keys), xkeyB

        .set i, 0
        .rept by
                club XDATA, i
                /* key 9 */
                .if (klen == KEY_128)
                        vaesenc xkey12, var_xdata, var_xdata
                .else
                        vaesenc xkeyA, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen != KEY_128)
                vmovdqa 11*16(p_keys), xkeyA
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                /* key 10 */
                .if (klen == KEY_128)
                        vaesenclast xkeyB, var_xdata, var_xdata
                .else
                        vaesenc xkeyB, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen != KEY_128)
                .if (load_keys)
                        vmovdqa 12*16(p_keys), xkey12
                .endif

                .set i, 0
                .rept by
                        club XDATA, i
                        vaesenc xkeyA, var_xdata, var_xdata  /* key 11 */
                        .set i, (i +1)
                .endr

                .if (klen == KEY_256)
                        vmovdqa 13*16(p_keys), xkeyA
                .endif

                .set i, 0
                .rept by
                        club XDATA, i
                        .if (klen == KEY_256)
                                /* key 12 */
                                vaesenc xkey12, var_xdata, var_xdata
                        .else
                                vaesenclast xkey12, var_xdata, var_xdata
                        .endif
                        .set i, (i +1)
                .endr

                .if (klen == KEY_256)
                        vmovdqa 14*16(p_keys), xkeyB

                        .set i, 0
                        .rept by
                                club XDATA, i
                                /* key 13 */
                                vaesenc xkeyA, var_xdata, var_xdata
                                .set i, (i +1)
                        .endr

                        .set i, 0
                        .rept by
                                club XDATA, i
                                /* key 14 */
                                vaesenclast xkeyB, var_xdata, var_xdata
                                .set i, (i +1)
                        .endr
                .endif
        .endif

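        /*
         * XOR the keystream with the input, two blocks at a time,
         * reusing xkeyA/xkeyB as data scratch registers.
         */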
        .set i, 0
        .rept (by / 2)
                .set j, (i+1)
                VMOVDQ (i*16 - 16*by)(p_in), xkeyA
                VMOVDQ (j*16 - 16*by)(p_in), xkeyB
                club XDATA, i
                vpxor xkeyA, var_xdata, var_xdata
                club XDATA, j
                vpxor xkeyB, var_xdata, var_xdata
                .set i, (i+2)
        .endr

        .if (i < by)
                VMOVDQ (i*16 - 16*by)(p_in), xkeyA
                club XDATA, i
                vpxor xkeyA, var_xdata, var_xdata
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                VMOVDQ var_xdata, i*16(p_out)
                .set i, (i+1)
        .endr
.endm

.macro do_aes_load val, key_len
        do_aes \val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
        do_aes \val, 0, \key_len
.endm

/*
 * Main body of the AES CTR routine: encrypt the 1..7 block remainder
 * first with do_aes_load, then loop on 8 blocks at a time with the
 * resident keys (do_aes_noload).
 */
.macro do_aes_ctrmain key_len

        cmp $16, num_bytes
        jb .Ldo_return2\key_len

        vmovdqa byteswap_const(%rip), xbyteswap
        vmovdqu (p_iv), xcounter
        vpshufb xbyteswap, xcounter, xcounter

        mov num_bytes, tmp
        and $(7*16), tmp
        jz .Lmult_of_8_blks\key_len

        /* remainder of 1 to 7 blocks: 16 <= tmp <= 7*16 */
        cmp $(4*16), tmp
        jg .Lgt4\key_len
        je .Leq4\key_len
421
422.Llt4\key_len:
423 cmp $(2*16), tmp
424 jg .Leq3\key_len
425 je .Leq2\key_len
426
427.Leq1\key_len:
428 do_aes_load 1, \key_len
429 add $(1*16), p_out
430 and $(~7*16), num_bytes
431 jz .Ldo_return2\key_len
432 jmp .Lmain_loop2\key_len
433
434.Leq2\key_len:
435 do_aes_load 2, \key_len
436 add $(2*16), p_out
437 and $(~7*16), num_bytes
438 jz .Ldo_return2\key_len
439 jmp .Lmain_loop2\key_len
440
441
442.Leq3\key_len:
443 do_aes_load 3, \key_len
444 add $(3*16), p_out
445 and $(~7*16), num_bytes
446 jz .Ldo_return2\key_len
447 jmp .Lmain_loop2\key_len
448
449.Leq4\key_len:
450 do_aes_load 4, \key_len
451 add $(4*16), p_out
452 and $(~7*16), num_bytes
453 jz .Ldo_return2\key_len
454 jmp .Lmain_loop2\key_len
455
456.Lgt4\key_len:
457 cmp $(6*16), tmp
458 jg .Leq7\key_len
459 je .Leq6\key_len
460
461.Leq5\key_len:
462 do_aes_load 5, \key_len
463 add $(5*16), p_out
464 and $(~7*16), num_bytes
465 jz .Ldo_return2\key_len
466 jmp .Lmain_loop2\key_len
467
468.Leq6\key_len:
469 do_aes_load 6, \key_len
470 add $(6*16), p_out
471 and $(~7*16), num_bytes
472 jz .Ldo_return2\key_len
473 jmp .Lmain_loop2\key_len
474
475.Leq7\key_len:
476 do_aes_load 7, \key_len
477 add $(7*16), p_out
478 and $(~7*16), num_bytes
479 jz .Ldo_return2\key_len
480 jmp .Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
        .if (\key_len != KEY_128)
                vmovdqa 0*16(p_keys), xkey0
                vmovdqa 4*16(p_keys), xkey4
                vmovdqa 8*16(p_keys), xkey8
                vmovdqa 12*16(p_keys), xkey12
        .else
                vmovdqa 0*16(p_keys), xkey0
                vmovdqa 3*16(p_keys), xkey4
                vmovdqa 6*16(p_keys), xkey8
                vmovdqa 9*16(p_keys), xkey12
        .endif
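        /* for 128-bit keys the "xkey4/8/12" registers hold keys 3/6/9 */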
.align 16
.Lmain_loop2\key_len:
        /* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
        do_aes_noload 8, \key_len
        add $(8*16), p_out
        sub $(8*16), num_bytes
        jne .Lmain_loop2\key_len

.Ldo_return2\key_len:
        /* return updated IV */
        vpshufb xbyteswap, xcounter, xcounter
        vmovdqu xcounter, (p_iv)
        ret
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
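
/*
 * For reference, a minimal C-side usage sketch (hypothetical caller;
 * the real glue code lives in aesni-intel_glue.c and also handles tail
 * blocks and key-length dispatch):
 *
 *	struct crypto_aes_ctx *ctx = ...;	// expanded key schedule
 *	u8 iv[16];				// big-endian counter block
 *
 *	kernel_fpu_begin();
 *	aes_ctr_enc_128_avx_by8(src, iv, ctx->key_enc, dst, nbytes);
 *	kernel_fpu_end();
 *
 * nbytes must be a multiple of 16 here; iv is updated in place so a
 * request can be continued with a further call.
 */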

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)