blob: a029bc7442442c703228eb6bdd4e91bb12ffcd52 [file] [log] [blame]
chandramouli narayanan22cddcc2014-06-10 09:22:47 -07001/*
2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
 8 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
65#include <linux/linkage.h>
66#include <asm/inst.h>
67
/* Token pasting helper and the unaligned load/store used for data I/O. */
#define CONCAT(a,b)	a##b
#define VMOVDQ		vmovdqu

/* xmm0-xmm7 hold up to eight in-flight counter/cipher blocks. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* Big-endian counter value and the byteswap shuffle mask. */
#define xcounter	%xmm8
#define xbyteswap	%xmm9
/* Cached round keys.  Note xkey3/xkey6/xkey9 alias the same registers
 * as xkey4/xkey8/xkey12 and are not referenced elsewhere in this file. */
#define xkey0		%xmm10
#define xkey3		%xmm11
#define xkey6		%xmm12
#define xkey9		%xmm13
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* Scratch registers for round keys loaded per use, and for input data. */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* SysV AMD64 argument registers of the aes_ctr_enc_*_avx_by8() entry points. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
/* Build the ddq_add_<i> / %xmm<i> symbols from a numeric id. */
#define	DDQ(i)		CONCAT(ddq_add_,i)
#define	XMM(i)		CONCAT(%xmm, i)
/* Selectors for the 'club' macro below. */
#define	DDQ_DATA	0
#define	XDATA		1
/* Key-length discriminators used by the do_aes* macros. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
105
.section .rodata
.align 16

/* Shuffle mask turning the big-endian CTR block into host order. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Mask selecting the low 64 bits of a counter block (carry detection). */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Carry: +1 into the high 64-bit half of the counter. */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* Per-block counter increments, indexed via the DDQ(i) macro. */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
131
.text

/* Bind var_ddq_add to the ddq_add_<n> constant for this iteration. */
.macro setddq n
	var_ddq_add = DDQ(\n)
.endm

/* Bind var_xdata to the %xmm<n> register for this iteration. */
.macro setxdata n
	var_xdata = XMM(\n)
.endm

/*
 * club name, id: glue the numeric 'id' onto the symbol family 'name'.
 * Uses .altmacro so the %\id argument is expanded to its numeric value
 * before being pasted by setddq/setxdata.
 */
.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
156
/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt 'b' counter blocks in parallel (b = 1..8) and XOR them with
 * the input, advancing p_in by 16*b bytes.  p_out is NOT advanced.
 * When load_keys is set, the cached round keys (xkey0/xkey4/xkey8/xkey12)
 * are (re)loaded from p_keys; otherwise they are assumed live.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* Block 0 uses the current counter value. */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1..by-1: counter + i, with explicit carry from the low
	 * 64-bit half into the high half.  vptest sets ZF when the low
	 * 64 bits are all zero, i.e. the addition wrapped.
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* Round 0: AddRoundKey, and advance xcounter by 'by' (with carry). */
	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/* Key 3 is only cached in a scratch reg; reload unless 128-bit noload. */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkeyA
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* Input is consumed 16*by bytes per call; the ciphertext XOR below
	 * addresses it with negative offsets from the advanced pointer. */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkey4
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 3 */
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey4, var_xdata, var_xdata		/* key 4 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkeyB
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 6 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkey8
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkeyA
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey8, var_xdata, var_xdata		/* key 8 */
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 9 */
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for AES-128, ordinary round otherwise */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			/* key 12: final round for AES-192, ordinary for AES-256 */
			.if (klen == KEY_256)
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for AES-256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks at a time, reusing
	 * xkeyA/xkeyB as data scratch (p_in was already advanced above).
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* Odd trailing block, if 'by' is odd. */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
406
/* Encrypt \val blocks, (re)loading the cached round keys first. */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm
410
/* Encrypt \val blocks, assuming the cached round keys are already live. */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
414
/*
 * Main body of the CTR entry points: peel off the non-multiple-of-8
 * block count first (1..7 blocks, loading keys as it goes), then run
 * the 8-blocks-at-a-time main loop with cached keys.  Finally writes
 * the updated counter back through p_iv.
 */
.macro do_aes_ctrmain key_len

	/* Nothing to do for less than one full block. */
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = number of leftover blocks (mod 8), in bytes. */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp/16 <= 7: binary dispatch on the leftover block count. */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/* No leftover blocks: do_aes_load was never run, so cache the
	 * round keys here.  For AES-128 the aliases hold keys 3/6/9. */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks and >0
	 * (NOTE(review): assumes the caller passes a 16-byte-multiple
	 * length — a sub-block tail would never reach zero here). */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
521
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
534
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)
547
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)