blob: 2df2a0298f5ad7075bc9b214438516270dc60bb5 [file] [log] [blame]
chandramouli narayanan22cddcc2014-06-10 09:22:47 -07001/*
2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
#include <linux/linkage.h>
#include <asm/inst.h>

#define CONCAT(a,b)	a##b
/* all text loads/stores are unaligned-safe */
#define VMOVDQ		vmovdqu

/* xdata0..xdata7: one keystream/cipher block per register ("by8") */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* running CTR value, kept byte-swapped (little-endian) for arithmetic */
#define xcounter	%xmm8
/* shuffle mask for the 16-byte endianness swap (byteswap_const) */
#define xbyteswap	%xmm9
/* xkey0/4/8/12 cache round keys across main-loop iterations;
 * for AES-128 they hold round keys 0/3/6/9 (see .Lmult_of_8_blks) */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* xkeyA/xkeyB: scratch for round keys that are reloaded every pass */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* SysV AMD64 argument registers of the aes_ctr_enc_*_avx_by8() calls */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
#define	DDQ(i)		CONCAT(ddq_add_,i)
#define	XMM(i)		CONCAT(%xmm, i)
#define	DDQ_DATA	0
#define	XDATA		1
/* key-length selectors passed to do_aes_ctrmain */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
102
.section .rodata
.align 16

/* shuffle mask reversing all 16 bytes: big-endian CTR <-> LE arithmetic */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits of a 128-bit counter (for vptest carry check) */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* +1 in the high 64-bit lane: propagates a carry into the upper half */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* per-block counter increments; ddq_add_i is selected via DDQ(i) */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
129.text
130
131/* generate a unique variable for ddq_add_x */
132
133.macro setddq n
134 var_ddq_add = DDQ(\n)
135.endm
136
137/* generate a unique variable for xmm register */
138.macro setxdata n
139 var_xdata = XMM(\n)
140.endm
141
142/* club the numeric 'id' to the symbol 'name' */
143
144.macro club name, id
145.altmacro
146 .if \name == DDQ_DATA
147 setddq %\id
148 .elseif \name == XDATA
149 setxdata %\id
150 .endif
151.noaltmacro
152.endm
153
/*
 * do_aes num_in_par load_keys key_len
 *	Encrypt 'b' CTR blocks in parallel (1..8) with AES-128/192/256.
 *	This increments p_in, but not p_out.
 *
 * Register contract: xkey0/xkey4/xkey8/xkey12 cache round keys across
 * invocations.  For KEY_128 they hold round keys 0/3/6/9, otherwise
 * round keys 0/4/8/12 (matching the .Lmult_of_8_blks preload).  When
 * load_keys is 0 these registers must already be valid and are never
 * clobbered here; when load_keys is 1 they are (re)loaded so that a
 * following noload pass works.  All other xmm registers are scratch.
 *
 * Fix vs. previous revision: the KEY_128 rounds 3/6/9 now use the
 * cached xkey4/xkey8/xkey12 (and rounds 4/8 use scratch xkeyB) instead
 * of unconditionally reading xkeyA/xkey8 — the noload path used stale
 * round keys and clobbered the cache on every main-loop iteration.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 keystream input = byte-swapped counter */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Derive counters for blocks 1..by-1.  vpaddq adds per-64-bit
	 * lane; if the low qword wraps to zero, propagate the carry into
	 * the high qword of both the block counter and xcounter (later
	 * blocks then inherit the incremented high half via xcounter).
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	/* advance xcounter past all 'by' blocks, again with 64-bit carry */
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0: whiten remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/* key 3 lives in the cached xkey4 for AES-128 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	add	$(16*by), p_in

	/* key 4 is scratch (xkeyB) for AES-128, cached (xkey4) otherwise */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* key 6 lives in the cached xkey8 for AES-128 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* key 8 is scratch (xkeyB) for AES-128, cached (xkey8) otherwise */
	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	/* key 9 lives in the cached xkey12 for AES-128 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 — final round for AES-128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12 — final round for AES-192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 — final round for AES-256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR keystream with the input text, two blocks at a time
	 * (p_in was already advanced, hence the negative offsets).
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd 'by': one trailing block */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the output blocks */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
403
/* expand do_aes with key loading: also (re)fills the cached key regs */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/* expand do_aes reusing the already-cached xkey0/xkey4/xkey8/xkey12 */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
411
/*
 * do_aes_ctrmain key_len
 *	Common body of the three entry points below.  Handles the
 *	leftover 1..7 blocks first (loading the key schedule as a side
 *	effect), then processes the rest 8 blocks (128 bytes) at a time.
 *	NOTE(review): the tail masking assumes num_bytes is a multiple
 *	of 16; a partial final block appears to be the caller's job —
 *	confirm against the C glue code.
 */
.macro do_aes_ctrmain key_len

	/* less than one block: nothing to do here */
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	/* load big-endian IV and keep it byte-swapped for LE arithmetic */
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = number of leftover blocks (mod 8) times 16 bytes */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1..7 leftover blocks, i.e. 16 <= tmp <= 112 (bytes) */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

/* each .LeqN case: encrypt N blocks (loading keys), advance p_out,
 * round num_bytes down to a multiple of 8 blocks, then enter the loop */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/* no partial chunk ran, so preload the cached round keys here;
	 * AES-128 caches round keys 0/3/6/9, longer keys cache 0/4/8/12 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV: swap back to big-endian and store */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
518
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
531
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)
544
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)