blob: f6e372c528eb438b6517a236315afeb1694a8002 [file] [log] [blame]
Ard Biesheuvel49788fe2014-03-21 10:19:17 +01001/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13 .text
14 .align 4
15
/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare	- setup NEON registers for encryption
 * - dec_prepare	- setup NEON registers for decryption
 * - enc_switch_key	- change to new key after having prepared for encryption
 * - encrypt_block	- encrypt a single block
 * - decrypt_block	- decrypt a single block
 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */
35
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)

/*
 * Out-of-line interleave: the N-way block routines are emitted once as
 * local subroutines and reached with 'bl'.  Because 'bl' overwrites the
 * link register, every mode function brackets its body with
 * FRAME_PUSH/FRAME_POP, which save and restore x29/x30 on the stack.
 */
#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

#if INTERLEAVE != 2 && INTERLEAVE != 4
#error INTERLEAVE should equal 2 or 4
#endif

#if INTERLEAVE == 2

	/* 2-way helpers: operate in place on v0-v1 */
aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block2x)

#else

	/* 4-way helpers: operate in place on v0-v3 */
aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block4x)

#endif

	/*
	 * Call wrappers used by the mode functions.  All four are defined
	 * regardless of INTERLEAVE; only the pair matching the emitted
	 * helpers is expanded by the code in this file.
	 */
	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#else

/*
 * Fully inline build (or no interleave at all): nothing is called with
 * 'bl', so no frame record is needed and the N-way block macros are
 * expanded directly at each use.
 */
#define FRAME_PUSH
#define FRAME_POP

	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

#endif
105
106 /*
107 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108 * int blocks, int first)
109 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110 * int blocks, int first)
111 */
112
AES_ENTRY(aes_ecb_encrypt)
	/*
	 * Register use (per the C prototype, AAPCS64 argument order):
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
	 * x5/w6 (and x6/w7 in the interleaved helpers) are handed to the
	 * block macros as temporaries.
	 */
	FRAME_PUSH
	cbz		w5, .Lecb_enc_bulk		/* first == 0: skip key setup */

	enc_prepare	w3, x2, x5

.Lecb_enc_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* enough blocks for a full batch? */
	b.mi		.Lecb_enc_tail
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	do_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.Lecb_enc_bulk
.Lecb_enc_tail:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtract */
	b.eq		.Lecb_enc_done			/* nothing left over */
#endif
.Lecb_enc_single:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecb_enc_single
.Lecb_enc_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_encrypt)
147
148
AES_ENTRY(aes_ecb_decrypt)
	/*
	 * Register use (per the C prototype, AAPCS64 argument order):
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
	 * x5/w6 (and x6/w7 in the interleaved helpers) are handed to the
	 * block macros as temporaries.
	 */
	FRAME_PUSH
	cbz		w5, .Lecb_dec_bulk		/* first == 0: skip key setup */

	dec_prepare	w3, x2, x5

.Lecb_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* enough blocks for a full batch? */
	b.mi		.Lecb_dec_tail
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.Lecb_dec_bulk
.Lecb_dec_tail:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtract */
	b.eq		.Lecb_dec_done			/* nothing left over */
#endif
.Lecb_dec_single:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecb_dec_single
.Lecb_dec_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)
183
184
185 /*
186 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187 * int blocks, u8 iv[], int first)
188 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189 * int blocks, u8 iv[], int first)
190 */
191
AES_ENTRY(aes_cbc_encrypt)
	/*
	 * Register use (per the C prototype, AAPCS64 argument order):
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
	 *   x5 = iv, w6 = first
	 * v0 carries the chaining value: the IV on a first call, afterwards
	 * the ciphertext block just produced.  When first == 0 neither the
	 * IV nor the key is reloaded, so v0 is expected to still hold the
	 * previous call's last ciphertext block.
	 * No FRAME_PUSH/FRAME_POP here: this function only uses the inline
	 * single-block macro.
	 */
	cbz		w6, .Lcbc_enc_block

	ld1		{v0.16b}, [x5]			/* get iv (before x5 becomes a temp) */
	enc_prepare	w3, x2, x5

.Lcbc_enc_block:
	ld1		{v1.16b}, [x1], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* mix with chaining value */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lcbc_enc_block
	ret
AES_ENDPROC(aes_cbc_encrypt)
207
208
AES_ENTRY(aes_cbc_decrypt)
	/*
	 * Register use (per the C prototype, AAPCS64 argument order):
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
	 *   x5 = iv, w6 = first
	 * v7 carries the chaining value (IV, then the most recent ciphertext
	 * block) from one iteration to the next.
	 */
	FRAME_PUSH
	cbz		w6, .Lcbc_dec_bulk		/* first == 0: keep v7 and key as is */

	ld1		{v7.16b}, [x5]			/* get iv (before x5 becomes a temp) */
	dec_prepare	w3, x2, x5

.Lcbc_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* enough blocks for a full batch? */
	b.mi		.Lcbc_dec_tail
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	mov		v2.16b, v0.16b			/* keep ct copies for chaining */
	mov		v3.16b, v1.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v2.16b
	mov		v7.16b, v3.16b			/* last ct is next chaining value */
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	mov		v4.16b, v0.16b			/* keep ct copies for chaining */
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	sub		x1, x1, #16			/* step back to the 4th ct block */
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.Lcbc_dec_bulk
.Lcbc_dec_tail:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtract */
	b.eq		.Lcbc_dec_done			/* nothing left over */
#endif
.Lcbc_dec_single:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x5, w6
	eor		v0.16b, v0.16b, v7.16b		/* xor with chaining value => pt */
	mov		v7.16b, v1.16b			/* ct is next chaining value */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lcbc_dec_single
.Lcbc_dec_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_cbc_decrypt)
261
262
263 /*
264 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
265 * int blocks, u8 ctr[], int first)
266 */
267
AES_ENTRY(aes_ctr_encrypt)
	/*
	 * Register use (per the C prototype, AAPCS64 argument order):
	 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
	 *   x5 = ctr, w6 = first
	 * v4 holds the 16-byte counter block.  Once it has been loaded, x5
	 * is reused for the byte-reversed second half of the counter
	 * (v4.d[1]), so it can be incremented with ordinary integer adds.
	 * On exit v4 holds the counter of the last block processed; a
	 * later call with first == 0 picks it up from there and increments
	 * it before use.
	 */
	FRAME_PUSH
	cbnz		w6, .Lctr_setup			/* 1st time around? */
	mov		x5, v4.d[1]			/* keep swabbed ctr in reg */
	rev		x5, x5
#if INTERLEAVE >= 2
	cmn		w5, w4				/* would ctr + blocks carry out of 32 bits? */
	b.cs		.Lctr_bump			/* yes: single-block path handles carries */
	add		x5, x5, #1			/* increment BE ctr */
	b		.Lctr_store_nx
#else
	b		.Lctr_bump
#endif
.Lctr_setup:
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]			/* load counter block; x5 is free after this */
	mov		x5, v4.d[1]			/* keep swabbed ctr in reg */
	rev		x5, x5
#if INTERLEAVE >= 2
	cmn		w5, w4				/* would ctr + blocks carry out of 32 bits? */
	b.cs		.Lctr_single			/* yes: single-block path handles carries */
.Lctr_bulk:
	subs		w4, w4, #INTERLEAVE		/* enough blocks for a full batch? */
	b.mi		.Lctr_tail
#if INTERLEAVE == 2
	mov		v0.8b, v4.8b			/* first half of ctr is shared */
	mov		v1.8b, v4.8b
	rev		x7, x5
	add		x5, x5, #1
	mov		v0.d[1], x7			/* block 0: ctr */
	rev		x7, x5
	add		x5, x5, #1
	mov		v1.d[1], x7			/* block 1: ctr + 1 */
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w5
	mov		v0.16b, v4.16b			/* block 0: ctr */
	add		v7.4s, v7.4s, v8.4s
	mov		v1.16b, v4.16b
	rev32		v8.16b, v7.16b			/* back to big-endian words */
	mov		v2.16b, v4.16b
	mov		v3.16b, v4.16b
	mov		v1.s[3], v8.s[0]		/* block 1: ctr + 1 */
	mov		v2.s[3], v8.s[1]		/* block 2: ctr + 2 */
	mov		v3.s[3], v8.s[2]		/* block 3: ctr + 3 */
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b
	ld1		{v5.16b}, [x1], #16		/* get 1 input block */
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x5, x5, #INTERLEAVE
#endif
	cbz		w4, .Lctr_done_nx
.Lctr_store_nx:
	rev		x7, x5				/* write next ctr back into v4 */
	mov		v4.d[1], x7
	b		.Lctr_bulk
.Lctr_done_nx:
	sub		x5, x5, #1			/* leave v4 at the last ctr used */
	rev		x7, x5
	mov		v4.d[1], x7
	b		.Lctr_done
.Lctr_tail:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtract */
	b.eq		.Lctr_done			/* nothing left over */
#endif
.Lctr_single:
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	subs		w4, w4, #1
	b.mi		.Lctr_half			/* blocks < 0 means 1/2 block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	b.eq		.Lctr_done			/* flags still from the subs above */
.Lctr_bump:
	adds		x5, x5, #1			/* increment BE ctr */
	rev		x7, x5
	mov		v4.d[1], x7
	b.cc		.Lctr_single			/* no overflow? */
	mov		x7, v4.d[0]			/* load upper word of ctr */
	rev		x7, x7				/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	mov		v4.d[0], x7
	b		.Lctr_single
.Lctr_half:
	ld1		{v3.8b}, [x1]			/* only 8 bytes in, 8 bytes out */
	eor		v3.8b, v0.8b, v3.8b
	st1		{v3.8b}, [x0]
.Lctr_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_ctr_encrypt)
	.ltorg
371
372
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 */
379
/*
 * next_tweak out, in, const, tmp
 *
 * Doubles the 128-bit value in \in, held as two 64-bit lanes, and writes
 * the result to \out:
 *   - each lane is shifted left by one bit (in + in);
 *   - the bit shifted out of each lane selects that lane of \const,
 *     which is then XORed into the *other* lane (the two halves of
 *     \tmp are swapped by the ext).
 * \out may be the same register as \in or as \const; the instruction
 * order below reads both before \out is written.  \tmp is clobbered.
 */
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63		/* all-ones where the top bit is set */
	and		\tmp\().16b, \tmp\().16b, \const\().16b	/* pick the per-lane constant */
	add		\out\().2d,  \in\().2d,   \in\().2d	/* shift each lane left by one */
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	/* swap the two lanes */
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm
387
/*
 * Constant operand for next_tweak.  It is fetched with a single 128-bit
 * scalar load ("ldr q7, .Lxts_mul_x") and must end up with lane d[0] == 1
 * and lane d[1] == 0x87.  A 128-bit scalar load follows the CPU's data
 * endianness, so on a big-endian build the two 64-bit halves have to be
 * emitted in the opposite order; emitting four 32-bit words in a fixed
 * order only produces the right register value on little-endian.
 */
.Lxts_mul_x:
#ifdef __AARCH64EB__
	.quad		0x87, 1
#else
	.quad		1, 0x87
#endif
390
AES_ENTRY(aes_xts_encrypt)
	/*
	 * Register use (per the C prototype, AAPCS64 argument order):
	 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks,
	 *   x5 = rk2, x6 = iv, w7 = first
	 * v4 = tweak for the current block, v5-v7 = tweaks for the other
	 * blocks of a batch, v7 = next_tweak constant (outside the 4-way
	 * batch, which overwrites it), v8 = scratch.
	 * On exit v4 holds the tweak of the last block processed; a later
	 * call with first == 0 advances it before use.
	 */
	FRAME_PUSH
	cbz		w7, .Lxts_enc_resume

	ld1		{v4.16b}, [x6]			/* get iv (before x6 becomes a temp) */
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	enc_switch_key	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.Lxts_enc_bulk

.Lxts_enc_resume:
	ldr		q7, .Lxts_mul_x			/* (re)load constant */
	next_tweak	v4, v4, v7, v8
.Lxts_enc_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* enough blocks for a full batch? */
	b.mi		.Lxts_enc_tail
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .Lxts_enc_done_2x
	next_tweak	v4, v5, v7, v8
	b		.Lxts_enc_bulk
.Lxts_enc_done_2x:
	mov		v4.16b, v5.16b			/* leave last tweak used in v4 */
	b		.Lxts_enc_done
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8			/* overwrites the constant */
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* leave last tweak used in v4 */
	cbz		w4, .Lxts_enc_done
	b		.Lxts_enc_resume		/* reloads v7, advances v4 */
#endif
.Lxts_enc_tail:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtract */
	b.eq		.Lxts_enc_done			/* nothing left over */
#endif
.Lxts_enc_single:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.eq		.Lxts_enc_done
	next_tweak	v4, v4, v7, v8
	b		.Lxts_enc_single
.Lxts_enc_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)
461
462
AES_ENTRY(aes_xts_decrypt)
	/*
	 * Register use (per the C prototype, AAPCS64 argument order):
	 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks,
	 *   x5 = rk2, x6 = iv, w7 = first
	 * v4 = tweak for the current block, v5-v7 = tweaks for the other
	 * blocks of a batch, v7 = next_tweak constant (outside the 4-way
	 * batch, which overwrites it), v8 = scratch.
	 * The initial tweak is produced with the *encryption* primitive
	 * under rk2; only the data blocks are decrypted (under rk1).
	 * On exit v4 holds the tweak of the last block processed; a later
	 * call with first == 0 advances it before use.
	 */
	FRAME_PUSH
	cbz		w7, .Lxts_dec_resume

	ld1		{v4.16b}, [x6]			/* get iv (before x6 becomes a temp) */
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	dec_prepare	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.Lxts_dec_bulk

.Lxts_dec_resume:
	ldr		q7, .Lxts_mul_x			/* (re)load constant */
	next_tweak	v4, v4, v7, v8
.Lxts_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* enough blocks for a full batch? */
	b.mi		.Lxts_dec_tail
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .Lxts_dec_done_2x
	next_tweak	v4, v5, v7, v8
	b		.Lxts_dec_bulk
.Lxts_dec_done_2x:
	mov		v4.16b, v5.16b			/* leave last tweak used in v4 */
	b		.Lxts_dec_done
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8			/* overwrites the constant */
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* leave last tweak used in v4 */
	cbz		w4, .Lxts_dec_done
	b		.Lxts_dec_resume		/* reloads v7, advances v4 */
#endif
.Lxts_dec_tail:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtract */
	b.eq		.Lxts_dec_done			/* nothing left over */
#endif
.Lxts_dec_single:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.eq		.Lxts_dec_done
	next_tweak	v4, v4, v7, v8
	b		.Lxts_dec_single
.Lxts_dec_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)