blob: 838dad5c209fae0f3a660e79d1f5fef8eb1f0c68 [file] [log] [blame]
Ard Biesheuvel49788fe2014-03-21 10:19:17 +01001/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13 .text
14 .align 4
15
16/*
17 * There are several ways to instantiate this code:
18 * - no interleave, all inline
19 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23 *
24 * Macros imported by this code:
25 * - enc_prepare - setup NEON registers for encryption
26 * - dec_prepare - setup NEON registers for decryption
27 * - enc_switch_key - change to new key after having prepared for encryption
28 * - encrypt_block - encrypt a single block
29 * - decrypt_block - decrypt a single block
30 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34 */
35
/*
 * Out-of-line build: INTERLEAVE is defined without INTERLEAVE_INLINE, so the
 * N-way block routines below are emitted once and reached with bl. Because bl
 * overwrites the link register x30, every mode routine that uses a
 * do_*_blockNx macro saves and restores x29/x30 via FRAME_PUSH / FRAME_POP.
 */
36#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
/* push frame record (x29, x30) with pre-decrement of sp by 16, then set x29 */
37#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
/* pop frame record (x29, x30) with post-increment of sp by 16 */
38#define FRAME_POP ldp x29, x30, [sp],#16
39
40#if INTERLEAVE == 2
41
/*
 * Subroutine wrapper around the imported encrypt_block2x macro.
 * Blocks are in v0/v1; w3 and x2 are the rounds and round-key arguments of
 * the calling mode routine. x6 and w7 are handed to the macro as well,
 * presumably as scratch registers - confirm against the macro definition.
 */
42aes_encrypt_block2x:
43 encrypt_block2x v0, v1, w3, x2, x6, w7
44 ret
45ENDPROC(aes_encrypt_block2x)
46
/* Subroutine wrapper around the imported decrypt_block2x macro (same operands). */
47aes_decrypt_block2x:
48 decrypt_block2x v0, v1, w3, x2, x6, w7
49 ret
50ENDPROC(aes_decrypt_block2x)
51
52#elif INTERLEAVE == 4
53
/* Subroutine wrapper around the imported encrypt_block4x macro; blocks in v0-v3. */
54aes_encrypt_block4x:
55 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
56 ret
57ENDPROC(aes_encrypt_block4x)
58
/* Subroutine wrapper around the imported decrypt_block4x macro; blocks in v0-v3. */
59aes_decrypt_block4x:
60 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
61 ret
62ENDPROC(aes_decrypt_block4x)
63
64#else
65#error INTERLEAVE should equal 2 or 4
66#endif
67
/*
 * Out-of-line do_* macros: each expands to a call of the matching routine
 * above. All four names are defined, but only the routines for the selected
 * INTERLEAVE value exist, so only the matching pair may be expanded.
 */
68 .macro do_encrypt_block2x
69 bl aes_encrypt_block2x
70 .endm
71
72 .macro do_decrypt_block2x
73 bl aes_decrypt_block2x
74 .endm
75
76 .macro do_encrypt_block4x
77 bl aes_encrypt_block4x
78 .endm
79
80 .macro do_decrypt_block4x
81 bl aes_decrypt_block4x
82 .endm
83
/*
 * Inline build (INTERLEAVE undefined, or INTERLEAVE_INLINE defined): the
 * do_* macros expand the imported block macros in place. No bl is issued, so
 * FRAME_PUSH and FRAME_POP expand to nothing.
 */
84#else
85#define FRAME_PUSH
86#define FRAME_POP
87
88 .macro do_encrypt_block2x
89 encrypt_block2x v0, v1, w3, x2, x6, w7
90 .endm
91
92 .macro do_decrypt_block2x
93 decrypt_block2x v0, v1, w3, x2, x6, w7
94 .endm
95
96 .macro do_encrypt_block4x
97 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
98 .endm
99
100 .macro do_decrypt_block4x
101 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
102 .endm
103
104#endif
105
106 /*
107 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108 * int blocks, int first)
109 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110 * int blocks, int first)
111 */
112
/*
 * aes_ecb_encrypt: ECB-encrypt w4 16-byte blocks from [x1] to [x0].
 *
 * Register arguments (AAPCS64 mapping of the C prototype):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
 *
 * x0 and x1 are advanced past the processed data. w4 counts down to zero.
 * x5 and w6 are passed to the imported macros, presumably as scratch
 * registers - confirm against the macro definitions.
 *
 * NOTE(review): when INTERLEAVE is not defined, the single-block loop is
 * entered without a zero check, so blocks must be at least 1 in that build -
 * confirm that callers guarantee this.
 */
113AES_ENTRY(aes_ecb_encrypt)
114 FRAME_PUSH
/* first == 0: skip enc_prepare and go straight to the data loop */
115 cbz w5, .LecbencloopNx
116
117 enc_prepare w3, x2, x5
118
119.LecbencloopNx:
120#if INTERLEAVE >= 2
/* take INTERLEAVE blocks per iteration while at least that many remain */
121 subs w4, w4, #INTERLEAVE
122 bmi .Lecbenc1x
123#if INTERLEAVE == 2
124 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
125 do_encrypt_block2x
126 st1 {v0.16b-v1.16b}, [x0], #32
127#else
128 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
129 do_encrypt_block4x
130 st1 {v0.16b-v3.16b}, [x0], #64
131#endif
132 b .LecbencloopNx
133.Lecbenc1x:
/* undo the last subtraction: w4 = leftover blocks (fewer than INTERLEAVE); done if zero */
134 adds w4, w4, #INTERLEAVE
135 beq .Lecbencout
136#endif
/* tail: one block per iteration */
137.Lecbencloop:
138 ld1 {v0.16b}, [x1], #16 /* get next pt block */
139 encrypt_block v0, w3, x2, x5, w6
140 st1 {v0.16b}, [x0], #16
141 subs w4, w4, #1
142 bne .Lecbencloop
143.Lecbencout:
144 FRAME_POP
145 ret
146AES_ENDPROC(aes_ecb_encrypt)
147
148
/*
 * aes_ecb_decrypt: ECB-decrypt w4 16-byte blocks from [x1] to [x0].
 *
 * Register arguments (AAPCS64 mapping of the C prototype):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
 *
 * Same loop structure as aes_ecb_encrypt, using dec_prepare and the decrypt
 * macros. x5 and w6 are passed to the imported macros, presumably as scratch
 * registers - confirm against the macro definitions.
 *
 * NOTE(review): when INTERLEAVE is not defined, the single-block loop is
 * entered without a zero check, so blocks must be at least 1 in that build -
 * confirm that callers guarantee this.
 */
149AES_ENTRY(aes_ecb_decrypt)
150 FRAME_PUSH
/* first == 0: skip dec_prepare and go straight to the data loop */
151 cbz w5, .LecbdecloopNx
152
153 dec_prepare w3, x2, x5
154
155.LecbdecloopNx:
156#if INTERLEAVE >= 2
/* take INTERLEAVE blocks per iteration while at least that many remain */
157 subs w4, w4, #INTERLEAVE
158 bmi .Lecbdec1x
159#if INTERLEAVE == 2
160 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
161 do_decrypt_block2x
162 st1 {v0.16b-v1.16b}, [x0], #32
163#else
164 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
165 do_decrypt_block4x
166 st1 {v0.16b-v3.16b}, [x0], #64
167#endif
168 b .LecbdecloopNx
169.Lecbdec1x:
/* undo the last subtraction: w4 = leftover blocks (fewer than INTERLEAVE); done if zero */
170 adds w4, w4, #INTERLEAVE
171 beq .Lecbdecout
172#endif
/* tail: one block per iteration */
173.Lecbdecloop:
174 ld1 {v0.16b}, [x1], #16 /* get next ct block */
175 decrypt_block v0, w3, x2, x5, w6
176 st1 {v0.16b}, [x0], #16
177 subs w4, w4, #1
178 bne .Lecbdecloop
179.Lecbdecout:
180 FRAME_POP
181 ret
182AES_ENDPROC(aes_ecb_decrypt)
183
184
185 /*
186 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187 * int blocks, u8 iv[], int first)
188 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189 * int blocks, u8 iv[], int first)
190 */
191
/*
 * aes_cbc_encrypt: CBC-encrypt w4 16-byte blocks from [x1] to [x0].
 *
 * Register arguments (AAPCS64 mapping of the C prototype):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv, w6 = first
 *
 * v0 holds the chaining value: the IV at first, then each ciphertext block.
 * The last ciphertext block is written back to [x5] on return.
 * Blocks are processed strictly one at a time and no do_*_blockNx macro is
 * used, so this routine has no FRAME_PUSH / FRAME_POP.
 *
 * NOTE(review): the loop has no zero check, so blocks must be at least 1 -
 * confirm that callers guarantee this.
 */
192AES_ENTRY(aes_cbc_encrypt)
/*
 * first == 0: skip the IV load and key setup. v0 is then used as-is,
 * presumably still holding the chaining value from the previous call -
 * confirm with the caller.
 */
193 cbz w6, .Lcbcencloop
194
195 ld1 {v0.16b}, [x5] /* get iv */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000196 enc_prepare w3, x2, x6
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100197
198.Lcbcencloop:
199 ld1 {v1.16b}, [x1], #16 /* get next pt block */
200 eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000201 encrypt_block v0, w3, x2, x6, w7
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100202 st1 {v0.16b}, [x0], #16
203 subs w4, w4, #1
204 bne .Lcbcencloop
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000205 st1 {v0.16b}, [x5] /* return iv */
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100206 ret
207AES_ENDPROC(aes_cbc_encrypt)
208
209
/*
 * aes_cbc_decrypt: CBC-decrypt w4 16-byte blocks from [x1] to [x0].
 *
 * Register arguments (AAPCS64 mapping of the C prototype):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv, w6 = first
 *
 * v7 holds the chaining value: the IV at first, then the most recent
 * ciphertext block. Blocks are decrypted in place in v0..v3, so each
 * ciphertext block is copied first because it is needed again as the XOR
 * mask for the following block. The final chaining value is written back
 * to [x5] on return.
 *
 * NOTE(review): when INTERLEAVE is not defined, the single-block loop is
 * entered without a zero check, so blocks must be at least 1 in that build -
 * confirm that callers guarantee this.
 */
210AES_ENTRY(aes_cbc_decrypt)
211 FRAME_PUSH
/*
 * first == 0: skip the IV load and key setup. v7 is then used as-is,
 * presumably still holding the chaining value from the previous call -
 * confirm with the caller.
 */
212 cbz w6, .LcbcdecloopNx
213
214 ld1 {v7.16b}, [x5] /* get iv */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000215 dec_prepare w3, x2, x6
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100216
217.LcbcdecloopNx:
218#if INTERLEAVE >= 2
/* take INTERLEAVE blocks per iteration while at least that many remain */
219 subs w4, w4, #INTERLEAVE
220 bmi .Lcbcdec1x
221#if INTERLEAVE == 2
222 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
/* keep copies of ct0 and ct1 in v2 and v3 */
223 mov v2.16b, v0.16b
224 mov v3.16b, v1.16b
225 do_decrypt_block2x
/* pt0 = D(ct0) ^ chaining value, pt1 = D(ct1) ^ ct0, next chaining value = ct1 */
226 eor v0.16b, v0.16b, v7.16b
227 eor v1.16b, v1.16b, v2.16b
228 mov v7.16b, v3.16b
229 st1 {v0.16b-v1.16b}, [x0], #32
230#else
231 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
/* keep copies of ct0..ct2 in v4..v6; ct3 is not copied, it is re-read below */
232 mov v4.16b, v0.16b
233 mov v5.16b, v1.16b
234 mov v6.16b, v2.16b
235 do_decrypt_block4x
/* step x1 back to ct3 so it can be re-read as the next chaining value */
236 sub x1, x1, #16
237 eor v0.16b, v0.16b, v7.16b
238 eor v1.16b, v1.16b, v4.16b
/*
 * v7 was consumed by the eor into v0 above, so it can now take ct3.
 * The re-read precedes the st1 below, so it still sees ciphertext even if
 * out and in refer to the same buffer.
 */
239 ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
240 eor v2.16b, v2.16b, v5.16b
241 eor v3.16b, v3.16b, v6.16b
242 st1 {v0.16b-v3.16b}, [x0], #64
243#endif
244 b .LcbcdecloopNx
245.Lcbcdec1x:
/* undo the last subtraction: w4 = leftover blocks (fewer than INTERLEAVE); done if zero */
246 adds w4, w4, #INTERLEAVE
247 beq .Lcbcdecout
248#endif
/* tail: one block per iteration */
249.Lcbcdecloop:
250 ld1 {v1.16b}, [x1], #16 /* get next ct block */
251 mov v0.16b, v1.16b /* ...and copy to v0 */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000252 decrypt_block v0, w3, x2, x6, w7
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100253 eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
254 mov v7.16b, v1.16b /* ct is next iv */
255 st1 {v0.16b}, [x0], #16
256 subs w4, w4, #1
257 bne .Lcbcdecloop
258.Lcbcdecout:
259 FRAME_POP
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000260 st1 {v7.16b}, [x5] /* return iv */
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100261 ret
262AES_ENDPROC(aes_cbc_decrypt)
263
264
265 /*
266 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
267 * int blocks, u8 ctr[], int first)
268 */
269
/*
 * aes_ctr_encrypt: CTR-mode en/decrypt from [x1] to [x0].
 *
 * Register arguments (AAPCS64 mapping of the C prototype):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr, w6 = first
 *
 * Working registers:
 *   v4 = current 16-byte counter block
 *   x8 = byte-reversed copy of v4.d[1] (counter bytes 8..15), kept in a
 *        general register so it can be incremented with integer adds
 *   x7 = scratch for byte-reversing x8 back into v4
 *
 * The counter block is written back to [x5] on the normal exit path. The
 * half-block exit (.Lctrhalfblock) handles 8 bytes and returns without
 * writing the counter back.
 */
270AES_ENTRY(aes_ctr_encrypt)
271 FRAME_PUSH
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000272 cbz w6, .Lctrnotfirst /* 1st time around? */
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100273 enc_prepare w3, x2, x6
274 ld1 {v4.16b}, [x5]
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000275
/*
 * first == 0: v4 is not reloaded and is used as-is, presumably still holding
 * the counter from the previous call - confirm with the caller.
 */
276.Lctrnotfirst:
277 umov x8, v4.d[1] /* keep swabbed ctr in reg */
278 rev x8, x8
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100279#if INTERLEAVE >= 2
/*
 * If adding the block count to the low 32 bits of the counter carries out,
 * use only the one-block loop, which propagates carries. The interleaved
 * paths below do not: the 4-way path updates only the last 32-bit word of
 * each counter block and the 2-way path uses a plain 64-bit add.
 */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000280 cmn w8, w4 /* 32 bit overflow? */
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100281 bcs .Lctrloop
282.LctrloopNx:
/* take INTERLEAVE blocks per iteration while at least that many remain */
283 subs w4, w4, #INTERLEAVE
284 bmi .Lctr1x
285#if INTERLEAVE == 2
/* v0/v1 = counter blocks n and n+1: low half copied from v4, high half from x8 */
286 mov v0.8b, v4.8b
287 mov v1.8b, v4.8b
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000288 rev x7, x8
289 add x8, x8, #1
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100290 ins v0.d[1], x7
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000291 rev x7, x8
292 add x8, x8, #1
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100293 ins v1.d[1], x7
294 ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
295 do_encrypt_block2x
/* output = keystream ^ input */
296 eor v0.16b, v0.16b, v2.16b
297 eor v1.16b, v1.16b, v3.16b
298 st1 {v0.16b-v1.16b}, [x0], #32
299#else
/*
 * The literal is materialised from the pool emitted at .ltorg after this
 * routine. It is reloaded every iteration because v8 is overwritten by the
 * rev32 below.
 */
300 ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
/* v7 = { ctr+1, ctr+2, ctr+3, ctr+0 } as 32-bit lanes, then byte-swapped per lane into v8 */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000301 dup v7.4s, w8
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100302 mov v0.16b, v4.16b
303 add v7.4s, v7.4s, v8.4s
304 mov v1.16b, v4.16b
305 rev32 v8.16b, v7.16b
306 mov v2.16b, v4.16b
307 mov v3.16b, v4.16b
/* v0 keeps the current counter; v1..v3 get ctr+1..ctr+3 in their last 32-bit word */
308 mov v1.s[3], v8.s[0]
309 mov v2.s[3], v8.s[1]
310 mov v3.s[3], v8.s[2]
/* v7 is free again here, so three input blocks are loaded into v5..v7 before encrypting */
311 ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
312 do_encrypt_block4x
313 eor v0.16b, v5.16b, v0.16b
/* v5 has been consumed above and is reused for the fourth input block */
314 ld1 {v5.16b}, [x1], #16 /* get 1 input block */
315 eor v1.16b, v6.16b, v1.16b
316 eor v2.16b, v7.16b, v2.16b
317 eor v3.16b, v5.16b, v3.16b
318 st1 {v0.16b-v3.16b}, [x0], #64
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000319 add x8, x8, #INTERLEAVE
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100320#endif
/* refresh v4 with the advanced counter; stop if the input is exactly consumed */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000321 rev x7, x8
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100322 ins v4.d[1], x7
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000323 cbz w4, .Lctrout
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100324 b .LctrloopNx
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100325.Lctr1x:
/* undo the last subtraction: w4 = leftover blocks (fewer than INTERLEAVE); done if zero */
326 adds w4, w4, #INTERLEAVE
327 beq .Lctrout
328#endif
/* one block per iteration: v0 = keystream block for the current counter */
329.Lctrloop:
330 mov v0.16b, v4.16b
331 encrypt_block v0, w3, x2, x6, w7
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000332
/* advance the counter before consuming the keystream; carry out of the low 64 bits is fixed up at .Lctrcarry */
333 adds x8, x8, #1 /* increment BE ctr */
334 rev x7, x8
335 ins v4.d[1], x7
336 bcs .Lctrcarry /* overflow? */
337
338.Lctrcarrydone:
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100339 subs w4, w4, #1
340 bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
341 ld1 {v3.16b}, [x1], #16
342 eor v3.16b, v0.16b, v3.16b
343 st1 {v3.16b}, [x0], #16
/* flags are still those of the subs above: ld1, eor and st1 do not modify them */
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000344 bne .Lctrloop
345
346.Lctrout:
347 st1 {v4.16b}, [x5] /* return next CTR value */
348 FRAME_POP
349 ret
350
/* partial final block: xor 8 input bytes with the low 8 keystream bytes, no pointer update, counter not stored */
351.Lctrhalfblock:
352 ld1 {v3.8b}, [x1]
353 eor v3.8b, v0.8b, v3.8b
354 st1 {v3.8b}, [x0]
355 FRAME_POP
356 ret
357
/* the low 64 bits of the counter wrapped: add 1 to the byte-reversed v4.d[0] as well */
358.Lctrcarry:
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100359 umov x7, v4.d[0] /* load upper word of ctr */
360 rev x7, x7 /* ... to handle the carry */
361 add x7, x7, #1
362 rev x7, x7
363 ins v4.d[0], x7
Ard Biesheuvel53bed1f2017-01-17 13:46:29 +0000364 b .Lctrcarrydone
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100365AES_ENDPROC(aes_ctr_encrypt)
/* literal pool for the ldr q8, = load used by the 4-way path above */
366 .ltorg
367
368
369 /*
370 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
371 * int blocks, u8 const rk2[], u8 iv[], int first)
372 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
373 * int blocks, u8 const rk2[], u8 iv[], int first)
374 */
375
/*
 * next_tweak out, in, const, tmp
 *
 * Derive the next XTS tweak: \out = \in shifted left by one bit across the
 * full 128 bits, with the bit shifted out of the top folded back in.
 *
 *   sshr  replicates the top bit of each 64-bit lane of \in across the lane
 *   and   masks those with \const, giving a per-lane carry term
 *   add   doubles each 64-bit lane of \in (shift left by one within the lane)
 *   ext   swaps the two 64-bit halves of \tmp so each carry term lands in the
 *         other lane
 *   eor   applies the carry terms
 *
 * With \const holding the lanes 1 and 0x87 (as loaded by the callers from
 * .Lxts_mul_x), the carry out of the low lane becomes bit 0 of the high lane
 * and the carry out of the high lane is folded into the low lane as 0x87.
 * That is multiplication by x in GF(2^128).
 *
 * \out may be the same register as \in or as \const: \const is read by the
 * and before \out is written by the add. \tmp is clobbered.
 */
376 .macro next_tweak, out, in, const, tmp
377 sshr \tmp\().2d, \in\().2d, #63
378 and \tmp\().16b, \tmp\().16b, \const\().16b
379 add \out\().2d, \in\().2d, \in\().2d
380 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
381 eor \out\().16b, \out\().16b, \tmp\().16b
382 .endm
383
/*
 * 128-bit constant loaded into v7 with ldr q7 by the XTS routines and passed
 * to next_tweak as its const operand.
 * The two quads are listed in opposite order for CPU_LE and CPU_BE,
 * presumably so the 128-bit load yields the same lane layout on either
 * endianness - confirm against the CPU_LE / CPU_BE definitions.
 */
384.Lxts_mul_x:
Ard Biesheuveleb64cbc2016-10-11 19:15:19 +0100385CPU_LE( .quad 1, 0x87 )
386CPU_BE( .quad 0x87, 1 )
Ard Biesheuvel49788fe2014-03-21 10:19:17 +0100387
/*
 * aes_xts_encrypt: XTS-encrypt w4 16-byte blocks from [x1] to [x0].
 *
 * Register arguments (AAPCS64 mapping of the C prototype):
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks,
 *   x5 = rk2, x6 = iv, w7 = first
 *
 * Working registers:
 *   v4     = tweak for the next block to process. On exit it holds the tweak
 *            of the last block processed; a later call with first == 0
 *            advances it once before use.
 *   v5-v7  = further tweaks inside the interleaved paths
 *   v7     = next_tweak constant (.Lxts_mul_x) outside the 4-way body
 *   v8     = next_tweak scratch
 *
 * The tweak is never written back to memory. x6 and w7 are reused as macro
 * operands after their argument values have been consumed, presumably as
 * scratch registers - confirm against the macro definitions.
 */
388AES_ENTRY(aes_xts_encrypt)
389 FRAME_PUSH
/* first == 0: continue from the tweak left in v4 (advanced at .LxtsencloopNx) */
390 cbz w7, .LxtsencloopNx
391
/* first call: the initial tweak is the IV encrypted with rk2 (x5); then switch to rk1 (x2) */
392 ld1 {v4.16b}, [x6]
393 enc_prepare w3, x5, x6
394 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
395 enc_switch_key w3, x2, x6
396 ldr q7, .Lxts_mul_x
397 b .LxtsencNx
398
/* reload the constant (the 4-way body overwrites v7) and step v4 to the next tweak */
399.LxtsencloopNx:
400 ldr q7, .Lxts_mul_x
401 next_tweak v4, v4, v7, v8
402.LxtsencNx:
403#if INTERLEAVE >= 2
/* take INTERLEAVE blocks per iteration while at least that many remain */
404 subs w4, w4, #INTERLEAVE
405 bmi .Lxtsenc1x
406#if INTERLEAVE == 2
407 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
/* v5 = tweak for the second block; xor tweaks in before and after encryption */
408 next_tweak v5, v4, v7, v8
409 eor v0.16b, v0.16b, v4.16b
410 eor v1.16b, v1.16b, v5.16b
411 do_encrypt_block2x
412 eor v0.16b, v0.16b, v4.16b
413 eor v1.16b, v1.16b, v5.16b
414 st1 {v0.16b-v1.16b}, [x0], #32
415 cbz w4, .LxtsencoutNx
/* more input: v4 = tweak following v5 */
416 next_tweak v4, v5, v7, v8
417 b .LxtsencNx
418.LxtsencoutNx:
/* input exhausted: leave the last tweak used in v4 */
419 mov v4.16b, v5.16b
420 b .Lxtsencout
421#else
422 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
/* v5, v6 = tweaks for blocks 1 and 2 */
423 next_tweak v5, v4, v7, v8
424 eor v0.16b, v0.16b, v4.16b
425 next_tweak v6, v5, v7, v8
426 eor v1.16b, v1.16b, v5.16b
427 eor v2.16b, v2.16b, v6.16b
/* the tweak for block 3 replaces the constant in v7; the constant is reloaded at .LxtsencloopNx */
428 next_tweak v7, v6, v7, v8
429 eor v3.16b, v3.16b, v7.16b
430 do_encrypt_block4x
431 eor v3.16b, v3.16b, v7.16b
432 eor v0.16b, v0.16b, v4.16b
433 eor v1.16b, v1.16b, v5.16b
434 eor v2.16b, v2.16b, v6.16b
435 st1 {v0.16b-v3.16b}, [x0], #64
/* v4 = last tweak used; .LxtsencloopNx advances it if more input remains */
436 mov v4.16b, v7.16b
437 cbz w4, .Lxtsencout
438 b .LxtsencloopNx
439#endif
440.Lxtsenc1x:
/* undo the last subtraction: w4 = leftover blocks (fewer than INTERLEAVE); done if zero */
441 adds w4, w4, #INTERLEAVE
442 beq .Lxtsencout
443#endif
/* tail: one block per iteration, v4 = tweak of the current block */
444.Lxtsencloop:
445 ld1 {v1.16b}, [x1], #16
446 eor v0.16b, v1.16b, v4.16b
447 encrypt_block v0, w3, x2, x6, w7
448 eor v0.16b, v0.16b, v4.16b
449 st1 {v0.16b}, [x0], #16
450 subs w4, w4, #1
/* exit before advancing so v4 still holds the tweak of the last block */
451 beq .Lxtsencout
452 next_tweak v4, v4, v7, v8
453 b .Lxtsencloop
454.Lxtsencout:
455 FRAME_POP
456 ret
457AES_ENDPROC(aes_xts_encrypt)
458
459
/*
 * aes_xts_decrypt: XTS-decrypt w4 16-byte blocks from [x1] to [x0].
 *
 * Register arguments (AAPCS64 mapping of the C prototype):
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks,
 *   x5 = rk2, x6 = iv, w7 = first
 *
 * Working registers:
 *   v4     = tweak for the next block to process. On exit it holds the tweak
 *            of the last block processed; a later call with first == 0
 *            advances it once before use.
 *   v5-v7  = further tweaks inside the interleaved paths
 *   v7     = next_tweak constant (.Lxts_mul_x) outside the 4-way body
 *   v8     = next_tweak scratch
 *
 * The initial tweak is produced with the encryption macros even though the
 * data blocks are decrypted. The tweak is never written back to memory.
 * x6 and w7 are reused as macro operands after their argument values have
 * been consumed, presumably as scratch registers - confirm against the macro
 * definitions.
 */
460AES_ENTRY(aes_xts_decrypt)
461 FRAME_PUSH
/* first == 0: continue from the tweak left in v4 (advanced at .LxtsdecloopNx) */
462 cbz w7, .LxtsdecloopNx
463
/* first call: the initial tweak is the IV encrypted with rk2 (x5); then set up decryption with rk1 (x2) */
464 ld1 {v4.16b}, [x6]
465 enc_prepare w3, x5, x6
466 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
467 dec_prepare w3, x2, x6
468 ldr q7, .Lxts_mul_x
469 b .LxtsdecNx
470
/* reload the constant (the 4-way body overwrites v7) and step v4 to the next tweak */
471.LxtsdecloopNx:
472 ldr q7, .Lxts_mul_x
473 next_tweak v4, v4, v7, v8
474.LxtsdecNx:
475#if INTERLEAVE >= 2
/* take INTERLEAVE blocks per iteration while at least that many remain */
476 subs w4, w4, #INTERLEAVE
477 bmi .Lxtsdec1x
478#if INTERLEAVE == 2
479 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
/* v5 = tweak for the second block; xor tweaks in before and after decryption */
480 next_tweak v5, v4, v7, v8
481 eor v0.16b, v0.16b, v4.16b
482 eor v1.16b, v1.16b, v5.16b
483 do_decrypt_block2x
484 eor v0.16b, v0.16b, v4.16b
485 eor v1.16b, v1.16b, v5.16b
486 st1 {v0.16b-v1.16b}, [x0], #32
487 cbz w4, .LxtsdecoutNx
/* more input: v4 = tweak following v5 */
488 next_tweak v4, v5, v7, v8
489 b .LxtsdecNx
490.LxtsdecoutNx:
/* input exhausted: leave the last tweak used in v4 */
491 mov v4.16b, v5.16b
492 b .Lxtsdecout
493#else
494 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
/* v5, v6 = tweaks for blocks 1 and 2 */
495 next_tweak v5, v4, v7, v8
496 eor v0.16b, v0.16b, v4.16b
497 next_tweak v6, v5, v7, v8
498 eor v1.16b, v1.16b, v5.16b
499 eor v2.16b, v2.16b, v6.16b
/* the tweak for block 3 replaces the constant in v7; the constant is reloaded at .LxtsdecloopNx */
500 next_tweak v7, v6, v7, v8
501 eor v3.16b, v3.16b, v7.16b
502 do_decrypt_block4x
503 eor v3.16b, v3.16b, v7.16b
504 eor v0.16b, v0.16b, v4.16b
505 eor v1.16b, v1.16b, v5.16b
506 eor v2.16b, v2.16b, v6.16b
507 st1 {v0.16b-v3.16b}, [x0], #64
/* v4 = last tweak used; .LxtsdecloopNx advances it if more input remains */
508 mov v4.16b, v7.16b
509 cbz w4, .Lxtsdecout
510 b .LxtsdecloopNx
511#endif
512.Lxtsdec1x:
/* undo the last subtraction: w4 = leftover blocks (fewer than INTERLEAVE); done if zero */
513 adds w4, w4, #INTERLEAVE
514 beq .Lxtsdecout
515#endif
/* tail: one block per iteration, v4 = tweak of the current block */
516.Lxtsdecloop:
517 ld1 {v1.16b}, [x1], #16
518 eor v0.16b, v1.16b, v4.16b
519 decrypt_block v0, w3, x2, x6, w7
520 eor v0.16b, v0.16b, v4.16b
521 st1 {v0.16b}, [x0], #16
522 subs w4, w4, #1
/* exit before advancing so v4 still holds the tweak of the last block */
523 beq .Lxtsdecout
524 next_tweak v4, v4, v7, v8
525 b .Lxtsdecloop
526.Lxtsdecout:
527 FRAME_POP
528 ret
529AES_ENDPROC(aes_xts_decrypt)