/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text

        rounds          .req    x11
        bskey           .req    x12

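        /*
         * in_bs_ch/out_bs_ch apply the linear input and output transforms of
         * the bit sliced S-box to the eight bit slices, and inv_in_bs_ch/
         * inv_out_bs_ch are the corresponding transforms for the inverse
         * S-box (note the permuted parameter order), following the
         * construction in the paper referenced above.
         */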
        .macro          in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor             \b2, \b2, \b1
        eor             \b5, \b5, \b6
        eor             \b3, \b3, \b0
        eor             \b6, \b6, \b2
        eor             \b5, \b5, \b0
        eor             \b6, \b6, \b3
        eor             \b3, \b3, \b7
        eor             \b7, \b7, \b5
        eor             \b3, \b3, \b4
        eor             \b4, \b4, \b5
        eor             \b2, \b2, \b7
        eor             \b3, \b3, \b1
        eor             \b1, \b1, \b5
        .endm

        .macro          out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor             \b0, \b0, \b6
        eor             \b1, \b1, \b4
        eor             \b4, \b4, \b6
        eor             \b2, \b2, \b0
        eor             \b6, \b6, \b1
        eor             \b1, \b1, \b5
        eor             \b5, \b5, \b3
        eor             \b3, \b3, \b7
        eor             \b7, \b7, \b5
        eor             \b2, \b2, \b5
        eor             \b4, \b4, \b7
        .endm

        .macro          inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
        eor             \b1, \b1, \b7
        eor             \b4, \b4, \b7
        eor             \b7, \b7, \b5
        eor             \b1, \b1, \b3
        eor             \b2, \b2, \b5
        eor             \b3, \b3, \b7
        eor             \b6, \b6, \b1
        eor             \b2, \b2, \b0
        eor             \b5, \b5, \b3
        eor             \b4, \b4, \b6
        eor             \b0, \b0, \b6
        eor             \b1, \b1, \b4
        .endm

        .macro          inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
        eor             \b1, \b1, \b5
        eor             \b2, \b2, \b7
        eor             \b3, \b3, \b1
        eor             \b4, \b4, \b5
        eor             \b7, \b7, \b5
        eor             \b3, \b3, \b4
        eor             \b5, \b5, \b0
        eor             \b3, \b3, \b7
        eor             \b6, \b6, \b2
        eor             \b2, \b2, \b1
        eor             \b6, \b6, \b3
        eor             \b3, \b3, \b0
        eor             \b5, \b5, \b6
        .endm

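        /*
         * Tower field arithmetic for the shared nonlinear core of the S-box
         * and its inverse: mul_gf4 multiplies in GF(2^2), mul_gf4_n_gf4
         * performs two such multiplications with shared temporaries,
         * mul_gf16_2 performs two GF(2^4) multiplications, and inv_gf256
         * combines them to invert elements of GF(2^8), one bit per register,
         * i.e., 128 S-box evaluations in parallel.
         */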
        .macro          mul_gf4, x0, x1, y0, y1, t0, t1
        eor             \t0, \y0, \y1
        and             \t0, \t0, \x0
        eor             \x0, \x0, \x1
        and             \t1, \x1, \y0
        and             \x0, \x0, \y1
        eor             \x1, \t1, \t0
        eor             \x0, \x0, \t1
        .endm

        .macro          mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
        eor             \t0, \y0, \y1
        eor             \t1, \y2, \y3
        and             \t0, \t0, \x0
        and             \t1, \t1, \x2
        eor             \x0, \x0, \x1
        eor             \x2, \x2, \x3
        and             \x1, \x1, \y0
        and             \x3, \x3, \y2
        and             \x0, \x0, \y1
        and             \x2, \x2, \y3
        eor             \x1, \x1, \x0
        eor             \x2, \x2, \x3
        eor             \x0, \x0, \t0
        eor             \x3, \x3, \t1
        .endm

        .macro          mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
                                    y0, y1, y2, y3, t0, t1, t2, t3
        eor             \t0, \x0, \x2
        eor             \t1, \x1, \x3
        mul_gf4         \x0, \x1, \y0, \y1, \t2, \t3
        eor             \y0, \y0, \y2
        eor             \y1, \y1, \y3
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
        eor             \x0, \x0, \t0
        eor             \x2, \x2, \t0
        eor             \x1, \x1, \t1
        eor             \x3, \x3, \t1
        eor             \t0, \x4, \x6
        eor             \t1, \x5, \x7
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
        eor             \y0, \y0, \y2
        eor             \y1, \y1, \y3
        mul_gf4         \x4, \x5, \y0, \y1, \t2, \t3
        eor             \x4, \x4, \t0
        eor             \x6, \x6, \t0
        eor             \x5, \x5, \t1
        eor             \x7, \x7, \t1
        .endm

        .macro          inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
                                   t0, t1, t2, t3, s0, s1, s2, s3
        eor             \t3, \x4, \x6
        eor             \t0, \x5, \x7
        eor             \t1, \x1, \x3
        eor             \s1, \x7, \x6
        eor             \s0, \x0, \x2
        eor             \s3, \t3, \t0
        orr             \t2, \t0, \t1
        and             \s2, \t3, \s0
        orr             \t3, \t3, \s0
        eor             \s0, \s0, \t1
        and             \t0, \t0, \t1
        eor             \t1, \x3, \x2
        and             \s3, \s3, \s0
        and             \s1, \s1, \t1
        eor             \t1, \x4, \x5
        eor             \s0, \x1, \x0
        eor             \t3, \t3, \s1
        eor             \t2, \t2, \s1
        and             \s1, \t1, \s0
        orr             \t1, \t1, \s0
        eor             \t3, \t3, \s3
        eor             \t0, \t0, \s1
        eor             \t2, \t2, \s2
        eor             \t1, \t1, \s3
        eor             \t0, \t0, \s2
        and             \s0, \x7, \x3
        eor             \t1, \t1, \s2
        and             \s1, \x6, \x2
        and             \s2, \x5, \x1
        orr             \s3, \x4, \x0
        eor             \t3, \t3, \s0
        eor             \t1, \t1, \s2
        eor             \s0, \t0, \s3
        eor             \t2, \t2, \s1
        and             \s2, \t3, \t1
        eor             \s1, \t2, \s2
        eor             \s3, \s0, \s2
        bsl             \s1, \t1, \s0
        not             \t0, \s0
        bsl             \s0, \s1, \s3
        bsl             \t0, \s1, \s3
        bsl             \s3, \t3, \t2
        eor             \t3, \t3, \t2
        and             \s2, \s0, \s3
        eor             \t1, \t1, \t0
        eor             \s2, \s2, \t3
        mul_gf16_2      \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
        .endm

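        /*
         * Full bit sliced S-box and inverse S-box: change into the
         * inverter's input basis, run the shared GF(2^8) inversion and
         * change back into the output basis. The result comes out with the
         * registers permuted, which the callers account for.
         */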
        .macro          sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                              t0, t1, t2, t3, s0, s1, s2, s3
        in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
                        \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
        .endm

        .macro          inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                                  t0, t1, t2, t3, s0, s1, s2, s3
        inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
                        \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
        .endm

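        /*
         * Round key handling: each round key occupies eight 16 byte vectors
         * (128 bytes) in the bit sliced schedule produced by
         * aesbs_convert_key. enc_next_rk walks the schedule forwards,
         * dec_next_rk walks it backwards, and add_round_key XORs the eight
         * key vectors into the eight state registers.
         */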
        .macro          enc_next_rk
        ldp             q16, q17, [bskey], #128
        ldp             q18, q19, [bskey, #-96]
        ldp             q20, q21, [bskey, #-64]
        ldp             q22, q23, [bskey, #-32]
        .endm

        .macro          dec_next_rk
        ldp             q16, q17, [bskey, #-128]!
        ldp             q18, q19, [bskey, #32]
        ldp             q20, q21, [bskey, #64]
        ldp             q22, q23, [bskey, #96]
        .endm

        .macro          add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
        eor             \x0\().16b, \x0\().16b, v16.16b
        eor             \x1\().16b, \x1\().16b, v17.16b
        eor             \x2\().16b, \x2\().16b, v18.16b
        eor             \x3\().16b, \x3\().16b, v19.16b
        eor             \x4\().16b, \x4\().16b, v20.16b
        eor             \x5\().16b, \x5\().16b, v21.16b
        eor             \x6\().16b, \x6\().16b, v22.16b
        eor             \x7\().16b, \x7\().16b, v23.16b
        .endm

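        /*
         * ShiftRows is a pure byte permutation, so it is applied to each bit
         * slice with a single tbl per register, using one of the SR/SRM0/
         * ISR/ISRM0 permutation constants defined below.
         */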
        .macro          shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
        tbl             \x0\().16b, {\x0\().16b}, \mask\().16b
        tbl             \x1\().16b, {\x1\().16b}, \mask\().16b
        tbl             \x2\().16b, {\x2\().16b}, \mask\().16b
        tbl             \x3\().16b, {\x3\().16b}, \mask\().16b
        tbl             \x4\().16b, {\x4\().16b}, \mask\().16b
        tbl             \x5\().16b, {\x5\().16b}, \mask\().16b
        tbl             \x6\().16b, {\x6\().16b}, \mask\().16b
        tbl             \x7\().16b, {\x7\().16b}, \mask\().16b
        .endm

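        /*
         * MixColumns on the bit sliced state: multiplying by x is just a
         * renumbering of the slices plus an XOR for the reduction, and the
         * column rotations are performed with ext on each slice, so the
         * whole transform is ext/eor only. The \inv argument selects the
         * output register assignment used on the decryption path (see
         * inv_mix_cols below).
         */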
        .macro          mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                                  t0, t1, t2, t3, t4, t5, t6, t7, inv
        ext             \t0\().16b, \x0\().16b, \x0\().16b, #12
        ext             \t1\().16b, \x1\().16b, \x1\().16b, #12
        eor             \x0\().16b, \x0\().16b, \t0\().16b
        ext             \t2\().16b, \x2\().16b, \x2\().16b, #12
        eor             \x1\().16b, \x1\().16b, \t1\().16b
        ext             \t3\().16b, \x3\().16b, \x3\().16b, #12
        eor             \x2\().16b, \x2\().16b, \t2\().16b
        ext             \t4\().16b, \x4\().16b, \x4\().16b, #12
        eor             \x3\().16b, \x3\().16b, \t3\().16b
        ext             \t5\().16b, \x5\().16b, \x5\().16b, #12
        eor             \x4\().16b, \x4\().16b, \t4\().16b
        ext             \t6\().16b, \x6\().16b, \x6\().16b, #12
        eor             \x5\().16b, \x5\().16b, \t5\().16b
        ext             \t7\().16b, \x7\().16b, \x7\().16b, #12
        eor             \x6\().16b, \x6\().16b, \t6\().16b
        eor             \t1\().16b, \t1\().16b, \x0\().16b
        eor             \x7\().16b, \x7\().16b, \t7\().16b
        ext             \x0\().16b, \x0\().16b, \x0\().16b, #8
        eor             \t2\().16b, \t2\().16b, \x1\().16b
        eor             \t0\().16b, \t0\().16b, \x7\().16b
        eor             \t1\().16b, \t1\().16b, \x7\().16b
        ext             \x1\().16b, \x1\().16b, \x1\().16b, #8
        eor             \t5\().16b, \t5\().16b, \x4\().16b
        eor             \x0\().16b, \x0\().16b, \t0\().16b
        eor             \t6\().16b, \t6\().16b, \x5\().16b
        eor             \x1\().16b, \x1\().16b, \t1\().16b
        ext             \t0\().16b, \x4\().16b, \x4\().16b, #8
        eor             \t4\().16b, \t4\().16b, \x3\().16b
        ext             \t1\().16b, \x5\().16b, \x5\().16b, #8
        eor             \t7\().16b, \t7\().16b, \x6\().16b
        ext             \x4\().16b, \x3\().16b, \x3\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x2\().16b
        ext             \x5\().16b, \x7\().16b, \x7\().16b, #8
        eor             \t4\().16b, \t4\().16b, \x7\().16b
        ext             \x3\().16b, \x6\().16b, \x6\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x7\().16b
        ext             \x6\().16b, \x2\().16b, \x2\().16b, #8
        eor             \x7\().16b, \t1\().16b, \t5\().16b
        .ifb            \inv
        eor             \x2\().16b, \t0\().16b, \t4\().16b
        eor             \x4\().16b, \x4\().16b, \t3\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x3\().16b, \x3\().16b, \t6\().16b
        eor             \x6\().16b, \x6\().16b, \t2\().16b
        .else
        eor             \t3\().16b, \t3\().16b, \x4\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x2\().16b, \x3\().16b, \t6\().16b
        eor             \x3\().16b, \t0\().16b, \t4\().16b
        eor             \x4\().16b, \x6\().16b, \t2\().16b
        mov             \x6\().16b, \t3\().16b
        .endif
        .endm

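        /*
         * Inverse MixColumns: InvMixColumns can be factored into a simple
         * preprocessing step followed by the forward MixColumns, so the
         * ext/eor block below performs that preprocessing and the result is
         * then fed through mix_cols with \inv set.
         */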
        .macro          inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                                      t0, t1, t2, t3, t4, t5, t6, t7
        ext             \t0\().16b, \x0\().16b, \x0\().16b, #8
        ext             \t6\().16b, \x6\().16b, \x6\().16b, #8
        ext             \t7\().16b, \x7\().16b, \x7\().16b, #8
        eor             \t0\().16b, \t0\().16b, \x0\().16b
        ext             \t1\().16b, \x1\().16b, \x1\().16b, #8
        eor             \t6\().16b, \t6\().16b, \x6\().16b
        ext             \t2\().16b, \x2\().16b, \x2\().16b, #8
        eor             \t7\().16b, \t7\().16b, \x7\().16b
        ext             \t3\().16b, \x3\().16b, \x3\().16b, #8
        eor             \t1\().16b, \t1\().16b, \x1\().16b
        ext             \t4\().16b, \x4\().16b, \x4\().16b, #8
        eor             \t2\().16b, \t2\().16b, \x2\().16b
        ext             \t5\().16b, \x5\().16b, \x5\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x3\().16b
        eor             \t4\().16b, \t4\().16b, \x4\().16b
        eor             \t5\().16b, \t5\().16b, \x5\().16b
        eor             \x0\().16b, \x0\().16b, \t6\().16b
        eor             \x1\().16b, \x1\().16b, \t6\().16b
        eor             \x2\().16b, \x2\().16b, \t0\().16b
        eor             \x4\().16b, \x4\().16b, \t2\().16b
        eor             \x3\().16b, \x3\().16b, \t1\().16b
        eor             \x1\().16b, \x1\().16b, \t7\().16b
        eor             \x2\().16b, \x2\().16b, \t7\().16b
        eor             \x4\().16b, \x4\().16b, \t6\().16b
        eor             \x5\().16b, \x5\().16b, \t3\().16b
        eor             \x3\().16b, \x3\().16b, \t6\().16b
        eor             \x6\().16b, \x6\().16b, \t4\().16b
        eor             \x4\().16b, \x4\().16b, \t7\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x7\().16b, \x7\().16b, \t5\().16b
        mix_cols        \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
        .endm

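        /*
         * swapmove_2x is the classic SWAPMOVE bit matrix transpose step,
         * applied to two register pairs at once:
         *
         *      t = ((b >> n) ^ a) & mask;
         *      a ^= t;
         *      b ^= (t << n);
         *
         * i.e., it swaps the bits of a selected by mask with the bits of b
         * selected by (mask << n).
         */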
        .macro          swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
        ushr            \t0\().2d, \b0\().2d, #\n
        ushr            \t1\().2d, \b1\().2d, #\n
        eor             \t0\().16b, \t0\().16b, \a0\().16b
        eor             \t1\().16b, \t1\().16b, \a1\().16b
        and             \t0\().16b, \t0\().16b, \mask\().16b
        and             \t1\().16b, \t1\().16b, \mask\().16b
        eor             \a0\().16b, \a0\().16b, \t0\().16b
        shl             \t0\().2d, \t0\().2d, #\n
        eor             \a1\().16b, \a1\().16b, \t1\().16b
        shl             \t1\().2d, \t1\().2d, #\n
        eor             \b0\().16b, \b0\().16b, \t0\().16b
        eor             \b1\().16b, \b1\().16b, \t1\().16b
        .endm

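        /*
         * Move the eight blocks held in x0..x7 into bit sliced form, i.e.,
         * transpose the 8x8 bit matrix formed by taking one byte from each
         * register, using three swapmove passes. The same macro converts
         * back again, as the transpose is its own inverse.
         */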
        .macro          bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
        movi            \t0\().16b, #0x55
        movi            \t1\().16b, #0x33
        swapmove_2x     \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
        swapmove_2x     \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
        movi            \t0\().16b, #0x0f
        swapmove_2x     \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
        swapmove_2x     \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
        swapmove_2x     \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
        swapmove_2x     \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
        .endm

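        /*
         * Permutation vectors for tbl: M0 rearranges the bytes of a block
         * into the layout used by the bit slicing code (applied to the round
         * keys by aesbs_convert_key), M0SR/M0ISR apply that layout to the
         * input blocks combined with the initial (inverse) ShiftRows,
         * SR/ISR are the per round (inverse) ShiftRows, and SRM0/ISRM0 are
         * the last round variants that also restore the normal byte order.
         */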
        .align          6
M0:     .octa           0x0004080c0105090d02060a0e03070b0f

M0SR:   .octa           0x0004080c05090d010a0e02060f03070b
SR:     .octa           0x0f0e0d0c0a09080b0504070600030201
SRM0:   .octa           0x01060b0c0207080d0304090e00050a0f

M0ISR:  .octa           0x0004080c0d0105090a0e0206070b0f03
ISR:    .octa           0x0f0e0d0c080b0a090504070602010003
ISRM0:  .octa           0x0306090c00070a0d01040b0e0205080f

        /*
         * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
         */
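        /*
         * Convert a standard AES key schedule into the bit sliced form used
         * by the routines below: round 0 is stored as is, each subsequent
         * round key is permuted with M0 and expanded with cmtst into eight
         * byte masks, one per bit. Complementing slices 0, 1, 5 and 6 (and
         * XORing 0x63 into the unexpanded final round key) folds the S-box
         * affine constant 0x63 (bits 0, 1, 5, 6) into the key schedule, so
         * the bit sliced S-box can omit it.
         */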
ENTRY(aesbs_convert_key)
        ld1             {v7.4s}, [x1], #16              // load round 0 key
        ld1             {v17.4s}, [x1], #16             // load round 1 key

        movi            v8.16b, #0x01                   // bit masks
        movi            v9.16b, #0x02
        movi            v10.16b, #0x04
        movi            v11.16b, #0x08
        movi            v12.16b, #0x10
        movi            v13.16b, #0x20
        movi            v14.16b, #0x40
        movi            v15.16b, #0x80
        ldr             q16, M0

        sub             x2, x2, #1
        str             q7, [x0], #16                   // save round 0 key

.Lkey_loop:
        tbl             v7.16b, {v17.16b}, v16.16b
        ld1             {v17.4s}, [x1], #16             // load next round key

        cmtst           v0.16b, v7.16b, v8.16b
        cmtst           v1.16b, v7.16b, v9.16b
        cmtst           v2.16b, v7.16b, v10.16b
        cmtst           v3.16b, v7.16b, v11.16b
        cmtst           v4.16b, v7.16b, v12.16b
        cmtst           v5.16b, v7.16b, v13.16b
        cmtst           v6.16b, v7.16b, v14.16b
        cmtst           v7.16b, v7.16b, v15.16b
        not             v0.16b, v0.16b
        not             v1.16b, v1.16b
        not             v5.16b, v5.16b
        not             v6.16b, v6.16b

        subs            x2, x2, #1
        stp             q0, q1, [x0], #128
        stp             q2, q3, [x0, #-96]
        stp             q4, q5, [x0, #-64]
        stp             q6, q7, [x0, #-32]
        b.ne            .Lkey_loop

        movi            v7.16b, #0x63                   // compose .L63
        eor             v17.16b, v17.16b, v7.16b
        str             q17, [x0]
        ret
ENDPROC(aesbs_convert_key)

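        /*
         * aesbs_encrypt8: encrypt the eight AES blocks passed in v0-v7,
         * using the bit sliced key schedule at bskey and the round count in
         * rounds. The results come back in the permuted register order
         * v0, v1, v4, v6, v3, v7, v2, v5, which the callers expect.
         */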
        .align          4
aesbs_encrypt8:
        ldr             q9, [bskey], #16                // round 0 key
        ldr             q8, M0SR
        ldr             q24, SR

        eor             v10.16b, v0.16b, v9.16b         // xor with round 0 key
        eor             v11.16b, v1.16b, v9.16b
        tbl             v0.16b, {v10.16b}, v8.16b
        eor             v12.16b, v2.16b, v9.16b
        tbl             v1.16b, {v11.16b}, v8.16b
        eor             v13.16b, v3.16b, v9.16b
        tbl             v2.16b, {v12.16b}, v8.16b
        eor             v14.16b, v4.16b, v9.16b
        tbl             v3.16b, {v13.16b}, v8.16b
        eor             v15.16b, v5.16b, v9.16b
        tbl             v4.16b, {v14.16b}, v8.16b
        eor             v10.16b, v6.16b, v9.16b
        tbl             v5.16b, {v15.16b}, v8.16b
        eor             v11.16b, v7.16b, v9.16b
        tbl             v6.16b, {v10.16b}, v8.16b
        tbl             v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub             rounds, rounds, #1
        b               .Lenc_sbox

.Lenc_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
        sbox            v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                        v13, v14, v15
        subs            rounds, rounds, #1
        b.cc            .Lenc_done

        enc_next_rk

        mix_cols        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        add_round_key   v0, v1, v2, v3, v4, v5, v6, v7

        b.ne            .Lenc_loop
        ldr             q24, SRM0
        b               .Lenc_loop

.Lenc_done:
        ldr             q12, [bskey]                    // last round key

        bitslice        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

        eor             v0.16b, v0.16b, v12.16b
        eor             v1.16b, v1.16b, v12.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v6.16b, v6.16b, v12.16b
        eor             v3.16b, v3.16b, v12.16b
        eor             v7.16b, v7.16b, v12.16b
        eor             v2.16b, v2.16b, v12.16b
        eor             v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_encrypt8)

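        /*
         * aesbs_decrypt8: the decryption counterpart of aesbs_encrypt8. It
         * seeks to the end of the bit sliced key schedule first, walks it
         * backwards via dec_next_rk, and returns the results in the
         * register order v0, v1, v6, v4, v2, v7, v3, v5.
         */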
        .align          4
aesbs_decrypt8:
        lsl             x9, rounds, #7
        add             bskey, bskey, x9

        ldr             q9, [bskey, #-112]!             // round 0 key
        ldr             q8, M0ISR
        ldr             q24, ISR

        eor             v10.16b, v0.16b, v9.16b         // xor with round 0 key
        eor             v11.16b, v1.16b, v9.16b
        tbl             v0.16b, {v10.16b}, v8.16b
        eor             v12.16b, v2.16b, v9.16b
        tbl             v1.16b, {v11.16b}, v8.16b
        eor             v13.16b, v3.16b, v9.16b
        tbl             v2.16b, {v12.16b}, v8.16b
        eor             v14.16b, v4.16b, v9.16b
        tbl             v3.16b, {v13.16b}, v8.16b
        eor             v15.16b, v5.16b, v9.16b
        tbl             v4.16b, {v14.16b}, v8.16b
        eor             v10.16b, v6.16b, v9.16b
        tbl             v5.16b, {v15.16b}, v8.16b
        eor             v11.16b, v7.16b, v9.16b
        tbl             v6.16b, {v10.16b}, v8.16b
        tbl             v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub             rounds, rounds, #1
        b               .Ldec_sbox

.Ldec_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
        inv_sbox        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                        v13, v14, v15
        subs            rounds, rounds, #1
        b.cc            .Ldec_done

        dec_next_rk

        add_round_key   v0, v1, v6, v4, v2, v7, v3, v5

        inv_mix_cols    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        b.ne            .Ldec_loop
        ldr             q24, ISRM0
        b               .Ldec_loop
.Ldec_done:
        ldr             q12, [bskey, #-16]              // last round key

        bitslice        v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

        eor             v0.16b, v0.16b, v12.16b
        eor             v1.16b, v1.16b, v12.16b
        eor             v6.16b, v6.16b, v12.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v2.16b, v2.16b, v12.16b
        eor             v7.16b, v7.16b, v12.16b
        eor             v3.16b, v3.16b, v12.16b
        eor             v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_decrypt8)

        /*
         * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         */
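        /*
         * The block count in w4 may be anything from 1 up. x5 is set to
         * (1 << blocks) so that tbnz on bits 1-7 can skip the loads and
         * stores for blocks that are not there; if eight or more blocks
         * remain, x5 is forced to 0, the full set of eight is processed,
         * and the code loops back to 99b until w4 is exhausted.
         */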
        .macro          __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

99:     mov             x5, #1
        lsl             x5, x5, x4
        subs            w4, w4, #8
        csel            x4, x4, xzr, pl
        csel            x5, x5, xzr, mi

        ld1             {v0.16b}, [x1], #16
        tbnz            x5, #1, 0f
        ld1             {v1.16b}, [x1], #16
        tbnz            x5, #2, 0f
        ld1             {v2.16b}, [x1], #16
        tbnz            x5, #3, 0f
        ld1             {v3.16b}, [x1], #16
        tbnz            x5, #4, 0f
        ld1             {v4.16b}, [x1], #16
        tbnz            x5, #5, 0f
        ld1             {v5.16b}, [x1], #16
        tbnz            x5, #6, 0f
        ld1             {v6.16b}, [x1], #16
        tbnz            x5, #7, 0f
        ld1             {v7.16b}, [x1], #16

0:      mov             bskey, x2
        mov             rounds, x3
        bl              \do8

        st1             {\o0\().16b}, [x0], #16
        tbnz            x5, #1, 1f
        st1             {\o1\().16b}, [x0], #16
        tbnz            x5, #2, 1f
        st1             {\o2\().16b}, [x0], #16
        tbnz            x5, #3, 1f
        st1             {\o3\().16b}, [x0], #16
        tbnz            x5, #4, 1f
        st1             {\o4\().16b}, [x0], #16
        tbnz            x5, #5, 1f
        st1             {\o5\().16b}, [x0], #16
        tbnz            x5, #6, 1f
        st1             {\o6\().16b}, [x0], #16
        tbnz            x5, #7, 1f
        st1             {\o7\().16b}, [x0], #16

        cbnz            x4, 99b

1:      ldp             x29, x30, [sp], #16
        ret
        .endm

        .align          4
ENTRY(aesbs_ecb_encrypt)
        __ecb_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

        .align          4
ENTRY(aesbs_ecb_decrypt)
        __ecb_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

        /*
         * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
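        /*
         * CBC decryption: up to eight blocks of ciphertext are loaded and
         * also copied to v25-v31, so that after the parallel decryption each
         * output block can be XORed with the previous ciphertext block (or
         * with the IV at [x5] for the first one). The last ciphertext block
         * handled in this iteration becomes the IV for the next.
         */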
        .align          4
ENTRY(aesbs_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

99:     mov             x6, #1
        lsl             x6, x6, x4
        subs            w4, w4, #8
        csel            x4, x4, xzr, pl
        csel            x6, x6, xzr, mi

        ld1             {v0.16b}, [x1], #16
        mov             v25.16b, v0.16b
        tbnz            x6, #1, 0f
        ld1             {v1.16b}, [x1], #16
        mov             v26.16b, v1.16b
        tbnz            x6, #2, 0f
        ld1             {v2.16b}, [x1], #16
        mov             v27.16b, v2.16b
        tbnz            x6, #3, 0f
        ld1             {v3.16b}, [x1], #16
        mov             v28.16b, v3.16b
        tbnz            x6, #4, 0f
        ld1             {v4.16b}, [x1], #16
        mov             v29.16b, v4.16b
        tbnz            x6, #5, 0f
        ld1             {v5.16b}, [x1], #16
        mov             v30.16b, v5.16b
        tbnz            x6, #6, 0f
        ld1             {v6.16b}, [x1], #16
        mov             v31.16b, v6.16b
        tbnz            x6, #7, 0f
        ld1             {v7.16b}, [x1]

0:      mov             bskey, x2
        mov             rounds, x3
        bl              aesbs_decrypt8

        ld1             {v24.16b}, [x5]                 // load IV

        eor             v1.16b, v1.16b, v25.16b
        eor             v6.16b, v6.16b, v26.16b
        eor             v4.16b, v4.16b, v27.16b
        eor             v2.16b, v2.16b, v28.16b
        eor             v7.16b, v7.16b, v29.16b
        eor             v0.16b, v0.16b, v24.16b
        eor             v3.16b, v3.16b, v30.16b
        eor             v5.16b, v5.16b, v31.16b

        st1             {v0.16b}, [x0], #16
        mov             v24.16b, v25.16b
        tbnz            x6, #1, 1f
        st1             {v1.16b}, [x0], #16
        mov             v24.16b, v26.16b
        tbnz            x6, #2, 1f
        st1             {v6.16b}, [x0], #16
        mov             v24.16b, v27.16b
        tbnz            x6, #3, 1f
        st1             {v4.16b}, [x0], #16
        mov             v24.16b, v28.16b
        tbnz            x6, #4, 1f
        st1             {v2.16b}, [x0], #16
        mov             v24.16b, v29.16b
        tbnz            x6, #5, 1f
        st1             {v7.16b}, [x0], #16
        mov             v24.16b, v30.16b
        tbnz            x6, #6, 1f
        st1             {v3.16b}, [x0], #16
        mov             v24.16b, v31.16b
        tbnz            x6, #7, 1f
        ld1             {v24.16b}, [x1], #16
        st1             {v5.16b}, [x0], #16
1:      st1             {v24.16b}, [x5]                 // store IV

        cbnz            x4, 99b

        ldp             x29, x30, [sp], #16
        ret
ENDPROC(aesbs_cbc_decrypt)

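        /*
         * Compute the next XTS tweak, i.e., multiply \in by x in GF(2^128)
         * with the x^128 + x^7 + x^2 + x + 1 reduction polynomial.
         * Illustrative C equivalent, with the tweak held as two 64-bit
         * little endian words { lo, hi } (for exposition only):
         *
         *      carry = (s64)hi >> 63;          // all ones if bit 127 is set
         *      hi    = (hi << 1) | (lo >> 63);
         *      lo    = (lo << 1) ^ (carry & 0x87);
         *
         * The sshr/and pick up the per lane carries, ext swaps them across
         * the two lanes, and the add/eor perform the shift and reduction.
         */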
        .macro          next_tweak, out, in, const, tmp
        sshr            \tmp\().2d, \in\().2d, #63
        and             \tmp\().16b, \tmp\().16b, \const\().16b
        add             \out\().2d, \in\().2d, \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .align          4
.Lxts_mul_x:
CPU_LE( .quad           1, 0x87         )
CPU_BE( .quad           0x87, 1         )

        /*
         * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
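        /*
         * __xts_crypt8 loads up to eight blocks, XORs each one with its
         * tweak and computes the tweak for the next block as it goes. The
         * tweaks for blocks 0-3 stay in v25-v28; the tweaks for blocks 4-7
         * are spilled to the stack at [sp, #16] through [sp, #64] and picked
         * up again by the __xts_crypt macro once the cipher has run. x7
         * holds the address of aesbs_encrypt8/aesbs_decrypt8, which is
         * entered with br and whose ret returns straight to the caller of
         * __xts_crypt8.
         */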
__xts_crypt8:
        mov             x6, #1
        lsl             x6, x6, x4
        subs            w4, w4, #8
        csel            x4, x4, xzr, pl
        csel            x6, x6, xzr, mi

        ld1             {v0.16b}, [x1], #16
        next_tweak      v26, v25, v30, v31
        eor             v0.16b, v0.16b, v25.16b
        tbnz            x6, #1, 0f

        ld1             {v1.16b}, [x1], #16
        next_tweak      v27, v26, v30, v31
        eor             v1.16b, v1.16b, v26.16b
        tbnz            x6, #2, 0f

        ld1             {v2.16b}, [x1], #16
        next_tweak      v28, v27, v30, v31
        eor             v2.16b, v2.16b, v27.16b
        tbnz            x6, #3, 0f

        ld1             {v3.16b}, [x1], #16
        next_tweak      v29, v28, v30, v31
        eor             v3.16b, v3.16b, v28.16b
        tbnz            x6, #4, 0f

        ld1             {v4.16b}, [x1], #16
        str             q29, [sp, #16]
        eor             v4.16b, v4.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz            x6, #5, 0f

        ld1             {v5.16b}, [x1], #16
        str             q29, [sp, #32]
        eor             v5.16b, v5.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz            x6, #6, 0f

        ld1             {v6.16b}, [x1], #16
        str             q29, [sp, #48]
        eor             v6.16b, v6.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz            x6, #7, 0f

        ld1             {v7.16b}, [x1], #16
        str             q29, [sp, #64]
        eor             v7.16b, v7.16b, v29.16b
        next_tweak      v29, v29, v30, v31

0:      mov             bskey, x2
        mov             rounds, x3
        br              x7
ENDPROC(__xts_crypt8)

        .macro          __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        stp             x29, x30, [sp, #-80]!
        mov             x29, sp

        ldr             q30, .Lxts_mul_x
        ld1             {v25.16b}, [x5]

99:     adr             x7, \do8
        bl              __xts_crypt8

        ldp             q16, q17, [sp, #16]
        ldp             q18, q19, [sp, #48]

        eor             \o0\().16b, \o0\().16b, v25.16b
        eor             \o1\().16b, \o1\().16b, v26.16b
        eor             \o2\().16b, \o2\().16b, v27.16b
        eor             \o3\().16b, \o3\().16b, v28.16b

        st1             {\o0\().16b}, [x0], #16
        mov             v25.16b, v26.16b
        tbnz            x6, #1, 1f
        st1             {\o1\().16b}, [x0], #16
        mov             v25.16b, v27.16b
        tbnz            x6, #2, 1f
        st1             {\o2\().16b}, [x0], #16
        mov             v25.16b, v28.16b
        tbnz            x6, #3, 1f
        st1             {\o3\().16b}, [x0], #16
        mov             v25.16b, v29.16b
        tbnz            x6, #4, 1f

        eor             \o4\().16b, \o4\().16b, v16.16b
        eor             \o5\().16b, \o5\().16b, v17.16b
        eor             \o6\().16b, \o6\().16b, v18.16b
        eor             \o7\().16b, \o7\().16b, v19.16b

        st1             {\o4\().16b}, [x0], #16
        tbnz            x6, #5, 1f
        st1             {\o5\().16b}, [x0], #16
        tbnz            x6, #6, 1f
        st1             {\o6\().16b}, [x0], #16
        tbnz            x6, #7, 1f
        st1             {\o7\().16b}, [x0], #16

        cbnz            x4, 99b

1:      st1             {v25.16b}, [x5]
        ldp             x29, x30, [sp], #80
        ret
        .endm

ENTRY(aesbs_xts_encrypt)
        __xts_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
        __xts_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)

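        /*
         * Build the next CTR block in \v from the 128-bit counter kept as a
         * native integer pair in x7 (high) / x8 (low), increment the counter
         * with a carry into x7, and byte swap the block back into big endian
         * order.
         */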
        .macro          next_ctr, v
        mov             \v\().d[1], x8
        adds            x8, x8, #1
        mov             \v\().d[0], x7
        adc             x7, x7, xzr
        rev64           \v\().16b, \v\().16b
        .endm

        /*
         * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
         *                   int rounds, int blocks, u8 iv[], u8 final[])
         */
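        /*
         * CTR encryption of 'blocks' full blocks, with the counter taken
         * from and returned via iv[]. If final is non NULL, one extra
         * keystream block is generated and handed back to the caller (see
         * the tail handling at labels 1-7 below) so it can encrypt a partial
         * final block.
         */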
ENTRY(aesbs_ctr_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        cmp             x6, #0
        cset            x10, ne
        add             x4, x4, x10             // do one extra block if final

        ldp             x7, x8, [x5]
        ld1             {v0.16b}, [x5]
CPU_LE( rev             x7, x7          )
CPU_LE( rev             x8, x8          )
        adds            x8, x8, #1
        adc             x7, x7, xzr

99:     mov             x9, #1
        lsl             x9, x9, x4
        subs            w4, w4, #8
        csel            x4, x4, xzr, pl
        csel            x9, x9, xzr, le

        tbnz            x9, #1, 0f
        next_ctr        v1
        tbnz            x9, #2, 0f
        next_ctr        v2
        tbnz            x9, #3, 0f
        next_ctr        v3
        tbnz            x9, #4, 0f
        next_ctr        v4
        tbnz            x9, #5, 0f
        next_ctr        v5
        tbnz            x9, #6, 0f
        next_ctr        v6
        tbnz            x9, #7, 0f
        next_ctr        v7

0:      mov             bskey, x2
        mov             rounds, x3
        bl              aesbs_encrypt8

        lsr             x9, x9, x10             // disregard the extra block
        tbnz            x9, #0, 0f

        ld1             {v8.16b}, [x1], #16
        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x0], #16
        tbnz            x9, #1, 1f

        ld1             {v9.16b}, [x1], #16
        eor             v1.16b, v1.16b, v9.16b
        st1             {v1.16b}, [x0], #16
        tbnz            x9, #2, 2f

        ld1             {v10.16b}, [x1], #16
        eor             v4.16b, v4.16b, v10.16b
        st1             {v4.16b}, [x0], #16
        tbnz            x9, #3, 3f

        ld1             {v11.16b}, [x1], #16
        eor             v6.16b, v6.16b, v11.16b
        st1             {v6.16b}, [x0], #16
        tbnz            x9, #4, 4f

        ld1             {v12.16b}, [x1], #16
        eor             v3.16b, v3.16b, v12.16b
        st1             {v3.16b}, [x0], #16
        tbnz            x9, #5, 5f

        ld1             {v13.16b}, [x1], #16
        eor             v7.16b, v7.16b, v13.16b
        st1             {v7.16b}, [x0], #16
        tbnz            x9, #6, 6f

        ld1             {v14.16b}, [x1], #16
        eor             v2.16b, v2.16b, v14.16b
        st1             {v2.16b}, [x0], #16
        tbnz            x9, #7, 7f

        ld1             {v15.16b}, [x1], #16
        eor             v5.16b, v5.16b, v15.16b
        st1             {v5.16b}, [x0], #16

8:      next_ctr        v0
        cbnz            x4, 99b

0:      st1             {v0.16b}, [x5]
        ldp             x29, x30, [sp], #16
        ret

        /*
         * If we are handling the tail of the input (x6 != NULL), return the
         * final keystream block back to the caller.
         */
1:      cbz             x6, 8b
        st1             {v1.16b}, [x6]
        b               8b
2:      cbz             x6, 8b
        st1             {v4.16b}, [x6]
        b               8b
3:      cbz             x6, 8b
        st1             {v6.16b}, [x6]
        b               8b
4:      cbz             x6, 8b
        st1             {v3.16b}, [x6]
        b               8b
5:      cbz             x6, 8b
        st1             {v7.16b}, [x6]
        b               8b
6:      cbz             x6, 8b
        st1             {v2.16b}, [x6]
        b               8b
7:      cbz             x6, 8b
        st1             {v5.16b}, [x6]
        b               8b
ENDPROC(aesbs_ctr_encrypt)