// SPDX-License-Identifier: GPL-2.0
/*
 * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
9
10#include <linux/linkage.h>
11
// Code section.  No other section directive follows in this file, so the
// constant tables defined further down are emitted into .text as well.
	.text

	// Function arguments, as they arrive in x0-x5
	ROUND_KEYS	.req	x0	// const {u64,u32} *round_keys
	NROUNDS		.req	w1	// int nrounds
	NROUNDS_X	.req	x1	// 64-bit name of the register holding nrounds
	DST		.req	x2	// void *dst
	SRC		.req	x3	// const void *src
	NBYTES		.req	w4	// unsigned int nbytes
	TWEAK		.req	x5	// void *tweak

	// Data being encrypted/decrypted: four (X, Y) register pairs.
	// The underscore keeps these names from colliding with the ARM64
	// general-purpose registers x0-x3.
	X_0		.req	v0
	Y_0		.req	v1
	X_1		.req	v2
	Y_1		.req	v3
	X_2		.req	v4
	Y_2		.req	v5
	X_3		.req	v6
	Y_3		.req	v7

	// NOTE(review): v8-v15 below are callee-saved (low 64 bits) under
	// AAPCS64 and nothing in this file saves or restores them.  Presumably
	// every caller brackets these routines with a full NEON state
	// save/restore -- confirm against the callers.

	// Current round key, replicated into every lane
	ROUND_KEY	.req	v8

	// Byte-index vector used by tbl to rotate lanes by 8 bits
	ROTATE_TABLE	.req	v9
	ROTATE_TABLE_Q	.req	q9

	// Scratch vectors
	TMP0		.req	v10
	TMP1		.req	v11
	TMP2		.req	v12
	TMP3		.req	v13

	// Constants used when multiplying an XTS tweak in GF(2^128)/GF(2^64)
	GFMUL_TABLE	.req	v14
	GFMUL_TABLE_Q	.req	q14

	// Tweak(s) that follow the eight currently in use
	TWEAKV_NEXT	.req	v15

	// XTS tweaks for the 128 bytes currently being processed
	TWEAKV0		.req	v16
	TWEAKV1		.req	v17
	TWEAKV2		.req	v18
	TWEAKV3		.req	v19
	TWEAKV4		.req	v20
	TWEAKV5		.req	v21
	TWEAKV6		.req	v22
	TWEAKV7		.req	v23
63
/*
 * Constant tables.  Each one is a single 16-byte value that _speck_xts_crypt
 * fetches with a PC-relative 'ldr Qn, <label>' load.  They are written with
 * .octa (one 128-bit integer) rather than as individual bytes.
 */
	.align		4

	// tbl index vector: rotate every 64-bit lane right by 8 bits
.Lror64_8_table:
	.octa		0x080f0e0d0c0b0a090007060504030201

	// tbl index vector: rotate every 32-bit lane right by 8 bits
.Lror32_8_table:
	.octa		0x0c0f0e0d080b0a090407060500030201

	// tbl index vector: rotate every 64-bit lane left by 8 bits
.Lrol64_8_table:
	.octa		0x0e0d0c0b0a09080f0605040302010007

	// tbl index vector: rotate every 32-bit lane left by 8 bits
.Lrol32_8_table:
	.octa		0x0e0d0c0f0a09080b0605040702010003

	// Speck128 tweak update mask: 0x01 in the low 64-bit lane (the carry
	// between the two halves) and 0x87 in the high lane (the reduction
	// term x^7 + x^2 + x + 1)
.Lgf128mul_table:
	.octa		0x00000000000000870000000000000001

	// Speck64 tweak update lookup: bytes 0..3 are 0x00, 0x1b, 0x36, 0x2d,
	// indexed by the bits shifted out of the top of a 64-bit tweak
.Lgf64mul_table:
	.octa		0x0000000000000000000000002d361b00
77
/*
 * _speck_round_128bytes() - one Speck encryption round over 128 bytes
 *
 * Applies a single encryption round to the 128 bytes held in X_0-X_3 and
 * Y_0-Y_3 (8 blocks for Speck128, 16 blocks for Speck64).
 *
 * n:      lane width in bits, 64 for Speck128 or 32 for Speck64
 * lanes:  matching arrangement specifier, "2d" for Speck128 or "4s" for
 *         Speck64
 *
 * Expects ROUND_KEY to hold the round key in every lane and ROTATE_TABLE to
 * hold the tbl index vector for a right-rotate by 8 of that lane width.
 * Clobbers TMP0-TMP3.
 */
.macro _speck_round_128bytes	n, lanes

	// x = ror(x, 8), done as a byte permutation
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b

	// x += y
	add		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	add		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	add		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	add		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// tmp = rol(y, 3): shift left by 3, then insert the three bits that
	// fell off the top back in at the bottom
	shl		TMP0.\lanes, Y_0.\lanes, #3
	shl		TMP1.\lanes, Y_1.\lanes, #3
	shl		TMP2.\lanes, Y_2.\lanes, #3
	shl		TMP3.\lanes, Y_3.\lanes, #3
	sri		TMP0.\lanes, Y_0.\lanes, #(\n - 3)
	sri		TMP1.\lanes, Y_1.\lanes, #(\n - 3)
	sri		TMP2.\lanes, Y_2.\lanes, #(\n - 3)
	sri		TMP3.\lanes, Y_3.\lanes, #(\n - 3)

	// y = tmp ^ x
	eor		Y_0.16b, TMP0.16b, X_0.16b
	eor		Y_1.16b, TMP1.16b, X_1.16b
	eor		Y_2.16b, TMP2.16b, X_2.16b
	eor		Y_3.16b, TMP3.16b, X_3.16b
.endm
122
/*
 * _speck_unround_128bytes() - one Speck decryption round over 128 bytes
 *
 * Exact inverse of _speck_round_128bytes(): the same steps are undone in the
 * opposite order.  'n' and 'lanes' have the same meaning as there.
 *
 * Expects ROUND_KEY to hold the round key in every lane and ROTATE_TABLE to
 * hold the tbl index vector for a left-rotate by 8 of that lane width.
 * Clobbers TMP0-TMP3.
 */
.macro _speck_unround_128bytes	n, lanes

	// tmp = y ^ x
	eor		TMP0.16b, Y_0.16b, X_0.16b
	eor		TMP1.16b, Y_1.16b, X_1.16b
	eor		TMP2.16b, Y_2.16b, X_2.16b
	eor		TMP3.16b, Y_3.16b, X_3.16b

	// y = ror(tmp, 3): shift right by 3, then insert the three bits that
	// fell off the bottom back in at the top
	ushr		Y_0.\lanes, TMP0.\lanes, #3
	ushr		Y_1.\lanes, TMP1.\lanes, #3
	ushr		Y_2.\lanes, TMP2.\lanes, #3
	ushr		Y_3.\lanes, TMP3.\lanes, #3
	sli		Y_0.\lanes, TMP0.\lanes, #(\n - 3)
	sli		Y_1.\lanes, TMP1.\lanes, #(\n - 3)
	sli		Y_2.\lanes, TMP2.\lanes, #(\n - 3)
	sli		Y_3.\lanes, TMP3.\lanes, #(\n - 3)

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// x -= y
	sub		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	sub		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	sub		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	sub		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x = rol(x, 8), done as a byte permutation
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm
164
/*
 * _next_xts_tweak() - derive the following XTS tweak vector
 *
 * next:  vector register that receives the new tweak(s)
 * cur:   vector register holding the current tweak(s)
 * tmp:   scratch vector register, clobbered; must differ from 'next', 'cur'
 * n:     64 for Speck128 (one 128-bit tweak per register), otherwise the
 *        Speck64 path (two 64-bit tweaks packed in one register)
 *
 * GFMUL_TABLE must already hold the constant for the selected variant.
 */
.macro _next_xts_tweak	next, cur, tmp, n
.if \n == 64
	/*
	 * Multiply the 128-bit tweak by x modulo
	 * p(x) = x^128 + x^7 + x^2 + x + 1.
	 *
	 * Each 64-bit half is shifted left by one.  The top bit of each half
	 * is first smeared across its lane and masked with GFMUL_TABLE; the
	 * two halves of that mask are then swapped so that the low half's
	 * carry lands in the high half and the high half's overflow becomes
	 * the reduction term in the low half.
	 */
	sshr		\tmp\().2d, \cur\().2d, #63
	and		\tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
	shl		\next\().2d, \cur\().2d, #1
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\next\().16b, \next\().16b, \tmp\().16b
.else
	/*
	 * Multiply both packed 64-bit tweaks by x^2 modulo
	 * p(x) = x^64 + x^4 + x^3 + x + 1, which yields the next two tweaks.
	 *
	 * The two bits shifted out of each tweak select the reduction byte
	 * from GFMUL_TABLE.
	 */
	ushr		\tmp\().2d, \cur\().2d, #62
	shl		\next\().2d, \cur\().2d, #2
	tbl		\tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
	eor		\next\().16b, \next\().16b, \tmp\().16b
.endif
.endm
187
/*
 * _speck_xts_crypt() - body of a Speck-XTS encryption/decryption function
 *
 * Processes NBYTES bytes from SRC to DST with Speck-XTS, using the variant
 * whose block size is '2n' bits and whose round count is NROUNDS.
 *
 * Macro parameters:
 *   n           lane width in bits: 64 for Speck128, 32 for Speck64
 *   lanes       matching arrangement specifier: 2d or 4s
 *   decrypting  nonzero to decrypt, zero to encrypt
 *
 * Register inputs:
 *   ROUND_KEYS  expanded round keys
 *   NROUNDS     number of rounds
 *   DST, SRC    destination and source buffers
 *   NBYTES      byte count; assumed to be a nonzero multiple of 128
 *   TWEAK       current XTS tweak; overwritten with the tweak that follows
 *               the last block processed
 *
 * Clobbers x6, w7, the condition flags and v0-v23.  DST, SRC and NBYTES are
 * consumed, and when decrypting ROUND_KEYS and NROUNDS_X are modified too.
 * Ends with 'ret'.
 */
.macro _speck_xts_crypt	n, lanes, decrypting

	/*
	 * Decryption walks the round keys backwards, so point ROUND_KEYS at
	 * the last round key instead of the first.
	 */
.if \decrypting
	mov		NROUNDS, NROUNDS	/* zero the high 32 bits */
.if \n == 64
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
	sub		ROUND_KEYS, ROUND_KEYS, #8
.else
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
	sub		ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif

	// Pick the 8-bit rotate direction: left to decrypt, right to encrypt
.if \decrypting
	ldr		ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
	ldr		ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif

	// One-time XTS setup
.if \n == 64
	// The first tweak fills a whole register
	ld1		{TWEAKV0.16b}, [TWEAK]

	// GF(2^128) multiplication constant
	ldr		GFMUL_TABLE_Q, .Lgf128mul_table
.else
	// The first tweak goes in the low half of the register
	ld1		{TWEAKV0.8b}, [TWEAK]

	// GF(2^64) multiplication lookup
	ldr		GFMUL_TABLE_Q, .Lgf64mul_table

	// Second tweak = first tweak times x; pack it into the high half so
	// that each tweak register carries two consecutive tweaks
	ushr		TMP0.2d, TWEAKV0.2d, #63
	shl		TMP1.2d, TWEAKV0.2d, #1
	tbl		TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
	eor		TMP0.8b, TMP0.8b, TMP1.8b
	mov		TWEAKV0.d[1], TMP0.d[0]
.endif

.Lchunk_loop_\@:

	// Derive the remaining tweaks for this 128-byte chunk, plus the one
	// that starts the following chunk
	_next_xts_tweak	TWEAKV1, TWEAKV0, TMP0, \n
	_next_xts_tweak	TWEAKV2, TWEAKV1, TMP0, \n
	_next_xts_tweak	TWEAKV3, TWEAKV2, TMP0, \n
	_next_xts_tweak	TWEAKV4, TWEAKV3, TMP0, \n
	_next_xts_tweak	TWEAKV5, TWEAKV4, TMP0, \n
	_next_xts_tweak	TWEAKV6, TWEAKV5, TMP0, \n
	_next_xts_tweak	TWEAKV7, TWEAKV6, TMP0, \n
	_next_xts_tweak	TWEAKV_NEXT, TWEAKV7, TMP0, \n

	// Fetch 128 source bytes into X_0, Y_0, ..., X_3, Y_3 (memory order)
	ld1		{X_0.16b-Y_1.16b}, [SRC], #64
	ld1		{X_2.16b-Y_3.16b}, [SRC], #64

	// XTS pre-whitening: XOR each register with its tweak.  Results that
	// belong to an X_i register are parked in TMPi so the de-interleave
	// below can write X_i and Y_i without losing an input.
	eor		TMP0.16b, X_0.16b, TWEAKV0.16b
	eor		Y_0.16b,  Y_0.16b, TWEAKV1.16b
	eor		TMP1.16b, X_1.16b, TWEAKV2.16b
	eor		Y_1.16b,  Y_1.16b, TWEAKV3.16b
	eor		TMP2.16b, X_2.16b, TWEAKV4.16b
	eor		Y_2.16b,  Y_2.16b, TWEAKV5.16b
	eor		TMP3.16b, X_3.16b, TWEAKV6.16b
	eor		Y_3.16b,  Y_3.16b, TWEAKV7.16b

	/*
	 * Split every block into its two words: afterwards X_0-X_3 hold only
	 * the second word ('x') of each block and Y_0-Y_3 only the first
	 * word ('y').  Speck stores a block as (y, x), not (x, y).
	 * Each uzp2 must precede its uzp1, which overwrites a shared input.
	 */
	uzp2		X_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp1		Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp2		X_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp1		Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp2		X_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp1		Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp2		X_3.\lanes, TMP3.\lanes, Y_3.\lanes
	uzp1		Y_3.\lanes, TMP3.\lanes, Y_3.\lanes

	// Run all cipher rounds; x6 walks the round keys, w7 counts down
	mov		x6, ROUND_KEYS
	mov		w7, NROUNDS
.Lround_loop_\@:
.if \decrypting
	ld1r		{ROUND_KEY.\lanes}, [x6]
	sub		x6, x6, #(\n / 8)
	_speck_unround_128bytes	\n, \lanes
.else
	ld1r		{ROUND_KEY.\lanes}, [x6], #(\n / 8)
	_speck_round_128bytes	\n, \lanes
.endif
	subs		w7, w7, #1
	bne		.Lround_loop_\@

	// Merge the 'y' and 'x' words back into whole blocks.  Each zip1 must
	// precede its zip2, which overwrites a shared input.
	zip1		TMP0.\lanes, Y_0.\lanes, X_0.\lanes
	zip2		Y_0.\lanes,  Y_0.\lanes, X_0.\lanes
	zip1		TMP1.\lanes, Y_1.\lanes, X_1.\lanes
	zip2		Y_1.\lanes,  Y_1.\lanes, X_1.\lanes
	zip1		TMP2.\lanes, Y_2.\lanes, X_2.\lanes
	zip2		Y_2.\lanes,  Y_2.\lanes, X_2.\lanes
	zip1		TMP3.\lanes, Y_3.\lanes, X_3.\lanes
	zip2		Y_3.\lanes,  Y_3.\lanes, X_3.\lanes

	// XTS post-whitening with the same tweaks, then advance TWEAKV0 to
	// the first tweak of the next chunk
	eor		X_0.16b, TMP0.16b, TWEAKV0.16b
	eor		Y_0.16b, Y_0.16b,  TWEAKV1.16b
	eor		X_1.16b, TMP1.16b, TWEAKV2.16b
	eor		Y_1.16b, Y_1.16b,  TWEAKV3.16b
	eor		X_2.16b, TMP2.16b, TWEAKV4.16b
	eor		Y_2.16b, Y_2.16b,  TWEAKV5.16b
	eor		X_3.16b, TMP3.16b, TWEAKV6.16b
	eor		Y_3.16b, Y_3.16b,  TWEAKV7.16b
	mov		TWEAKV0.16b, TWEAKV_NEXT.16b

	// Write the 128 result bytes (ciphertext, or plaintext when
	// decrypting) to the destination buffer
	st1		{X_0.16b-Y_1.16b}, [DST], #64
	st1		{X_2.16b-Y_3.16b}, [DST], #64

	// Loop while 128-byte chunks remain
	subs		NBYTES, NBYTES, #128
	bne		.Lchunk_loop_\@

	// Hand the next tweak back to the caller and return
.if \n == 64
	st1		{TWEAKV_NEXT.16b}, [TWEAK]
.else
	st1		{TWEAKV_NEXT.8b}, [TWEAK]
.endif
	ret
.endm
337
/*
 * speck128_xts_encrypt_neon - Speck128-XTS encryption
 *
 * In: x0 = round_keys, w1 = nrounds, x2 = dst, x3 = src, w4 = nbytes,
 *     x5 = tweak (updated in place)
 */
ENTRY(speck128_xts_encrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)
341
/*
 * speck128_xts_decrypt_neon - Speck128-XTS decryption
 *
 * In: x0 = round_keys, w1 = nrounds, x2 = dst, x3 = src, w4 = nbytes,
 *     x5 = tweak (updated in place)
 */
ENTRY(speck128_xts_decrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)
345
/*
 * speck64_xts_encrypt_neon - Speck64-XTS encryption
 *
 * In: x0 = round_keys, w1 = nrounds, x2 = dst, x3 = src, w4 = nbytes,
 *     x5 = tweak (updated in place)
 */
ENTRY(speck64_xts_encrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)
349
/*
 * speck64_xts_decrypt_neon - Speck64-XTS decryption
 *
 * In: x0 = round_keys, w1 = nrounds, x2 = dst, x3 = src, w4 = nbytes,
 *     x5 = tweak (updated in place)
 */
ENTRY(speck64_xts_decrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)