/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction. The alternatives are, more or less:
 *
 * (a)  vshl.u32 + vsri.u32           (needs temporary register)
 * (b)  vshl.u32 + vshr.u32 + vorr    (needs temporary register)
 * (c)  vrev32.16                     (16-bit rotations only)
 * (d)  vtbl.8 + vtbl.8               (rotations by multiples of 8 bits only;
 *                                     needs index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri. Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */
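/*
 * For reference, this is what a left-rotate by 12 bits looks like with
 * method (a), using the same register choice as the quarter-round code
 * below (q4 holds the value to rotate, q1 receives the result):
 *
 *      vshl.u32        q1, q4, #12     // q1 = q4 << 12
 *      vsri.u32        q1, q4, #20     // shift-right-insert q4 >> (32 - 12)
 *                                      // into the low bits of q1
 */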

#include <linux/linkage.h>

        .text
        .fpu            neon
        .align          5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
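/*
 * Register layout note (as set up by the callers below): q0 holds state
 * words 0-3, q1 holds words 4-7, q2 holds words 8-11, and q3 holds words
 * 12-15, i.e. one row of the 4x4 state matrix per register.
 */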
chacha_permute:

        adr             ip, .Lrol8_table
        vld1.8          {d10}, [ip, :64]

.Ldoubleround:
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vrev32.16       q3, q3

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #12
        vsri.u32        q1, q4, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vtbl.8          d6, {d6}, d10
        vtbl.8          d7, {d7}, d10

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #7
        vsri.u32        q1, q4, #25

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vext.8          q1, q1, q1, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vext.8          q2, q2, q2, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vext.8          q3, q3, q3, #12

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vrev32.16       q3, q3

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #12
        vsri.u32        q1, q4, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vtbl.8          d6, {d6}, d10
        vtbl.8          d7, {d7}, d10

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #7
        vsri.u32        q1, q4, #25

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vext.8          q1, q1, q1, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vext.8          q2, q2, q2, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vext.8          q3, q3, q3, #4

        subs            r3, r3, #2
        bne             .Ldoubleround

        bx              lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
        // r0: Input state matrix, s
        // r1: 1 data block output, o
        // r2: 1 data block input, i
        // r3: nrounds
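        //
        // The register usage above corresponds to a C prototype along the
        // lines of (assumed from this listing, not taken from a header):
        //   void chacha_block_xor_neon(const u32 *state, u8 *dst,
        //                              const u8 *src, int nrounds);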
        push            {lr}

        // x0..3 = s0..3
        add             ip, r0, #0x20
        vld1.32         {q0-q1}, [r0]
        vld1.32         {q2-q3}, [ip]

        vmov            q8, q0
        vmov            q9, q1
        vmov            q10, q2
        vmov            q11, q3

        bl              chacha_permute

        add             ip, r2, #0x20
        vld1.8          {q4-q5}, [r2]
        vld1.8          {q6-q7}, [ip]

        // o0 = i0 ^ (x0 + s0)
        vadd.i32        q0, q0, q8
        veor            q0, q0, q4

        // o1 = i1 ^ (x1 + s1)
        vadd.i32        q1, q1, q9
        veor            q1, q1, q5

        // o2 = i2 ^ (x2 + s2)
        vadd.i32        q2, q2, q10
        veor            q2, q2, q6

        // o3 = i3 ^ (x3 + s3)
        vadd.i32        q3, q3, q11
        veor            q3, q3, q7

        add             ip, r1, #0x20
        vst1.8          {q0-q1}, [r1]
        vst1.8          {q2-q3}, [ip]

        pop             {pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
        // r0: Input state matrix, s
        // r1: output (8 32-bit words)
        // r2: nrounds
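        //
        // HChaCha returns rows 0 and 3 of the permuted state (words 0-3 and
        // 12-15) and, unlike the full block function, skips the final
        // addition of the original state; this matches the q0/q3 stores
        // below.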
        push            {lr}

        vld1.32         {q0-q1}, [r0]!
        vld1.32         {q2-q3}, [r0]

        mov             r3, r2
        bl              chacha_permute

        vst1.32         {q0}, [r1]!
        vst1.32         {q3}, [r1]

        pop             {pc}
ENDPROC(hchacha_block_neon)

        .align          4
.Lctrinc:       .word   0, 1, 2, 3
.Lrol8_table:   .byte   3, 0, 1, 2, 7, 4, 5, 6
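// Used as a vtbl.8 index vector, .Lrol8_table rotates each 32-bit word left
// by 8 bits: destination byte i is taken from source byte table[i], and
// (3, 0, 1, 2) maps a little-endian word's bytes to their positions after a
// left-rotate by 8.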

        .align          5
ENTRY(chacha_4block_xor_neon)
        push            {r4-r5}
        mov             r4, sp                  // preserve the stack pointer
        sub             ip, sp, #0x20           // allocate a 32 byte buffer
        bic             ip, ip, #0x1f           // aligned to 32 bytes
        mov             sp, ip

        // r0: Input state matrix, s
        // r1: 4 data blocks output, o
        // r2: 4 data blocks input, i
        // r3: nrounds

        //
        // This function encrypts four consecutive ChaCha blocks by loading
        // the state matrix in NEON registers four times. The algorithm performs
        // each operation on the corresponding word of each state matrix, hence
        // requires no word shuffling. The words are re-interleaved before the
        // final addition of the original state and the XORing step.
        //
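        //
        // Concretely, after the vdup broadcasts below, register qN holds
        // word N of the state for all four blocks at once, e.g. q0 = x0 of
        // blocks 0-3 and q12 = x12 (the counter word) of blocks 0-3, which
        // is why the per-block counters can be added with a single vadd.
        //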

        // x0..15[0-3] = s0..15[0-3]
        add             ip, r0, #0x20
        vld1.32         {q0-q1}, [r0]
        vld1.32         {q2-q3}, [ip]

        adr             r5, .Lctrinc
        vdup.32         q15, d7[1]
        vdup.32         q14, d7[0]
        vld1.32         {q4}, [r5, :128]
        vdup.32         q13, d6[1]
        vdup.32         q12, d6[0]
        vdup.32         q11, d5[1]
        vdup.32         q10, d5[0]
        vadd.u32        q12, q12, q4            // x12 += counter values 0-3
        vdup.32         q9, d4[1]
        vdup.32         q8, d4[0]
        vdup.32         q7, d3[1]
        vdup.32         q6, d3[0]
        vdup.32         q5, d2[1]
        vdup.32         q4, d2[0]
        vdup.32         q3, d1[1]
        vdup.32         q2, d1[0]
        vdup.32         q1, d0[1]
        vdup.32         q0, d0[0]

        adr             ip, .Lrol8_table
        b               1f

.Ldoubleround4:
        vld1.32         {q8-q9}, [sp, :256]
1:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vadd.i32        q0, q0, q4
        vadd.i32        q1, q1, q5
        vadd.i32        q2, q2, q6
        vadd.i32        q3, q3, q7

        veor            q12, q12, q0
        veor            q13, q13, q1
        veor            q14, q14, q2
        veor            q15, q15, q3

        vrev32.16       q12, q12
        vrev32.16       q13, q13
        vrev32.16       q14, q14
        vrev32.16       q15, q15

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vadd.i32        q8, q8, q12
        vadd.i32        q9, q9, q13
        vadd.i32        q10, q10, q14
        vadd.i32        q11, q11, q15

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q4, q8
        veor            q9, q5, q9
        vshl.u32        q4, q8, #12
        vshl.u32        q5, q9, #12
        vsri.u32        q4, q8, #20
        vsri.u32        q5, q9, #20

        veor            q8, q6, q10
        veor            q9, q7, q11
        vshl.u32        q6, q8, #12
        vshl.u32        q7, q9, #12
        vsri.u32        q6, q8, #20
        vsri.u32        q7, q9, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vld1.8          {d16}, [ip, :64]
        vadd.i32        q0, q0, q4
        vadd.i32        q1, q1, q5
        vadd.i32        q2, q2, q6
        vadd.i32        q3, q3, q7

        veor            q12, q12, q0
        veor            q13, q13, q1
        veor            q14, q14, q2
        veor            q15, q15, q3

        vtbl.8          d24, {d24}, d16
        vtbl.8          d25, {d25}, d16
        vtbl.8          d26, {d26}, d16
        vtbl.8          d27, {d27}, d16
        vtbl.8          d28, {d28}, d16
        vtbl.8          d29, {d29}, d16
        vtbl.8          d30, {d30}, d16
        vtbl.8          d31, {d31}, d16

        vld1.32         {q8-q9}, [sp, :256]

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vadd.i32        q8, q8, q12
        vadd.i32        q9, q9, q13
        vadd.i32        q10, q10, q14
        vadd.i32        q11, q11, q15

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q4, q8
        veor            q9, q5, q9
        vshl.u32        q4, q8, #7
        vshl.u32        q5, q9, #7
        vsri.u32        q4, q8, #25
        vsri.u32        q5, q9, #25

        veor            q8, q6, q10
        veor            q9, q7, q11
        vshl.u32        q6, q8, #7
        vshl.u32        q7, q9, #7
        vsri.u32        q6, q8, #25
        vsri.u32        q7, q9, #25

        vld1.32         {q8-q9}, [sp, :256]

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vadd.i32        q0, q0, q5
        vadd.i32        q1, q1, q6
        vadd.i32        q2, q2, q7
        vadd.i32        q3, q3, q4

        veor            q15, q15, q0
        veor            q12, q12, q1
        veor            q13, q13, q2
        veor            q14, q14, q3

        vrev32.16       q15, q15
        vrev32.16       q12, q12
        vrev32.16       q13, q13
        vrev32.16       q14, q14

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vadd.i32        q10, q10, q15
        vadd.i32        q11, q11, q12
        vadd.i32        q8, q8, q13
        vadd.i32        q9, q9, q14

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q7, q8
        veor            q9, q4, q9
        vshl.u32        q7, q8, #12
        vshl.u32        q4, q9, #12
        vsri.u32        q7, q8, #20
        vsri.u32        q4, q9, #20

        veor            q8, q5, q10
        veor            q9, q6, q11
        vshl.u32        q5, q8, #12
        vshl.u32        q6, q9, #12
        vsri.u32        q5, q8, #20
        vsri.u32        q6, q9, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vld1.8          {d16}, [ip, :64]
        vadd.i32        q0, q0, q5
        vadd.i32        q1, q1, q6
        vadd.i32        q2, q2, q7
        vadd.i32        q3, q3, q4

        veor            q15, q15, q0
        veor            q12, q12, q1
        veor            q13, q13, q2
        veor            q14, q14, q3

        vtbl.8          d30, {d30}, d16
        vtbl.8          d31, {d31}, d16
        vtbl.8          d24, {d24}, d16
        vtbl.8          d25, {d25}, d16
        vtbl.8          d26, {d26}, d16
        vtbl.8          d27, {d27}, d16
        vtbl.8          d28, {d28}, d16
        vtbl.8          d29, {d29}, d16

        vld1.32         {q8-q9}, [sp, :256]

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vadd.i32        q10, q10, q15
        vadd.i32        q11, q11, q12
        vadd.i32        q8, q8, q13
        vadd.i32        q9, q9, q14

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q7, q8
        veor            q9, q4, q9
        vshl.u32        q7, q8, #7
        vshl.u32        q4, q9, #7
        vsri.u32        q7, q8, #25
        vsri.u32        q4, q9, #25

        veor            q8, q5, q10
        veor            q9, q6, q11
        vshl.u32        q5, q8, #7
        vshl.u32        q6, q9, #7
        vsri.u32        q5, q8, #25
        vsri.u32        q6, q9, #25

        subs            r3, r3, #2
        bne             .Ldoubleround4

        // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
        // x8..9[0-3] are on the stack.

        // Re-interleave the words in the first two rows of each block (x0..7).
        // Also add the counter values 0-3 to x12[0-3].
        vld1.32         {q8}, [r5, :128]        // load counter values 0-3
        vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
        vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
        vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
        vzip.32         q6, q7                  // => (6 7 6 7) (6 7 6 7)
        vadd.u32        q12, q8                 // x12 += counter values 0-3
        vswp            d1, d4
        vswp            d3, d6
        vld1.32         {q8-q9}, [r0]!          // load s0..7
        vswp            d9, d12
        vswp            d11, d14

        // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
        // after XORing the first 32 bytes.
        vswp            q1, q4

        // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

        // x0..3[0-3] += s0..3[0-3]      (add orig state to 1st row of each block)
        vadd.u32        q0, q0, q8
        vadd.u32        q2, q2, q8
        vadd.u32        q4, q4, q8
        vadd.u32        q3, q3, q8

        // x4..7[0-3] += s4..7[0-3]      (add orig state to 2nd row of each block)
        vadd.u32        q1, q1, q9
        vadd.u32        q6, q6, q9
        vadd.u32        q5, q5, q9
        vadd.u32        q7, q7, q9

        // XOR first 32 bytes using keystream from first two rows of first block
        vld1.8          {q8-q9}, [r2]!
        veor            q8, q8, q0
        veor            q9, q9, q1
        vst1.8          {q8-q9}, [r1]!

        // Re-interleave the words in the last two rows of each block (x8..15).
        vld1.32         {q8-q9}, [sp, :256]
        vzip.32         q12, q13                // => (12 13 12 13) (12 13 12 13)
        vzip.32         q14, q15                // => (14 15 14 15) (14 15 14 15)
        vzip.32         q8, q9                  // => (8 9 8 9) (8 9 8 9)
        vzip.32         q10, q11                // => (10 11 10 11) (10 11 10 11)
        vld1.32         {q0-q1}, [r0]           // load s8..15
        vswp            d25, d28
        vswp            d27, d30
        vswp            d17, d20
        vswp            d19, d22

        // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

        // x8..11[0-3] += s8..11[0-3]    (add orig state to 3rd row of each block)
        vadd.u32        q8, q8, q0
        vadd.u32        q10, q10, q0
        vadd.u32        q9, q9, q0
        vadd.u32        q11, q11, q0

        // x12..15[0-3] += s12..15[0-3]  (add orig state to 4th row of each block)
        vadd.u32        q12, q12, q1
        vadd.u32        q14, q14, q1
        vadd.u32        q13, q13, q1
        vadd.u32        q15, q15, q1

        // XOR the rest of the data with the keystream

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q8
        veor            q1, q1, q12
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q2
        veor            q1, q1, q6
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q10
        veor            q1, q1, q14
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q4
        veor            q1, q1, q5
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q9
        veor            q1, q1, q13
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q3
        veor            q1, q1, q7
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]
        mov             sp, r4                  // restore original stack pointer
        veor            q0, q0, q11
        veor            q1, q1, q15
        vst1.8          {q0-q1}, [r1]

        pop             {r4-r5}
        bx              lr
ENDPROC(chacha_4block_xor_neon)