/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
 *
 * (a)  vshl.u32 + vsri.u32   (needs temporary register)
 * (b)  vshl.u32 + vshr.u32 + vorr   (needs temporary register)
 * (c)  vrev32.16   (16-bit rotations only)
 * (d)  vtbl.8 + vtbl.8   (multiple of 8 bits rotations only,
 *                         needs index vector)
 *
 * ChaCha20 has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit
 * rotations, the only choices are (a) and (b).  We use (a) since it takes
 * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri.  Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */
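
/*
 * For illustration, option (a) is the two-instruction pair used for the
 * 12-bit rotations in the rounds below, e.g. rotating each 32-bit word of
 * q4 left by 12 into q1:
 *
 *      vshl.u32        q1, q4, #12     // q1 = q4 << 12
 *      vsri.u32        q1, q4, #20     // insert q4 >> 20 into the low bits
 */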

#include <linux/linkage.h>

        .text
        .fpu            neon
        .align          5

ENTRY(chacha20_block_xor_neon)
        // r0: Input state matrix, s
        // r1: 1 data block output, o
        // r2: 1 data block input, i

        //
        // This function encrypts one ChaCha20 block by loading the state matrix
        // in four NEON registers. It performs matrix operations on four words in
        // parallel, but requires shuffling to rearrange the words after each
        // round.
        //
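        // Rough register map for this function: q0-q3 hold the four rows of
        // the 4x4 state matrix (s0..3, s4..7, s8..11, s12..15), q8-q11 keep
        // a copy of the original state for the final addition, and q4 serves
        // as the rotate temporary.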

        // x0..3 = s0..3
        add             ip, r0, #0x20
        vld1.32         {q0-q1}, [r0]
        vld1.32         {q2-q3}, [ip]

        vmov            q8, q0
        vmov            q9, q1
        vmov            q10, q2
        vmov            q11, q3

        adr             ip, .Lrol8_table
        mov             r3, #10
        vld1.8          {d10}, [ip, :64]
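        // r3 counts the ten double rounds below (20 ChaCha20 rounds in
        // total); d10 now holds the vtbl.8 indices for the rotate-by-8.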

.Ldoubleround:
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vrev32.16       q3, q3

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #12
        vsri.u32        q1, q4, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vtbl.8          d6, {d6}, d10
        vtbl.8          d7, {d7}, d10

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #7
        vsri.u32        q1, q4, #25

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vext.8          q1, q1, q1, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vext.8          q2, q2, q2, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vext.8          q3, q3, q3, #12
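        // The vext rotations above realign rows 1-3 so that the second half
        // of the double round operates on the diagonals of the state matrix;
        // the mirrored shuffles at the end of the loop body restore the
        // column layout.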

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vrev32.16       q3, q3

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #12
        vsri.u32        q1, q4, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vtbl.8          d6, {d6}, d10
        vtbl.8          d7, {d7}, d10

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #7
        vsri.u32        q1, q4, #25

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vext.8          q1, q1, q1, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vext.8          q2, q2, q2, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vext.8          q3, q3, q3, #4

        subs            r3, r3, #1
        bne             .Ldoubleround

        add             ip, r2, #0x20
        vld1.8          {q4-q5}, [r2]
        vld1.8          {q6-q7}, [ip]

        // o0 = i0 ^ (x0 + s0)
        vadd.i32        q0, q0, q8
        veor            q0, q0, q4

        // o1 = i1 ^ (x1 + s1)
        vadd.i32        q1, q1, q9
        veor            q1, q1, q5

        // o2 = i2 ^ (x2 + s2)
        vadd.i32        q2, q2, q10
        veor            q2, q2, q6

        // o3 = i3 ^ (x3 + s3)
        vadd.i32        q3, q3, q11
        veor            q3, q3, q7

        add             ip, r1, #0x20
        vst1.8          {q0-q1}, [r1]
        vst1.8          {q2-q3}, [ip]

        bx              lr
ENDPROC(chacha20_block_xor_neon)

        .align          4
.Lctrinc:       .word   0, 1, 2, 3
.Lrol8_table:   .byte   3, 0, 1, 2, 7, 4, 5, 6
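// .Lctrinc supplies the per-block counter offsets added to x12 in the
// four-block routine below, and .Lrol8_table holds the vtbl.8 byte indices
// that rotate each 32-bit word left by 8 bits.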

        .align          5
ENTRY(chacha20_4block_xor_neon)
        push            {r4-r5}
        mov             r4, sp                  // preserve the stack pointer
        sub             ip, sp, #0x20           // allocate a 32 byte buffer
        bic             ip, ip, #0x1f           // aligned to 32 bytes
        mov             sp, ip

        // r0: Input state matrix, s
        // r1: 4 data blocks output, o
        // r2: 4 data blocks input, i

        //
        // This function encrypts four consecutive ChaCha20 blocks by loading
        // the state matrix in NEON registers four times. The algorithm performs
        // each operation on the corresponding word of each state matrix, hence
        // requires no word shuffling. The words are re-interleaved before the
        // final addition of the original state and the XORing step.
        //
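        // Rough register map for this function: after the vdup broadcasts
        // below, q0-q15 each hold one state word for all four blocks
        // (q<n> = x<n>[0-3]). Since that occupies every NEON register,
        // q8-q9 (x8..9) are spilled to a 32-byte aligned stack buffer
        // whenever scratch registers are needed.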

        // x0..15[0-3] = s0..15[0-3]
        add             ip, r0, #0x20
        vld1.32         {q0-q1}, [r0]
        vld1.32         {q2-q3}, [ip]

        adr             r5, .Lctrinc
        vdup.32         q15, d7[1]
        vdup.32         q14, d7[0]
        vld1.32         {q4}, [r5, :128]
        vdup.32         q13, d6[1]
        vdup.32         q12, d6[0]
        vdup.32         q11, d5[1]
        vdup.32         q10, d5[0]
        vadd.u32        q12, q12, q4            // x12 += counter values 0-3
        vdup.32         q9, d4[1]
        vdup.32         q8, d4[0]
        vdup.32         q7, d3[1]
        vdup.32         q6, d3[0]
        vdup.32         q5, d2[1]
        vdup.32         q4, d2[0]
        vdup.32         q3, d1[1]
        vdup.32         q2, d1[0]
        vdup.32         q1, d0[1]
        vdup.32         q0, d0[0]

        adr             ip, .Lrol8_table
        mov             r3, #10
        b               1f

.Ldoubleround4:
        vld1.32         {q8-q9}, [sp, :256]
1:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vadd.i32        q0, q0, q4
        vadd.i32        q1, q1, q5
        vadd.i32        q2, q2, q6
        vadd.i32        q3, q3, q7

        veor            q12, q12, q0
        veor            q13, q13, q1
        veor            q14, q14, q2
        veor            q15, q15, q3

        vrev32.16       q12, q12
        vrev32.16       q13, q13
        vrev32.16       q14, q14
        vrev32.16       q15, q15

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vadd.i32        q8, q8, q12
        vadd.i32        q9, q9, q13
        vadd.i32        q10, q10, q14
        vadd.i32        q11, q11, q15

        vst1.32         {q8-q9}, [sp, :256]
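        // x8..9 are spilled here so that q8/q9 can serve as scratch
        // registers for the shift-and-insert rotations; they are reloaded
        // from the stack before the next step that uses x8..9.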

        veor            q8, q4, q8
        veor            q9, q5, q9
        vshl.u32        q4, q8, #12
        vshl.u32        q5, q9, #12
        vsri.u32        q4, q8, #20
        vsri.u32        q5, q9, #20

        veor            q8, q6, q10
        veor            q9, q7, q11
        vshl.u32        q6, q8, #12
        vshl.u32        q7, q9, #12
        vsri.u32        q6, q8, #20
        vsri.u32        q7, q9, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vld1.8          {d16}, [ip, :64]
        vadd.i32        q0, q0, q4
        vadd.i32        q1, q1, q5
        vadd.i32        q2, q2, q6
        vadd.i32        q3, q3, q7

        veor            q12, q12, q0
        veor            q13, q13, q1
        veor            q14, q14, q2
        veor            q15, q15, q3

        vtbl.8          d24, {d24}, d16
        vtbl.8          d25, {d25}, d16
        vtbl.8          d26, {d26}, d16
        vtbl.8          d27, {d27}, d16
        vtbl.8          d28, {d28}, d16
        vtbl.8          d29, {d29}, d16
        vtbl.8          d30, {d30}, d16
        vtbl.8          d31, {d31}, d16

        vld1.32         {q8-q9}, [sp, :256]

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vadd.i32        q8, q8, q12
        vadd.i32        q9, q9, q13
        vadd.i32        q10, q10, q14
        vadd.i32        q11, q11, q15

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q4, q8
        veor            q9, q5, q9
        vshl.u32        q4, q8, #7
        vshl.u32        q5, q9, #7
        vsri.u32        q4, q8, #25
        vsri.u32        q5, q9, #25

        veor            q8, q6, q10
        veor            q9, q7, q11
        vshl.u32        q6, q8, #7
        vshl.u32        q7, q9, #7
        vsri.u32        q6, q8, #25
        vsri.u32        q7, q9, #25

        vld1.32         {q8-q9}, [sp, :256]

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vadd.i32        q0, q0, q5
        vadd.i32        q1, q1, q6
        vadd.i32        q2, q2, q7
        vadd.i32        q3, q3, q4

        veor            q15, q15, q0
        veor            q12, q12, q1
        veor            q13, q13, q2
        veor            q14, q14, q3

        vrev32.16       q15, q15
        vrev32.16       q12, q12
        vrev32.16       q13, q13
        vrev32.16       q14, q14

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vadd.i32        q10, q10, q15
        vadd.i32        q11, q11, q12
        vadd.i32        q8, q8, q13
        vadd.i32        q9, q9, q14

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q7, q8
        veor            q9, q4, q9
        vshl.u32        q7, q8, #12
        vshl.u32        q4, q9, #12
        vsri.u32        q7, q8, #20
        vsri.u32        q4, q9, #20

        veor            q8, q5, q10
        veor            q9, q6, q11
        vshl.u32        q5, q8, #12
        vshl.u32        q6, q9, #12
        vsri.u32        q5, q8, #20
        vsri.u32        q6, q9, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vld1.8          {d16}, [ip, :64]
        vadd.i32        q0, q0, q5
        vadd.i32        q1, q1, q6
        vadd.i32        q2, q2, q7
        vadd.i32        q3, q3, q4

        veor            q15, q15, q0
        veor            q12, q12, q1
        veor            q13, q13, q2
        veor            q14, q14, q3

        vtbl.8          d30, {d30}, d16
        vtbl.8          d31, {d31}, d16
        vtbl.8          d24, {d24}, d16
        vtbl.8          d25, {d25}, d16
        vtbl.8          d26, {d26}, d16
        vtbl.8          d27, {d27}, d16
        vtbl.8          d28, {d28}, d16
        vtbl.8          d29, {d29}, d16

        vld1.32         {q8-q9}, [sp, :256]

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vadd.i32        q10, q10, q15
        vadd.i32        q11, q11, q12
        vadd.i32        q8, q8, q13
        vadd.i32        q9, q9, q14

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q7, q8
        veor            q9, q4, q9
        vshl.u32        q7, q8, #7
        vshl.u32        q4, q9, #7
        vsri.u32        q7, q8, #25
        vsri.u32        q4, q9, #25

        veor            q8, q5, q10
        veor            q9, q6, q11
        vshl.u32        q5, q8, #7
        vshl.u32        q6, q9, #7
        vsri.u32        q5, q8, #25
        vsri.u32        q6, q9, #25

        subs            r3, r3, #1
        bne             .Ldoubleround4

        // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
        // x8..9[0-3] are on the stack.

        // Re-interleave the words in the first two rows of each block (x0..7).
        // Also add the counter values 0-3 to x12[0-3].
        vld1.32         {q8}, [r5, :128]        // load counter values 0-3
        vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
        vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
        vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
        vzip.32         q6, q7                  // => (6 7 6 7) (6 7 6 7)
        vadd.u32        q12, q8                 // x12 += counter values 0-3
        vswp            d1, d4
        vswp            d3, d6
        vld1.32         {q8-q9}, [r0]!          // load s0..7
        vswp            d9, d12
        vswp            d11, d14
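        // Together, the vzip.32/vswp pairs above perform a 4x4 word
        // transpose: each register goes from holding one word of all four
        // blocks to holding four consecutive words of a single block.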

        // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
        // after XORing the first 32 bytes.
        vswp            q1, q4

        // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

        // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
        vadd.u32        q0, q0, q8
        vadd.u32        q2, q2, q8
        vadd.u32        q4, q4, q8
        vadd.u32        q3, q3, q8

        // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
        vadd.u32        q1, q1, q9
        vadd.u32        q6, q6, q9
        vadd.u32        q5, q5, q9
        vadd.u32        q7, q7, q9

        // XOR first 32 bytes using keystream from first two rows of first block
        vld1.8          {q8-q9}, [r2]!
        veor            q8, q8, q0
        veor            q9, q9, q1
        vst1.8          {q8-q9}, [r1]!

        // Re-interleave the words in the last two rows of each block (x8..15).
        vld1.32         {q8-q9}, [sp, :256]
        vzip.32         q12, q13                // => (12 13 12 13) (12 13 12 13)
        vzip.32         q14, q15                // => (14 15 14 15) (14 15 14 15)
        vzip.32         q8, q9                  // => (8 9 8 9) (8 9 8 9)
        vzip.32         q10, q11                // => (10 11 10 11) (10 11 10 11)
        vld1.32         {q0-q1}, [r0]           // load s8..15
        vswp            d25, d28
        vswp            d27, d30
        vswp            d17, d20
        vswp            d19, d22

        // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

        // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
        vadd.u32        q8, q8, q0
        vadd.u32        q10, q10, q0
        vadd.u32        q9, q9, q0
        vadd.u32        q11, q11, q0

        // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
        vadd.u32        q12, q12, q1
        vadd.u32        q14, q14, q1
        vadd.u32        q13, q13, q1
        vadd.u32        q15, q15, q1

        // XOR the rest of the data with the keystream

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q8
        veor            q1, q1, q12
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q2
        veor            q1, q1, q6
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q10
        veor            q1, q1, q14
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q4
        veor            q1, q1, q5
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q9
        veor            q1, q1, q13
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q3
        veor            q1, q1, q7
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]
        mov             sp, r4                  // restore original stack pointer
        veor            q0, q0, q11
        veor            q1, q1, q15
        vst1.8          {q0-q1}, [r1]

        pop             {r4-r5}
        bx              lr
ENDPROC(chacha20_4block_xor_neon)