// Source blob: ff8f97775a3a3edab771524a1a4db0165aba9104
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
3
Robert Sloan726e9d12018-09-11 11:45:04 -07004#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if !defined(OPENSSL_NO_ASM)
David Benjamin4969cc92016-04-22 15:02:23 -040011#if defined(__aarch64__)
Robert Sloan726e9d12018-09-11 11:45:04 -070012#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
David Benjamin4969cc92016-04-22 15:02:23 -040015#include <openssl/arm_arch.h>
16
David Benjamin4969cc92016-04-22 15:02:23 -040017
18
// Read-only constants used by the ChaCha20 routines in this file.
.section .rodata

.align 5
// ChaCha constant, loaded as the first four state words. Read as
// little-endian bytes the two values spell "expand 32-byte k".
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
// Vector {1,0,0,0}. It sits 16 bytes after .Lsigma; the NEON code reaches it
// by post-incrementing its .Lsigma pointer and uses it as the block-counter
// increment.
.Lone:
.long 1,0,0,0
// Offset from this location to OPENSSL_armcap_P, pointer-sized for the ABI.
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII banner:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
34
.text

// ChaCha20_ctr32: XOR a buffer with the ChaCha20 keystream.
//
// Register arguments, as used by the code below:
//   x0  output pointer
//   x1  input pointer
//   x2  length in bytes (zero returns immediately)
//   x3  pointer to the 32-byte key
//   x4  pointer to the 16-byte counter block
//
// Inputs of 192 bytes or more are handed to ChaCha20_neon when
// OPENSSL_armcap_P has the ARMV7_NEON bit set. Everything else runs on the
// integer-only path below, one 64-byte block per .Loop_outer pass.
//
// Register roles on the integer path:
//   x22-x28,x30     input state, two 32-bit words per register
//                   (x22,x23 sigma; x24-x27 key; x28,x30 counter block).
//                   x30 is reused as data; the return address is reloaded
//                   from the frame on exit.
//   w5-w17,w19-w21  working copy of the 16 state words (x18 is not touched)
//   x4              double-round counter inside .Loop
// Stack: a 96-byte frame holding x29,x30 and x19-x28, plus 64 bytes of
// scratch at sp that receives the keystream of a final partial block.
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
	cbz x2,.Labort			// nothing to do for an empty buffer
	adrp x5,OPENSSL_armcap_P
	cmp x2,#192
	b.lo .Lshort			// short inputs always take the integer path
	add x5,x5,:lo12:OPENSSL_armcap_P
	ldr w17,[x5]
	tst w17,#ARMV7_NEON
	b.ne ChaCha20_neon

.Lshort:
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0

	adrp x5,.Lsigma
	add x5,x5,:lo12:.Lsigma
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	sub sp,sp,#64			// scratch for the tail keystream block

	ldp x22,x23,[x5] // load sigma
	ldp x24,x25,[x3] // load key
	ldp x26,x27,[x3,#16]
	ldp x28,x30,[x4] // load counter
#ifdef __ARMEB__
	// Swap the 32-bit halves so the first word in memory sits in the low half.
	ror x24,x24,#32
	ror x25,x25,#32
	ror x26,x26,#32
	ror x27,x27,#32
	ror x28,x28,#32
	ror x30,x30,#32
#endif

.Loop_outer:
	mov w5,w22 // unpack key block
	lsr x6,x22,#32
	mov w7,w23
	lsr x8,x23,#32
	mov w9,w24
	lsr x10,x24,#32
	mov w11,w25
	lsr x12,x25,#32
	mov w13,w26
	lsr x14,x26,#32
	mov w15,w27
	lsr x16,x27,#32
	mov w17,w28
	lsr x19,x28,#32
	mov w20,w30
	lsr x21,x30,#32

	mov x4,#10			// 10 double rounds
	// The flags set here stay live until the b.lo and b.hi after the loop:
	// no instruction in between writes the condition flags.
	subs x2,x2,#64
.Loop:
	sub x4,x4,#1
	// Column round: four quarter rounds issued side by side.
	add w5,w5,w9
	add w6,w6,w10
	add w7,w7,w11
	add w8,w8,w12
	eor w17,w17,w5
	eor w19,w19,w6
	eor w20,w20,w7
	eor w21,w21,w8
	ror w17,w17,#16
	ror w19,w19,#16
	ror w20,w20,#16
	ror w21,w21,#16
	add w13,w13,w17
	add w14,w14,w19
	add w15,w15,w20
	add w16,w16,w21
	eor w9,w9,w13
	eor w10,w10,w14
	eor w11,w11,w15
	eor w12,w12,w16
	ror w9,w9,#20			// rotate left by 12
	ror w10,w10,#20
	ror w11,w11,#20
	ror w12,w12,#20
	add w5,w5,w9
	add w6,w6,w10
	add w7,w7,w11
	add w8,w8,w12
	eor w17,w17,w5
	eor w19,w19,w6
	eor w20,w20,w7
	eor w21,w21,w8
	ror w17,w17,#24			// rotate left by 8
	ror w19,w19,#24
	ror w20,w20,#24
	ror w21,w21,#24
	add w13,w13,w17
	add w14,w14,w19
	add w15,w15,w20
	add w16,w16,w21
	eor w9,w9,w13
	eor w10,w10,w14
	eor w11,w11,w15
	eor w12,w12,w16
	ror w9,w9,#25			// rotate left by 7
	ror w10,w10,#25
	ror w11,w11,#25
	ror w12,w12,#25
	// Diagonal round: same operations with the operand registers permuted.
	add w5,w5,w10
	add w6,w6,w11
	add w7,w7,w12
	add w8,w8,w9
	eor w21,w21,w5
	eor w17,w17,w6
	eor w19,w19,w7
	eor w20,w20,w8
	ror w21,w21,#16
	ror w17,w17,#16
	ror w19,w19,#16
	ror w20,w20,#16
	add w15,w15,w21
	add w16,w16,w17
	add w13,w13,w19
	add w14,w14,w20
	eor w10,w10,w15
	eor w11,w11,w16
	eor w12,w12,w13
	eor w9,w9,w14
	ror w10,w10,#20
	ror w11,w11,#20
	ror w12,w12,#20
	ror w9,w9,#20
	add w5,w5,w10
	add w6,w6,w11
	add w7,w7,w12
	add w8,w8,w9
	eor w21,w21,w5
	eor w17,w17,w6
	eor w19,w19,w7
	eor w20,w20,w8
	ror w21,w21,#24
	ror w17,w17,#24
	ror w19,w19,#24
	ror w20,w20,#24
	add w15,w15,w21
	add w16,w16,w17
	add w13,w13,w19
	add w14,w14,w20
	eor w10,w10,w15
	eor w11,w11,w16
	eor w12,w12,w13
	eor w9,w9,w14
	ror w10,w10,#25
	ror w11,w11,#25
	ror w12,w12,#25
	ror w9,w9,#25
	cbnz x4,.Loop

	add w5,w5,w22 // accumulate key block
	add x6,x6,x22,lsr#32
	add w7,w7,w23
	add x8,x8,x23,lsr#32
	add w9,w9,w24
	add x10,x10,x24,lsr#32
	add w11,w11,w25
	add x12,x12,x25,lsr#32
	add w13,w13,w26
	add x14,x14,x26,lsr#32
	add w15,w15,w27
	add x16,x16,x27,lsr#32
	add w17,w17,w28
	add x19,x19,x28,lsr#32
	add w20,w20,w30
	add x21,x21,x30,lsr#32

	b.lo .Ltail			// fewer than 64 bytes were left before the subs

	add x5,x5,x6,lsl#32 // pack
	add x7,x7,x8,lsl#32
	ldp x6,x8,[x1,#0] // load input
	add x9,x9,x10,lsl#32
	add x11,x11,x12,lsl#32
	ldp x10,x12,[x1,#16]
	add x13,x13,x14,lsl#32
	add x15,x15,x16,lsl#32
	ldp x14,x16,[x1,#32]
	add x17,x17,x19,lsl#32
	add x20,x20,x21,lsl#32
	ldp x19,x21,[x1,#48]
	add x1,x1,#64
#ifdef __ARMEB__
	rev x5,x5
	rev x7,x7
	rev x9,x9
	rev x11,x11
	rev x13,x13
	rev x15,x15
	rev x17,x17
	rev x20,x20
#endif
	eor x5,x5,x6
	eor x7,x7,x8
	eor x9,x9,x10
	eor x11,x11,x12
	eor x13,x13,x14
	eor x15,x15,x16
	eor x17,x17,x19
	eor x20,x20,x21

	stp x5,x7,[x0,#0] // store output
	add x28,x28,#1 // increment counter
	stp x9,x11,[x0,#16]
	stp x13,x15,[x0,#32]
	stp x17,x20,[x0,#48]
	add x0,x0,#64

	b.hi .Loop_outer		// more input remains after this block

	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
.Labort:
	ret

.align 4
.Ltail:
	add x2,x2,#64			// undo the subs: x2 = bytes left (1..63)
.Less_than_64:
	// Also entered from ChaCha20_neon when fewer than 64 bytes remain.
	// x1, x0 and x4 are moved to the end of their buffers and x2 becomes a
	// negative index that counts up to zero. x0 is biased by one because
	// the store below indexes with x2 after it has been incremented.
	sub x0,x0,#1
	add x1,x1,x2
	add x0,x0,x2
	add x4,sp,x2
	neg x2,x2

	add x5,x5,x6,lsl#32 // pack
	add x7,x7,x8,lsl#32
	add x9,x9,x10,lsl#32
	add x11,x11,x12,lsl#32
	add x13,x13,x14,lsl#32
	add x15,x15,x16,lsl#32
	add x17,x17,x19,lsl#32
	add x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev x5,x5
	rev x7,x7
	rev x9,x9
	rev x11,x11
	rev x13,x13
	rev x15,x15
	rev x17,x17
	rev x20,x20
#endif
	// Park the keystream block in the stack scratch area.
	stp x5,x7,[sp,#0]
	stp x9,x11,[sp,#16]
	stp x13,x15,[sp,#32]
	stp x17,x20,[sp,#48]

.Loop_tail:
	ldrb w10,[x1,x2]		// input byte
	ldrb w11,[x4,x2]		// keystream byte
	add x2,x2,#1
	eor w10,w10,w11
	strb w10,[x0,x2]
	cbnz x2,.Loop_tail

	// Wipe the keystream copy from the stack before returning.
	stp xzr,xzr,[sp,#0]
	stp xzr,xzr,[sp,#16]
	stp xzr,xzr,[sp,#32]
	stp xzr,xzr,[sp,#48]

	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
322
// ChaCha20_neon: NEON-assisted bulk path, entered by a branch from
// ChaCha20_ctr32 (there is no .globl for it here). It sees the same register
// arguments: x0 output, x1 input, x2 length, x3 key, x4 counter block.
//
// Lengths of 512 bytes or more branch to .L512_or_more_neon right after the
// common register saves. Otherwise each .Loop_outer_neon pass produces 256
// bytes: one block in integer registers and three in vector registers, with
// the two instruction streams interleaved.
//
// Register roles:
//   x22-x28,x30     packed input state for the integer block
//   w5-w17,w19-w21  working words of the integer block
//   v24             sigma row
//   v25,v26         key rows
//   v27,v28,v29     counter rows of the three vector blocks
//                   (integer block counter +1, +2, +3)
//   v31             counter increment: {1,0,0,0} from .Lone, then shifted to 4
//   v0-v3, v4-v7, v16-v19   working state of the three vector blocks
//   v20-v23         temporaries and input staging
// Only v0-v7 and v16-v31 are written, so d8-d15 are left untouched.
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0

	adrp x5,.Lsigma
	add x5,x5,:lo12:.Lsigma
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	cmp x2,#512
	b.hs .L512_or_more_neon

	sub sp,sp,#64			// scratch for the tail keystream block

	ldp x22,x23,[x5] // load sigma
	ld1 {v24.4s},[x5],#16		// x5 now points at .Lone
	ldp x24,x25,[x3] // load key
	ldp x26,x27,[x3,#16]
	ld1 {v25.4s,v26.4s},[x3]
	ldp x28,x30,[x4] // load counter
	ld1 {v27.4s},[x4]
	ld1 {v31.4s},[x5]
#ifdef __ARMEB__
	rev64 v24.4s,v24.4s
	ror x24,x24,#32
	ror x25,x25,#32
	ror x26,x26,#32
	ror x27,x27,#32
	ror x28,x28,#32
	ror x30,x30,#32
#endif
	add v27.4s,v27.4s,v31.4s // += 1
	add v28.4s,v27.4s,v31.4s
	add v29.4s,v28.4s,v31.4s
	shl v31.4s,v31.4s,#2 // 1 -> 4

.Loop_outer_neon:
	mov w5,w22 // unpack key block
	lsr x6,x22,#32
	mov v0.16b,v24.16b
	mov w7,w23
	lsr x8,x23,#32
	mov v4.16b,v24.16b
	mov w9,w24
	lsr x10,x24,#32
	mov v16.16b,v24.16b
	mov w11,w25
	mov v1.16b,v25.16b
	lsr x12,x25,#32
	mov v5.16b,v25.16b
	mov w13,w26
	mov v17.16b,v25.16b
	lsr x14,x26,#32
	mov v3.16b,v27.16b
	mov w15,w27
	mov v7.16b,v28.16b
	lsr x16,x27,#32
	mov v19.16b,v29.16b
	mov w17,w28
	mov v2.16b,v26.16b
	lsr x19,x28,#32
	mov v6.16b,v26.16b
	mov w20,w30
	mov v18.16b,v26.16b
	lsr x21,x30,#32

	mov x4,#10			// 10 double rounds
	// The flags set here stay live until the b.lo and b.hi after the loop:
	// no instruction in between writes the condition flags.
	subs x2,x2,#256
.Loop_neon:
	sub x4,x4,#1
	// First half of the double round. Vector rotations are built from
	// rev32 (by 16) or a ushr/sli pair; the integer block uses ror.
	add v0.4s,v0.4s,v1.4s
	add w5,w5,w9
	add v4.4s,v4.4s,v5.4s
	add w6,w6,w10
	add v16.4s,v16.4s,v17.4s
	add w7,w7,w11
	eor v3.16b,v3.16b,v0.16b
	add w8,w8,w12
	eor v7.16b,v7.16b,v4.16b
	eor w17,w17,w5
	eor v19.16b,v19.16b,v16.16b
	eor w19,w19,w6
	rev32 v3.8h,v3.8h
	eor w20,w20,w7
	rev32 v7.8h,v7.8h
	eor w21,w21,w8
	rev32 v19.8h,v19.8h
	ror w17,w17,#16
	add v2.4s,v2.4s,v3.4s
	ror w19,w19,#16
	add v6.4s,v6.4s,v7.4s
	ror w20,w20,#16
	add v18.4s,v18.4s,v19.4s
	ror w21,w21,#16
	eor v20.16b,v1.16b,v2.16b
	add w13,w13,w17
	eor v21.16b,v5.16b,v6.16b
	add w14,w14,w19
	eor v22.16b,v17.16b,v18.16b
	add w15,w15,w20
	ushr v1.4s,v20.4s,#20
	add w16,w16,w21
	ushr v5.4s,v21.4s,#20
	eor w9,w9,w13
	ushr v17.4s,v22.4s,#20
	eor w10,w10,w14
	sli v1.4s,v20.4s,#12
	eor w11,w11,w15
	sli v5.4s,v21.4s,#12
	eor w12,w12,w16
	sli v17.4s,v22.4s,#12
	ror w9,w9,#20
	add v0.4s,v0.4s,v1.4s
	ror w10,w10,#20
	add v4.4s,v4.4s,v5.4s
	ror w11,w11,#20
	add v16.4s,v16.4s,v17.4s
	ror w12,w12,#20
	eor v20.16b,v3.16b,v0.16b
	add w5,w5,w9
	eor v21.16b,v7.16b,v4.16b
	add w6,w6,w10
	eor v22.16b,v19.16b,v16.16b
	add w7,w7,w11
	ushr v3.4s,v20.4s,#24
	add w8,w8,w12
	ushr v7.4s,v21.4s,#24
	eor w17,w17,w5
	ushr v19.4s,v22.4s,#24
	eor w19,w19,w6
	sli v3.4s,v20.4s,#8
	eor w20,w20,w7
	sli v7.4s,v21.4s,#8
	eor w21,w21,w8
	sli v19.4s,v22.4s,#8
	ror w17,w17,#24
	add v2.4s,v2.4s,v3.4s
	ror w19,w19,#24
	add v6.4s,v6.4s,v7.4s
	ror w20,w20,#24
	add v18.4s,v18.4s,v19.4s
	ror w21,w21,#24
	eor v20.16b,v1.16b,v2.16b
	add w13,w13,w17
	eor v21.16b,v5.16b,v6.16b
	add w14,w14,w19
	eor v22.16b,v17.16b,v18.16b
	add w15,w15,w20
	ushr v1.4s,v20.4s,#25
	add w16,w16,w21
	ushr v5.4s,v21.4s,#25
	eor w9,w9,w13
	ushr v17.4s,v22.4s,#25
	eor w10,w10,w14
	sli v1.4s,v20.4s,#7
	eor w11,w11,w15
	sli v5.4s,v21.4s,#7
	eor w12,w12,w16
	sli v17.4s,v22.4s,#7
	ror w9,w9,#25
	// Rotate the vector rows by 1, 2 and 3 lanes for the second half.
	ext v2.16b,v2.16b,v2.16b,#8
	ror w10,w10,#25
	ext v6.16b,v6.16b,v6.16b,#8
	ror w11,w11,#25
	ext v18.16b,v18.16b,v18.16b,#8
	ror w12,w12,#25
	ext v3.16b,v3.16b,v3.16b,#12
	ext v7.16b,v7.16b,v7.16b,#12
	ext v19.16b,v19.16b,v19.16b,#12
	ext v1.16b,v1.16b,v1.16b,#4
	ext v5.16b,v5.16b,v5.16b,#4
	ext v17.16b,v17.16b,v17.16b,#4
	// Second half of the double round.
	add v0.4s,v0.4s,v1.4s
	add w5,w5,w10
	add v4.4s,v4.4s,v5.4s
	add w6,w6,w11
	add v16.4s,v16.4s,v17.4s
	add w7,w7,w12
	eor v3.16b,v3.16b,v0.16b
	add w8,w8,w9
	eor v7.16b,v7.16b,v4.16b
	eor w21,w21,w5
	eor v19.16b,v19.16b,v16.16b
	eor w17,w17,w6
	rev32 v3.8h,v3.8h
	eor w19,w19,w7
	rev32 v7.8h,v7.8h
	eor w20,w20,w8
	rev32 v19.8h,v19.8h
	ror w21,w21,#16
	add v2.4s,v2.4s,v3.4s
	ror w17,w17,#16
	add v6.4s,v6.4s,v7.4s
	ror w19,w19,#16
	add v18.4s,v18.4s,v19.4s
	ror w20,w20,#16
	eor v20.16b,v1.16b,v2.16b
	add w15,w15,w21
	eor v21.16b,v5.16b,v6.16b
	add w16,w16,w17
	eor v22.16b,v17.16b,v18.16b
	add w13,w13,w19
	ushr v1.4s,v20.4s,#20
	add w14,w14,w20
	ushr v5.4s,v21.4s,#20
	eor w10,w10,w15
	ushr v17.4s,v22.4s,#20
	eor w11,w11,w16
	sli v1.4s,v20.4s,#12
	eor w12,w12,w13
	sli v5.4s,v21.4s,#12
	eor w9,w9,w14
	sli v17.4s,v22.4s,#12
	ror w10,w10,#20
	add v0.4s,v0.4s,v1.4s
	ror w11,w11,#20
	add v4.4s,v4.4s,v5.4s
	ror w12,w12,#20
	add v16.4s,v16.4s,v17.4s
	ror w9,w9,#20
	eor v20.16b,v3.16b,v0.16b
	add w5,w5,w10
	eor v21.16b,v7.16b,v4.16b
	add w6,w6,w11
	eor v22.16b,v19.16b,v16.16b
	add w7,w7,w12
	ushr v3.4s,v20.4s,#24
	add w8,w8,w9
	ushr v7.4s,v21.4s,#24
	eor w21,w21,w5
	ushr v19.4s,v22.4s,#24
	eor w17,w17,w6
	sli v3.4s,v20.4s,#8
	eor w19,w19,w7
	sli v7.4s,v21.4s,#8
	eor w20,w20,w8
	sli v19.4s,v22.4s,#8
	ror w21,w21,#24
	add v2.4s,v2.4s,v3.4s
	ror w17,w17,#24
	add v6.4s,v6.4s,v7.4s
	ror w19,w19,#24
	add v18.4s,v18.4s,v19.4s
	ror w20,w20,#24
	eor v20.16b,v1.16b,v2.16b
	add w15,w15,w21
	eor v21.16b,v5.16b,v6.16b
	add w16,w16,w17
	eor v22.16b,v17.16b,v18.16b
	add w13,w13,w19
	ushr v1.4s,v20.4s,#25
	add w14,w14,w20
	ushr v5.4s,v21.4s,#25
	eor w10,w10,w15
	ushr v17.4s,v22.4s,#25
	eor w11,w11,w16
	sli v1.4s,v20.4s,#7
	eor w12,w12,w13
	sli v5.4s,v21.4s,#7
	eor w9,w9,w14
	sli v17.4s,v22.4s,#7
	ror w10,w10,#25
	// Rotate the vector rows back to their original lane order.
	ext v2.16b,v2.16b,v2.16b,#8
	ror w11,w11,#25
	ext v6.16b,v6.16b,v6.16b,#8
	ror w12,w12,#25
	ext v18.16b,v18.16b,v18.16b,#8
	ror w9,w9,#25
	ext v3.16b,v3.16b,v3.16b,#4
	ext v7.16b,v7.16b,v7.16b,#4
	ext v19.16b,v19.16b,v19.16b,#4
	ext v1.16b,v1.16b,v1.16b,#12
	ext v5.16b,v5.16b,v5.16b,#12
	ext v17.16b,v17.16b,v17.16b,#12
	cbnz x4,.Loop_neon

	add w5,w5,w22 // accumulate key block
	add v0.4s,v0.4s,v24.4s
	add x6,x6,x22,lsr#32
	add v4.4s,v4.4s,v24.4s
	add w7,w7,w23
	add v16.4s,v16.4s,v24.4s
	add x8,x8,x23,lsr#32
	add v2.4s,v2.4s,v26.4s
	add w9,w9,w24
	add v6.4s,v6.4s,v26.4s
	add x10,x10,x24,lsr#32
	add v18.4s,v18.4s,v26.4s
	add w11,w11,w25
	add v3.4s,v3.4s,v27.4s
	add x12,x12,x25,lsr#32
	add w13,w13,w26
	add v7.4s,v7.4s,v28.4s
	add x14,x14,x26,lsr#32
	add w15,w15,w27
	add v19.4s,v19.4s,v29.4s
	add x16,x16,x27,lsr#32
	add w17,w17,w28
	add v1.4s,v1.4s,v25.4s
	add x19,x19,x28,lsr#32
	add w20,w20,w30
	add v5.4s,v5.4s,v25.4s
	add x21,x21,x30,lsr#32
	add v17.4s,v17.4s,v25.4s

	b.lo .Ltail_neon		// fewer than 256 bytes were left before the subs

	add x5,x5,x6,lsl#32 // pack
	add x7,x7,x8,lsl#32
	ldp x6,x8,[x1,#0] // load input
	add x9,x9,x10,lsl#32
	add x11,x11,x12,lsl#32
	ldp x10,x12,[x1,#16]
	add x13,x13,x14,lsl#32
	add x15,x15,x16,lsl#32
	ldp x14,x16,[x1,#32]
	add x17,x17,x19,lsl#32
	add x20,x20,x21,lsl#32
	ldp x19,x21,[x1,#48]
	add x1,x1,#64
#ifdef __ARMEB__
	rev x5,x5
	rev x7,x7
	rev x9,x9
	rev x11,x11
	rev x13,x13
	rev x15,x15
	rev x17,x17
	rev x20,x20
#endif
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64	// input for the 2nd block
	eor x5,x5,x6
	eor x7,x7,x8
	eor x9,x9,x10
	eor x11,x11,x12
	eor x13,x13,x14
	eor v0.16b,v0.16b,v20.16b
	eor x15,x15,x16
	eor v1.16b,v1.16b,v21.16b
	eor x17,x17,x19
	eor v2.16b,v2.16b,v22.16b
	eor x20,x20,x21
	eor v3.16b,v3.16b,v23.16b
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64	// input for the 3rd block

	stp x5,x7,[x0,#0] // store output
	add x28,x28,#4 // increment counter
	stp x9,x11,[x0,#16]
	add v27.4s,v27.4s,v31.4s // += 4
	stp x13,x15,[x0,#32]
	add v28.4s,v28.4s,v31.4s
	stp x17,x20,[x0,#48]
	add v29.4s,v29.4s,v31.4s
	add x0,x0,#64

	st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// input for the 4th block

	eor v4.16b,v4.16b,v20.16b
	eor v5.16b,v5.16b,v21.16b
	eor v6.16b,v6.16b,v22.16b
	eor v7.16b,v7.16b,v23.16b
	st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor v16.16b,v16.16b,v0.16b
	eor v17.16b,v17.16b,v1.16b
	eor v18.16b,v18.16b,v2.16b
	eor v19.16b,v19.16b,v3.16b
	st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi .Loop_outer_neon		// more input remains after these 256 bytes

	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	ret

.Ltail_neon:
	add x2,x2,#256			// undo the subs: x2 = bytes left (1..255)
	cmp x2,#64
	b.lo .Less_than_64		// partial-block code inside ChaCha20_ctr32

	add x5,x5,x6,lsl#32 // pack
	add x7,x7,x8,lsl#32
	ldp x6,x8,[x1,#0] // load input
	add x9,x9,x10,lsl#32
	add x11,x11,x12,lsl#32
	ldp x10,x12,[x1,#16]
	add x13,x13,x14,lsl#32
	add x15,x15,x16,lsl#32
	ldp x14,x16,[x1,#32]
	add x17,x17,x19,lsl#32
	add x20,x20,x21,lsl#32
	ldp x19,x21,[x1,#48]
	add x1,x1,#64
#ifdef __ARMEB__
	rev x5,x5
	rev x7,x7
	rev x9,x9
	rev x11,x11
	rev x13,x13
	rev x15,x15
	rev x17,x17
	rev x20,x20
#endif
	eor x5,x5,x6
	eor x7,x7,x8
	eor x9,x9,x10
	eor x11,x11,x12
	eor x13,x13,x14
	eor x15,x15,x16
	eor x17,x17,x19
	eor x20,x20,x21

	stp x5,x7,[x0,#0] // store output
	add x28,x28,#4 // increment counter
	stp x9,x11,[x0,#16]
	stp x13,x15,[x0,#32]
	stp x17,x20,[x0,#48]
	add x0,x0,#64
	// Each b.eq below consumes the flags of the closest preceding cmp;
	// the instructions in between do not write the condition flags.
	b.eq .Ldone_neon		// exactly 64 bytes were left
	sub x2,x2,#64
	cmp x2,#64
	b.lo .Less_than_128

	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor v0.16b,v0.16b,v20.16b
	eor v1.16b,v1.16b,v21.16b
	eor v2.16b,v2.16b,v22.16b
	eor v3.16b,v3.16b,v23.16b
	st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq .Ldone_neon
	sub x2,x2,#64
	cmp x2,#64
	b.lo .Less_than_192

	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor v4.16b,v4.16b,v20.16b
	eor v5.16b,v5.16b,v21.16b
	eor v6.16b,v6.16b,v22.16b
	eor v7.16b,v7.16b,v23.16b
	st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq .Ldone_neon
	sub x2,x2,#64

	// In each case the next unused keystream block goes to the stack
	// scratch area for the byte-wise loop.
	st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b .Last_neon

.Less_than_128:
	st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b .Last_neon
.Less_than_192:
	st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b .Last_neon

.align 4
.Last_neon:
	// x1, x0 and x4 are moved to the end of their buffers and x2 becomes a
	// negative index that counts up to zero. x0 is biased by one because
	// the store below indexes with x2 after it has been incremented.
	sub x0,x0,#1
	add x1,x1,x2
	add x0,x0,x2
	add x4,sp,x2
	neg x2,x2

.Loop_tail_neon:
	ldrb w10,[x1,x2]		// input byte
	ldrb w11,[x4,x2]		// keystream byte
	add x2,x2,#1
	eor w10,w10,w11
	strb w10,[x0,x2]
	cbnz x2,.Loop_tail_neon

	// Wipe the keystream copy from the stack before returning.
	stp xzr,xzr,[sp,#0]
	stp xzr,xzr,[sp,#16]
	stp xzr,xzr,[sp,#32]
	stp xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	ret
.size ChaCha20_neon,.-ChaCha20_neon
817.type ChaCha20_512_neon,%function
818.align 5
819ChaCha20_512_neon:
820 stp x29,x30,[sp,#-96]!
821 add x29,sp,#0
822
Robert Sloanc9abfe42018-11-26 12:19:07 -0800823 adrp x5,.Lsigma
824 add x5,x5,:lo12:.Lsigma
David Benjamin4969cc92016-04-22 15:02:23 -0400825 stp x19,x20,[sp,#16]
826 stp x21,x22,[sp,#32]
827 stp x23,x24,[sp,#48]
828 stp x25,x26,[sp,#64]
829 stp x27,x28,[sp,#80]
830
831.L512_or_more_neon:
832 sub sp,sp,#128+64
833
834 ldp x22,x23,[x5] // load sigma
835 ld1 {v24.4s},[x5],#16
836 ldp x24,x25,[x3] // load key
837 ldp x26,x27,[x3,#16]
838 ld1 {v25.4s,v26.4s},[x3]
839 ldp x28,x30,[x4] // load counter
840 ld1 {v27.4s},[x4]
841 ld1 {v31.4s},[x5]
842#ifdef __ARMEB__
843 rev64 v24.4s,v24.4s
844 ror x24,x24,#32
845 ror x25,x25,#32
846 ror x26,x26,#32
847 ror x27,x27,#32
848 ror x28,x28,#32
849 ror x30,x30,#32
850#endif
851 add v27.4s,v27.4s,v31.4s // += 1
852 stp q24,q25,[sp,#0] // off-load key block, invariant part
853 add v27.4s,v27.4s,v31.4s // not typo
854 str q26,[sp,#32]
855 add v28.4s,v27.4s,v31.4s
856 add v29.4s,v28.4s,v31.4s
857 add v30.4s,v29.4s,v31.4s
858 shl v31.4s,v31.4s,#2 // 1 -> 4
859
860 stp d8,d9,[sp,#128+0] // meet ABI requirements
861 stp d10,d11,[sp,#128+16]
862 stp d12,d13,[sp,#128+32]
863 stp d14,d15,[sp,#128+48]
864
865 sub x2,x2,#512 // not typo
866
867.Loop_outer_512_neon:
868 mov v0.16b,v24.16b
869 mov v4.16b,v24.16b
870 mov v8.16b,v24.16b
871 mov v12.16b,v24.16b
872 mov v16.16b,v24.16b
873 mov v20.16b,v24.16b
874 mov v1.16b,v25.16b
875 mov w5,w22 // unpack key block
876 mov v5.16b,v25.16b
877 lsr x6,x22,#32
878 mov v9.16b,v25.16b
879 mov w7,w23
880 mov v13.16b,v25.16b
881 lsr x8,x23,#32
882 mov v17.16b,v25.16b
883 mov w9,w24
884 mov v21.16b,v25.16b
885 lsr x10,x24,#32
886 mov v3.16b,v27.16b
887 mov w11,w25
888 mov v7.16b,v28.16b
889 lsr x12,x25,#32
890 mov v11.16b,v29.16b
891 mov w13,w26
892 mov v15.16b,v30.16b
893 lsr x14,x26,#32
894 mov v2.16b,v26.16b
895 mov w15,w27
896 mov v6.16b,v26.16b
897 lsr x16,x27,#32
898 add v19.4s,v3.4s,v31.4s // +4
899 mov w17,w28
900 add v23.4s,v7.4s,v31.4s // +4
901 lsr x19,x28,#32
902 mov v10.16b,v26.16b
903 mov w20,w30
904 mov v14.16b,v26.16b
905 lsr x21,x30,#32
906 mov v18.16b,v26.16b
907 stp q27,q28,[sp,#48] // off-load key block, variable part
908 mov v22.16b,v26.16b
909 str q29,[sp,#80]
910
911 mov x4,#5
912 subs x2,x2,#512
913.Loop_upper_neon:
914 sub x4,x4,#1
915 add v0.4s,v0.4s,v1.4s
916 add w5,w5,w9
917 add v4.4s,v4.4s,v5.4s
918 add w6,w6,w10
919 add v8.4s,v8.4s,v9.4s
920 add w7,w7,w11
921 add v12.4s,v12.4s,v13.4s
922 add w8,w8,w12
923 add v16.4s,v16.4s,v17.4s
924 eor w17,w17,w5
925 add v20.4s,v20.4s,v21.4s
926 eor w19,w19,w6
927 eor v3.16b,v3.16b,v0.16b
928 eor w20,w20,w7
929 eor v7.16b,v7.16b,v4.16b
930 eor w21,w21,w8
931 eor v11.16b,v11.16b,v8.16b
932 ror w17,w17,#16
933 eor v15.16b,v15.16b,v12.16b
934 ror w19,w19,#16
935 eor v19.16b,v19.16b,v16.16b
936 ror w20,w20,#16
937 eor v23.16b,v23.16b,v20.16b
938 ror w21,w21,#16
939 rev32 v3.8h,v3.8h
940 add w13,w13,w17
941 rev32 v7.8h,v7.8h
942 add w14,w14,w19
943 rev32 v11.8h,v11.8h
944 add w15,w15,w20
945 rev32 v15.8h,v15.8h
946 add w16,w16,w21
947 rev32 v19.8h,v19.8h
948 eor w9,w9,w13
949 rev32 v23.8h,v23.8h
950 eor w10,w10,w14
951 add v2.4s,v2.4s,v3.4s
952 eor w11,w11,w15
953 add v6.4s,v6.4s,v7.4s
954 eor w12,w12,w16
955 add v10.4s,v10.4s,v11.4s
956 ror w9,w9,#20
957 add v14.4s,v14.4s,v15.4s
958 ror w10,w10,#20
959 add v18.4s,v18.4s,v19.4s
960 ror w11,w11,#20
961 add v22.4s,v22.4s,v23.4s
962 ror w12,w12,#20
963 eor v24.16b,v1.16b,v2.16b
964 add w5,w5,w9
965 eor v25.16b,v5.16b,v6.16b
966 add w6,w6,w10
967 eor v26.16b,v9.16b,v10.16b
968 add w7,w7,w11
969 eor v27.16b,v13.16b,v14.16b
970 add w8,w8,w12
971 eor v28.16b,v17.16b,v18.16b
972 eor w17,w17,w5
973 eor v29.16b,v21.16b,v22.16b
974 eor w19,w19,w6
975 ushr v1.4s,v24.4s,#20
976 eor w20,w20,w7
977 ushr v5.4s,v25.4s,#20
978 eor w21,w21,w8
979 ushr v9.4s,v26.4s,#20
980 ror w17,w17,#24
981 ushr v13.4s,v27.4s,#20
982 ror w19,w19,#24
983 ushr v17.4s,v28.4s,#20
984 ror w20,w20,#24
985 ushr v21.4s,v29.4s,#20
986 ror w21,w21,#24
987 sli v1.4s,v24.4s,#12
988 add w13,w13,w17
989 sli v5.4s,v25.4s,#12
990 add w14,w14,w19
991 sli v9.4s,v26.4s,#12
992 add w15,w15,w20
993 sli v13.4s,v27.4s,#12
994 add w16,w16,w21
995 sli v17.4s,v28.4s,#12
996 eor w9,w9,w13
997 sli v21.4s,v29.4s,#12
998 eor w10,w10,w14
999 add v0.4s,v0.4s,v1.4s
1000 eor w11,w11,w15
1001 add v4.4s,v4.4s,v5.4s
1002 eor w12,w12,w16
1003 add v8.4s,v8.4s,v9.4s
1004 ror w9,w9,#25
1005 add v12.4s,v12.4s,v13.4s
1006 ror w10,w10,#25
1007 add v16.4s,v16.4s,v17.4s
1008 ror w11,w11,#25
1009 add v20.4s,v20.4s,v21.4s
1010 ror w12,w12,#25
1011 eor v24.16b,v3.16b,v0.16b
1012 add w5,w5,w10
1013 eor v25.16b,v7.16b,v4.16b
1014 add w6,w6,w11
1015 eor v26.16b,v11.16b,v8.16b
1016 add w7,w7,w12
1017 eor v27.16b,v15.16b,v12.16b
1018 add w8,w8,w9
1019 eor v28.16b,v19.16b,v16.16b
1020 eor w21,w21,w5
1021 eor v29.16b,v23.16b,v20.16b
1022 eor w17,w17,w6
1023 ushr v3.4s,v24.4s,#24
1024 eor w19,w19,w7
1025 ushr v7.4s,v25.4s,#24
1026 eor w20,w20,w8
1027 ushr v11.4s,v26.4s,#24
1028 ror w21,w21,#16
1029 ushr v15.4s,v27.4s,#24
1030 ror w17,w17,#16
1031 ushr v19.4s,v28.4s,#24
1032 ror w19,w19,#16
1033 ushr v23.4s,v29.4s,#24
1034 ror w20,w20,#16
1035 sli v3.4s,v24.4s,#8
1036 add w15,w15,w21
1037 sli v7.4s,v25.4s,#8
1038 add w16,w16,w17
1039 sli v11.4s,v26.4s,#8
1040 add w13,w13,w19
1041 sli v15.4s,v27.4s,#8
1042 add w14,w14,w20
1043 sli v19.4s,v28.4s,#8
1044 eor w10,w10,w15
1045 sli v23.4s,v29.4s,#8
1046 eor w11,w11,w16
1047 add v2.4s,v2.4s,v3.4s
1048 eor w12,w12,w13
1049 add v6.4s,v6.4s,v7.4s
1050 eor w9,w9,w14
1051 add v10.4s,v10.4s,v11.4s
1052 ror w10,w10,#20
1053 add v14.4s,v14.4s,v15.4s
1054 ror w11,w11,#20
1055 add v18.4s,v18.4s,v19.4s
1056 ror w12,w12,#20
1057 add v22.4s,v22.4s,v23.4s
1058 ror w9,w9,#20
1059 eor v24.16b,v1.16b,v2.16b
1060 add w5,w5,w10
1061 eor v25.16b,v5.16b,v6.16b
1062 add w6,w6,w11
1063 eor v26.16b,v9.16b,v10.16b
1064 add w7,w7,w12
1065 eor v27.16b,v13.16b,v14.16b
1066 add w8,w8,w9
1067 eor v28.16b,v17.16b,v18.16b
1068 eor w21,w21,w5
1069 eor v29.16b,v21.16b,v22.16b
1070 eor w17,w17,w6
1071 ushr v1.4s,v24.4s,#25
1072 eor w19,w19,w7
1073 ushr v5.4s,v25.4s,#25
1074 eor w20,w20,w8
1075 ushr v9.4s,v26.4s,#25
1076 ror w21,w21,#24
1077 ushr v13.4s,v27.4s,#25
1078 ror w17,w17,#24
1079 ushr v17.4s,v28.4s,#25
1080 ror w19,w19,#24
1081 ushr v21.4s,v29.4s,#25
1082 ror w20,w20,#24
1083 sli v1.4s,v24.4s,#7
1084 add w15,w15,w21
1085 sli v5.4s,v25.4s,#7
1086 add w16,w16,w17
1087 sli v9.4s,v26.4s,#7
1088 add w13,w13,w19
1089 sli v13.4s,v27.4s,#7
1090 add w14,w14,w20
1091 sli v17.4s,v28.4s,#7
1092 eor w10,w10,w15
1093 sli v21.4s,v29.4s,#7
1094 eor w11,w11,w16
1095 ext v2.16b,v2.16b,v2.16b,#8
1096 eor w12,w12,w13
1097 ext v6.16b,v6.16b,v6.16b,#8
1098 eor w9,w9,w14
1099 ext v10.16b,v10.16b,v10.16b,#8
1100 ror w10,w10,#25
1101 ext v14.16b,v14.16b,v14.16b,#8
1102 ror w11,w11,#25
1103 ext v18.16b,v18.16b,v18.16b,#8
1104 ror w12,w12,#25
1105 ext v22.16b,v22.16b,v22.16b,#8
1106 ror w9,w9,#25
1107 ext v3.16b,v3.16b,v3.16b,#12
1108 ext v7.16b,v7.16b,v7.16b,#12
1109 ext v11.16b,v11.16b,v11.16b,#12
1110 ext v15.16b,v15.16b,v15.16b,#12
1111 ext v19.16b,v19.16b,v19.16b,#12
1112 ext v23.16b,v23.16b,v23.16b,#12
1113 ext v1.16b,v1.16b,v1.16b,#4
1114 ext v5.16b,v5.16b,v5.16b,#4
1115 ext v9.16b,v9.16b,v9.16b,#4
1116 ext v13.16b,v13.16b,v13.16b,#4
1117 ext v17.16b,v17.16b,v17.16b,#4
1118 ext v21.16b,v21.16b,v21.16b,#4
1119 add v0.4s,v0.4s,v1.4s
1120 add w5,w5,w9
1121 add v4.4s,v4.4s,v5.4s
1122 add w6,w6,w10
1123 add v8.4s,v8.4s,v9.4s
1124 add w7,w7,w11
1125 add v12.4s,v12.4s,v13.4s
1126 add w8,w8,w12
1127 add v16.4s,v16.4s,v17.4s
1128 eor w17,w17,w5
1129 add v20.4s,v20.4s,v21.4s
1130 eor w19,w19,w6
1131 eor v3.16b,v3.16b,v0.16b
1132 eor w20,w20,w7
1133 eor v7.16b,v7.16b,v4.16b
1134 eor w21,w21,w8
1135 eor v11.16b,v11.16b,v8.16b
1136 ror w17,w17,#16
1137 eor v15.16b,v15.16b,v12.16b
1138 ror w19,w19,#16
1139 eor v19.16b,v19.16b,v16.16b
1140 ror w20,w20,#16
1141 eor v23.16b,v23.16b,v20.16b
1142 ror w21,w21,#16
1143 rev32 v3.8h,v3.8h
1144 add w13,w13,w17
1145 rev32 v7.8h,v7.8h
1146 add w14,w14,w19
1147 rev32 v11.8h,v11.8h
1148 add w15,w15,w20
1149 rev32 v15.8h,v15.8h
1150 add w16,w16,w21
1151 rev32 v19.8h,v19.8h
1152 eor w9,w9,w13
1153 rev32 v23.8h,v23.8h
1154 eor w10,w10,w14
1155 add v2.4s,v2.4s,v3.4s
1156 eor w11,w11,w15
1157 add v6.4s,v6.4s,v7.4s
1158 eor w12,w12,w16
1159 add v10.4s,v10.4s,v11.4s
1160 ror w9,w9,#20
1161 add v14.4s,v14.4s,v15.4s
1162 ror w10,w10,#20
1163 add v18.4s,v18.4s,v19.4s
1164 ror w11,w11,#20
1165 add v22.4s,v22.4s,v23.4s
1166 ror w12,w12,#20
1167 eor v24.16b,v1.16b,v2.16b
1168 add w5,w5,w9
1169 eor v25.16b,v5.16b,v6.16b
1170 add w6,w6,w10
1171 eor v26.16b,v9.16b,v10.16b
1172 add w7,w7,w11
1173 eor v27.16b,v13.16b,v14.16b
1174 add w8,w8,w12
1175 eor v28.16b,v17.16b,v18.16b
1176 eor w17,w17,w5
1177 eor v29.16b,v21.16b,v22.16b
1178 eor w19,w19,w6
1179 ushr v1.4s,v24.4s,#20
1180 eor w20,w20,w7
1181 ushr v5.4s,v25.4s,#20
1182 eor w21,w21,w8
1183 ushr v9.4s,v26.4s,#20
1184 ror w17,w17,#24
1185 ushr v13.4s,v27.4s,#20
1186 ror w19,w19,#24
1187 ushr v17.4s,v28.4s,#20
1188 ror w20,w20,#24
1189 ushr v21.4s,v29.4s,#20
1190 ror w21,w21,#24
1191 sli v1.4s,v24.4s,#12
1192 add w13,w13,w17
1193 sli v5.4s,v25.4s,#12
1194 add w14,w14,w19
1195 sli v9.4s,v26.4s,#12
1196 add w15,w15,w20
1197 sli v13.4s,v27.4s,#12
1198 add w16,w16,w21
1199 sli v17.4s,v28.4s,#12
1200 eor w9,w9,w13
1201 sli v21.4s,v29.4s,#12
1202 eor w10,w10,w14
1203 add v0.4s,v0.4s,v1.4s
1204 eor w11,w11,w15
1205 add v4.4s,v4.4s,v5.4s
1206 eor w12,w12,w16
1207 add v8.4s,v8.4s,v9.4s
1208 ror w9,w9,#25
1209 add v12.4s,v12.4s,v13.4s
1210 ror w10,w10,#25
1211 add v16.4s,v16.4s,v17.4s
1212 ror w11,w11,#25
1213 add v20.4s,v20.4s,v21.4s
1214 ror w12,w12,#25
1215 eor v24.16b,v3.16b,v0.16b
1216 add w5,w5,w10
1217 eor v25.16b,v7.16b,v4.16b
1218 add w6,w6,w11
1219 eor v26.16b,v11.16b,v8.16b
1220 add w7,w7,w12
1221 eor v27.16b,v15.16b,v12.16b
1222 add w8,w8,w9
1223 eor v28.16b,v19.16b,v16.16b
1224 eor w21,w21,w5
1225 eor v29.16b,v23.16b,v20.16b
1226 eor w17,w17,w6
1227 ushr v3.4s,v24.4s,#24
1228 eor w19,w19,w7
1229 ushr v7.4s,v25.4s,#24
1230 eor w20,w20,w8
1231 ushr v11.4s,v26.4s,#24
1232 ror w21,w21,#16
1233 ushr v15.4s,v27.4s,#24
1234 ror w17,w17,#16
1235 ushr v19.4s,v28.4s,#24
1236 ror w19,w19,#16
1237 ushr v23.4s,v29.4s,#24
1238 ror w20,w20,#16
1239 sli v3.4s,v24.4s,#8
1240 add w15,w15,w21
1241 sli v7.4s,v25.4s,#8
1242 add w16,w16,w17
1243 sli v11.4s,v26.4s,#8
1244 add w13,w13,w19
1245 sli v15.4s,v27.4s,#8
1246 add w14,w14,w20
1247 sli v19.4s,v28.4s,#8
1248 eor w10,w10,w15
1249 sli v23.4s,v29.4s,#8
1250 eor w11,w11,w16
1251 add v2.4s,v2.4s,v3.4s
1252 eor w12,w12,w13
1253 add v6.4s,v6.4s,v7.4s
1254 eor w9,w9,w14
1255 add v10.4s,v10.4s,v11.4s
1256 ror w10,w10,#20
1257 add v14.4s,v14.4s,v15.4s
1258 ror w11,w11,#20
1259 add v18.4s,v18.4s,v19.4s
1260 ror w12,w12,#20
1261 add v22.4s,v22.4s,v23.4s
1262 ror w9,w9,#20
1263 eor v24.16b,v1.16b,v2.16b
1264 add w5,w5,w10
1265 eor v25.16b,v5.16b,v6.16b
1266 add w6,w6,w11
1267 eor v26.16b,v9.16b,v10.16b
1268 add w7,w7,w12
1269 eor v27.16b,v13.16b,v14.16b
1270 add w8,w8,w9
1271 eor v28.16b,v17.16b,v18.16b
1272 eor w21,w21,w5
1273 eor v29.16b,v21.16b,v22.16b
1274 eor w17,w17,w6
1275 ushr v1.4s,v24.4s,#25
1276 eor w19,w19,w7
1277 ushr v5.4s,v25.4s,#25
1278 eor w20,w20,w8
1279 ushr v9.4s,v26.4s,#25
1280 ror w21,w21,#24
1281 ushr v13.4s,v27.4s,#25
1282 ror w17,w17,#24
1283 ushr v17.4s,v28.4s,#25
1284 ror w19,w19,#24
1285 ushr v21.4s,v29.4s,#25
1286 ror w20,w20,#24
1287 sli v1.4s,v24.4s,#7
1288 add w15,w15,w21
1289 sli v5.4s,v25.4s,#7
1290 add w16,w16,w17
1291 sli v9.4s,v26.4s,#7
1292 add w13,w13,w19
1293 sli v13.4s,v27.4s,#7
1294 add w14,w14,w20
1295 sli v17.4s,v28.4s,#7
1296 eor w10,w10,w15
1297 sli v21.4s,v29.4s,#7
1298 eor w11,w11,w16
1299 ext v2.16b,v2.16b,v2.16b,#8
1300 eor w12,w12,w13
1301 ext v6.16b,v6.16b,v6.16b,#8
1302 eor w9,w9,w14
1303 ext v10.16b,v10.16b,v10.16b,#8
1304 ror w10,w10,#25
1305 ext v14.16b,v14.16b,v14.16b,#8
1306 ror w11,w11,#25
1307 ext v18.16b,v18.16b,v18.16b,#8
1308 ror w12,w12,#25
1309 ext v22.16b,v22.16b,v22.16b,#8
1310 ror w9,w9,#25
1311 ext v3.16b,v3.16b,v3.16b,#4
1312 ext v7.16b,v7.16b,v7.16b,#4
1313 ext v11.16b,v11.16b,v11.16b,#4
1314 ext v15.16b,v15.16b,v15.16b,#4
1315 ext v19.16b,v19.16b,v19.16b,#4
1316 ext v23.16b,v23.16b,v23.16b,#4
1317 ext v1.16b,v1.16b,v1.16b,#12
1318 ext v5.16b,v5.16b,v5.16b,#12
1319 ext v9.16b,v9.16b,v9.16b,#12
1320 ext v13.16b,v13.16b,v13.16b,#12
1321 ext v17.16b,v17.16b,v17.16b,#12
1322 ext v21.16b,v21.16b,v21.16b,#12
1323 cbnz x4,.Loop_upper_neon
1324
1325 add w5,w5,w22 // accumulate key block
1326 add x6,x6,x22,lsr#32
1327 add w7,w7,w23
1328 add x8,x8,x23,lsr#32
1329 add w9,w9,w24
1330 add x10,x10,x24,lsr#32
1331 add w11,w11,w25
1332 add x12,x12,x25,lsr#32
1333 add w13,w13,w26
1334 add x14,x14,x26,lsr#32
1335 add w15,w15,w27
1336 add x16,x16,x27,lsr#32
1337 add w17,w17,w28
1338 add x19,x19,x28,lsr#32
1339 add w20,w20,w30
1340 add x21,x21,x30,lsr#32
1341
1342 add x5,x5,x6,lsl#32 // pack
1343 add x7,x7,x8,lsl#32
1344 ldp x6,x8,[x1,#0] // load input
1345 add x9,x9,x10,lsl#32
1346 add x11,x11,x12,lsl#32
1347 ldp x10,x12,[x1,#16]
1348 add x13,x13,x14,lsl#32
1349 add x15,x15,x16,lsl#32
1350 ldp x14,x16,[x1,#32]
1351 add x17,x17,x19,lsl#32
1352 add x20,x20,x21,lsl#32
1353 ldp x19,x21,[x1,#48]
1354 add x1,x1,#64
1355#ifdef __ARMEB__
1356 rev x5,x5
1357 rev x7,x7
1358 rev x9,x9
1359 rev x11,x11
1360 rev x13,x13
1361 rev x15,x15
1362 rev x17,x17
1363 rev x20,x20
1364#endif
1365 eor x5,x5,x6
1366 eor x7,x7,x8
1367 eor x9,x9,x10
1368 eor x11,x11,x12
1369 eor x13,x13,x14
1370 eor x15,x15,x16
1371 eor x17,x17,x19
1372 eor x20,x20,x21
1373
1374 stp x5,x7,[x0,#0] // store output
1375 add x28,x28,#1 // increment counter
1376 mov w5,w22 // unpack key block
1377 lsr x6,x22,#32
1378 stp x9,x11,[x0,#16]
1379 mov w7,w23
1380 lsr x8,x23,#32
1381 stp x13,x15,[x0,#32]
1382 mov w9,w24
1383 lsr x10,x24,#32
1384 stp x17,x20,[x0,#48]
1385 add x0,x0,#64
1386 mov w11,w25
1387 lsr x12,x25,#32
1388 mov w13,w26
1389 lsr x14,x26,#32
1390 mov w15,w27
1391 lsr x16,x27,#32
1392 mov w17,w28
1393 lsr x19,x28,#32
1394 mov w20,w30
1395 lsr x21,x30,#32
1396
1397 mov x4,#5
1398.Loop_lower_neon:
1399 sub x4,x4,#1
1400 add v0.4s,v0.4s,v1.4s
1401 add w5,w5,w9
1402 add v4.4s,v4.4s,v5.4s
1403 add w6,w6,w10
1404 add v8.4s,v8.4s,v9.4s
1405 add w7,w7,w11
1406 add v12.4s,v12.4s,v13.4s
1407 add w8,w8,w12
1408 add v16.4s,v16.4s,v17.4s
1409 eor w17,w17,w5
1410 add v20.4s,v20.4s,v21.4s
1411 eor w19,w19,w6
1412 eor v3.16b,v3.16b,v0.16b
1413 eor w20,w20,w7
1414 eor v7.16b,v7.16b,v4.16b
1415 eor w21,w21,w8
1416 eor v11.16b,v11.16b,v8.16b
1417 ror w17,w17,#16
1418 eor v15.16b,v15.16b,v12.16b
1419 ror w19,w19,#16
1420 eor v19.16b,v19.16b,v16.16b
1421 ror w20,w20,#16
1422 eor v23.16b,v23.16b,v20.16b
1423 ror w21,w21,#16
1424 rev32 v3.8h,v3.8h
1425 add w13,w13,w17
1426 rev32 v7.8h,v7.8h
1427 add w14,w14,w19
1428 rev32 v11.8h,v11.8h
1429 add w15,w15,w20
1430 rev32 v15.8h,v15.8h
1431 add w16,w16,w21
1432 rev32 v19.8h,v19.8h
1433 eor w9,w9,w13
1434 rev32 v23.8h,v23.8h
1435 eor w10,w10,w14
1436 add v2.4s,v2.4s,v3.4s
1437 eor w11,w11,w15
1438 add v6.4s,v6.4s,v7.4s
1439 eor w12,w12,w16
1440 add v10.4s,v10.4s,v11.4s
1441 ror w9,w9,#20
1442 add v14.4s,v14.4s,v15.4s
1443 ror w10,w10,#20
1444 add v18.4s,v18.4s,v19.4s
1445 ror w11,w11,#20
1446 add v22.4s,v22.4s,v23.4s
1447 ror w12,w12,#20
1448 eor v24.16b,v1.16b,v2.16b
1449 add w5,w5,w9
1450 eor v25.16b,v5.16b,v6.16b
1451 add w6,w6,w10
1452 eor v26.16b,v9.16b,v10.16b
1453 add w7,w7,w11
1454 eor v27.16b,v13.16b,v14.16b
1455 add w8,w8,w12
1456 eor v28.16b,v17.16b,v18.16b
1457 eor w17,w17,w5
1458 eor v29.16b,v21.16b,v22.16b
1459 eor w19,w19,w6
1460 ushr v1.4s,v24.4s,#20
1461 eor w20,w20,w7
1462 ushr v5.4s,v25.4s,#20
1463 eor w21,w21,w8
1464 ushr v9.4s,v26.4s,#20
1465 ror w17,w17,#24
1466 ushr v13.4s,v27.4s,#20
1467 ror w19,w19,#24
1468 ushr v17.4s,v28.4s,#20
1469 ror w20,w20,#24
1470 ushr v21.4s,v29.4s,#20
1471 ror w21,w21,#24
1472 sli v1.4s,v24.4s,#12
1473 add w13,w13,w17
1474 sli v5.4s,v25.4s,#12
1475 add w14,w14,w19
1476 sli v9.4s,v26.4s,#12
1477 add w15,w15,w20
1478 sli v13.4s,v27.4s,#12
1479 add w16,w16,w21
1480 sli v17.4s,v28.4s,#12
1481 eor w9,w9,w13
1482 sli v21.4s,v29.4s,#12
1483 eor w10,w10,w14
1484 add v0.4s,v0.4s,v1.4s
1485 eor w11,w11,w15
1486 add v4.4s,v4.4s,v5.4s
1487 eor w12,w12,w16
1488 add v8.4s,v8.4s,v9.4s
1489 ror w9,w9,#25
1490 add v12.4s,v12.4s,v13.4s
1491 ror w10,w10,#25
1492 add v16.4s,v16.4s,v17.4s
1493 ror w11,w11,#25
1494 add v20.4s,v20.4s,v21.4s
1495 ror w12,w12,#25
1496 eor v24.16b,v3.16b,v0.16b
1497 add w5,w5,w10
1498 eor v25.16b,v7.16b,v4.16b
1499 add w6,w6,w11
1500 eor v26.16b,v11.16b,v8.16b
1501 add w7,w7,w12
1502 eor v27.16b,v15.16b,v12.16b
1503 add w8,w8,w9
1504 eor v28.16b,v19.16b,v16.16b
1505 eor w21,w21,w5
1506 eor v29.16b,v23.16b,v20.16b
1507 eor w17,w17,w6
1508 ushr v3.4s,v24.4s,#24
1509 eor w19,w19,w7
1510 ushr v7.4s,v25.4s,#24
1511 eor w20,w20,w8
1512 ushr v11.4s,v26.4s,#24
1513 ror w21,w21,#16
1514 ushr v15.4s,v27.4s,#24
1515 ror w17,w17,#16
1516 ushr v19.4s,v28.4s,#24
1517 ror w19,w19,#16
1518 ushr v23.4s,v29.4s,#24
1519 ror w20,w20,#16
1520 sli v3.4s,v24.4s,#8
1521 add w15,w15,w21
1522 sli v7.4s,v25.4s,#8
1523 add w16,w16,w17
1524 sli v11.4s,v26.4s,#8
1525 add w13,w13,w19
1526 sli v15.4s,v27.4s,#8
1527 add w14,w14,w20
1528 sli v19.4s,v28.4s,#8
1529 eor w10,w10,w15
1530 sli v23.4s,v29.4s,#8
1531 eor w11,w11,w16
1532 add v2.4s,v2.4s,v3.4s
1533 eor w12,w12,w13
1534 add v6.4s,v6.4s,v7.4s
1535 eor w9,w9,w14
1536 add v10.4s,v10.4s,v11.4s
1537 ror w10,w10,#20
1538 add v14.4s,v14.4s,v15.4s
1539 ror w11,w11,#20
1540 add v18.4s,v18.4s,v19.4s
1541 ror w12,w12,#20
1542 add v22.4s,v22.4s,v23.4s
1543 ror w9,w9,#20
1544 eor v24.16b,v1.16b,v2.16b
1545 add w5,w5,w10
1546 eor v25.16b,v5.16b,v6.16b
1547 add w6,w6,w11
1548 eor v26.16b,v9.16b,v10.16b
1549 add w7,w7,w12
1550 eor v27.16b,v13.16b,v14.16b
1551 add w8,w8,w9
1552 eor v28.16b,v17.16b,v18.16b
1553 eor w21,w21,w5
1554 eor v29.16b,v21.16b,v22.16b
1555 eor w17,w17,w6
1556 ushr v1.4s,v24.4s,#25
1557 eor w19,w19,w7
1558 ushr v5.4s,v25.4s,#25
1559 eor w20,w20,w8
1560 ushr v9.4s,v26.4s,#25
1561 ror w21,w21,#24
1562 ushr v13.4s,v27.4s,#25
1563 ror w17,w17,#24
1564 ushr v17.4s,v28.4s,#25
1565 ror w19,w19,#24
1566 ushr v21.4s,v29.4s,#25
1567 ror w20,w20,#24
1568 sli v1.4s,v24.4s,#7
1569 add w15,w15,w21
1570 sli v5.4s,v25.4s,#7
1571 add w16,w16,w17
1572 sli v9.4s,v26.4s,#7
1573 add w13,w13,w19
1574 sli v13.4s,v27.4s,#7
1575 add w14,w14,w20
1576 sli v17.4s,v28.4s,#7
1577 eor w10,w10,w15
1578 sli v21.4s,v29.4s,#7
1579 eor w11,w11,w16
1580 ext v2.16b,v2.16b,v2.16b,#8
1581 eor w12,w12,w13
1582 ext v6.16b,v6.16b,v6.16b,#8
1583 eor w9,w9,w14
1584 ext v10.16b,v10.16b,v10.16b,#8
1585 ror w10,w10,#25
1586 ext v14.16b,v14.16b,v14.16b,#8
1587 ror w11,w11,#25
1588 ext v18.16b,v18.16b,v18.16b,#8
1589 ror w12,w12,#25
1590 ext v22.16b,v22.16b,v22.16b,#8
1591 ror w9,w9,#25
1592 ext v3.16b,v3.16b,v3.16b,#12
1593 ext v7.16b,v7.16b,v7.16b,#12
1594 ext v11.16b,v11.16b,v11.16b,#12
1595 ext v15.16b,v15.16b,v15.16b,#12
1596 ext v19.16b,v19.16b,v19.16b,#12
1597 ext v23.16b,v23.16b,v23.16b,#12
1598 ext v1.16b,v1.16b,v1.16b,#4
1599 ext v5.16b,v5.16b,v5.16b,#4
1600 ext v9.16b,v9.16b,v9.16b,#4
1601 ext v13.16b,v13.16b,v13.16b,#4
1602 ext v17.16b,v17.16b,v17.16b,#4
1603 ext v21.16b,v21.16b,v21.16b,#4
1604 add v0.4s,v0.4s,v1.4s
1605 add w5,w5,w9
1606 add v4.4s,v4.4s,v5.4s
1607 add w6,w6,w10
1608 add v8.4s,v8.4s,v9.4s
1609 add w7,w7,w11
1610 add v12.4s,v12.4s,v13.4s
1611 add w8,w8,w12
1612 add v16.4s,v16.4s,v17.4s
1613 eor w17,w17,w5
1614 add v20.4s,v20.4s,v21.4s
1615 eor w19,w19,w6
1616 eor v3.16b,v3.16b,v0.16b
1617 eor w20,w20,w7
1618 eor v7.16b,v7.16b,v4.16b
1619 eor w21,w21,w8
1620 eor v11.16b,v11.16b,v8.16b
1621 ror w17,w17,#16
1622 eor v15.16b,v15.16b,v12.16b
1623 ror w19,w19,#16
1624 eor v19.16b,v19.16b,v16.16b
1625 ror w20,w20,#16
1626 eor v23.16b,v23.16b,v20.16b
1627 ror w21,w21,#16
1628 rev32 v3.8h,v3.8h
1629 add w13,w13,w17
1630 rev32 v7.8h,v7.8h
1631 add w14,w14,w19
1632 rev32 v11.8h,v11.8h
1633 add w15,w15,w20
1634 rev32 v15.8h,v15.8h
1635 add w16,w16,w21
1636 rev32 v19.8h,v19.8h
1637 eor w9,w9,w13
1638 rev32 v23.8h,v23.8h
1639 eor w10,w10,w14
1640 add v2.4s,v2.4s,v3.4s
1641 eor w11,w11,w15
1642 add v6.4s,v6.4s,v7.4s
1643 eor w12,w12,w16
1644 add v10.4s,v10.4s,v11.4s
1645 ror w9,w9,#20
1646 add v14.4s,v14.4s,v15.4s
1647 ror w10,w10,#20
1648 add v18.4s,v18.4s,v19.4s
1649 ror w11,w11,#20
1650 add v22.4s,v22.4s,v23.4s
1651 ror w12,w12,#20
1652 eor v24.16b,v1.16b,v2.16b
1653 add w5,w5,w9
1654 eor v25.16b,v5.16b,v6.16b
1655 add w6,w6,w10
1656 eor v26.16b,v9.16b,v10.16b
1657 add w7,w7,w11
1658 eor v27.16b,v13.16b,v14.16b
1659 add w8,w8,w12
1660 eor v28.16b,v17.16b,v18.16b
1661 eor w17,w17,w5
1662 eor v29.16b,v21.16b,v22.16b
1663 eor w19,w19,w6
1664 ushr v1.4s,v24.4s,#20
1665 eor w20,w20,w7
1666 ushr v5.4s,v25.4s,#20
1667 eor w21,w21,w8
1668 ushr v9.4s,v26.4s,#20
1669 ror w17,w17,#24
1670 ushr v13.4s,v27.4s,#20
1671 ror w19,w19,#24
1672 ushr v17.4s,v28.4s,#20
1673 ror w20,w20,#24
1674 ushr v21.4s,v29.4s,#20
1675 ror w21,w21,#24
1676 sli v1.4s,v24.4s,#12
1677 add w13,w13,w17
1678 sli v5.4s,v25.4s,#12
1679 add w14,w14,w19
1680 sli v9.4s,v26.4s,#12
1681 add w15,w15,w20
1682 sli v13.4s,v27.4s,#12
1683 add w16,w16,w21
1684 sli v17.4s,v28.4s,#12
1685 eor w9,w9,w13
1686 sli v21.4s,v29.4s,#12
1687 eor w10,w10,w14
1688 add v0.4s,v0.4s,v1.4s
1689 eor w11,w11,w15
1690 add v4.4s,v4.4s,v5.4s
1691 eor w12,w12,w16
1692 add v8.4s,v8.4s,v9.4s
1693 ror w9,w9,#25
1694 add v12.4s,v12.4s,v13.4s
1695 ror w10,w10,#25
1696 add v16.4s,v16.4s,v17.4s
1697 ror w11,w11,#25
1698 add v20.4s,v20.4s,v21.4s
1699 ror w12,w12,#25
1700 eor v24.16b,v3.16b,v0.16b
1701 add w5,w5,w10
1702 eor v25.16b,v7.16b,v4.16b
1703 add w6,w6,w11
1704 eor v26.16b,v11.16b,v8.16b
1705 add w7,w7,w12
1706 eor v27.16b,v15.16b,v12.16b
1707 add w8,w8,w9
1708 eor v28.16b,v19.16b,v16.16b
1709 eor w21,w21,w5
1710 eor v29.16b,v23.16b,v20.16b
1711 eor w17,w17,w6
1712 ushr v3.4s,v24.4s,#24
1713 eor w19,w19,w7
1714 ushr v7.4s,v25.4s,#24
1715 eor w20,w20,w8
1716 ushr v11.4s,v26.4s,#24
1717 ror w21,w21,#16
1718 ushr v15.4s,v27.4s,#24
1719 ror w17,w17,#16
1720 ushr v19.4s,v28.4s,#24
1721 ror w19,w19,#16
1722 ushr v23.4s,v29.4s,#24
1723 ror w20,w20,#16
1724 sli v3.4s,v24.4s,#8
1725 add w15,w15,w21
1726 sli v7.4s,v25.4s,#8
1727 add w16,w16,w17
1728 sli v11.4s,v26.4s,#8
1729 add w13,w13,w19
1730 sli v15.4s,v27.4s,#8
1731 add w14,w14,w20
1732 sli v19.4s,v28.4s,#8
1733 eor w10,w10,w15
1734 sli v23.4s,v29.4s,#8
1735 eor w11,w11,w16
1736 add v2.4s,v2.4s,v3.4s
1737 eor w12,w12,w13
1738 add v6.4s,v6.4s,v7.4s
1739 eor w9,w9,w14
1740 add v10.4s,v10.4s,v11.4s
1741 ror w10,w10,#20
1742 add v14.4s,v14.4s,v15.4s
1743 ror w11,w11,#20
1744 add v18.4s,v18.4s,v19.4s
1745 ror w12,w12,#20
1746 add v22.4s,v22.4s,v23.4s
1747 ror w9,w9,#20
1748 eor v24.16b,v1.16b,v2.16b
1749 add w5,w5,w10
1750 eor v25.16b,v5.16b,v6.16b
1751 add w6,w6,w11
1752 eor v26.16b,v9.16b,v10.16b
1753 add w7,w7,w12
1754 eor v27.16b,v13.16b,v14.16b
1755 add w8,w8,w9
1756 eor v28.16b,v17.16b,v18.16b
1757 eor w21,w21,w5
1758 eor v29.16b,v21.16b,v22.16b
1759 eor w17,w17,w6
1760 ushr v1.4s,v24.4s,#25
1761 eor w19,w19,w7
1762 ushr v5.4s,v25.4s,#25
1763 eor w20,w20,w8
1764 ushr v9.4s,v26.4s,#25
1765 ror w21,w21,#24
1766 ushr v13.4s,v27.4s,#25
1767 ror w17,w17,#24
1768 ushr v17.4s,v28.4s,#25
1769 ror w19,w19,#24
1770 ushr v21.4s,v29.4s,#25
1771 ror w20,w20,#24
1772 sli v1.4s,v24.4s,#7
1773 add w15,w15,w21
1774 sli v5.4s,v25.4s,#7
1775 add w16,w16,w17
1776 sli v9.4s,v26.4s,#7
1777 add w13,w13,w19
1778 sli v13.4s,v27.4s,#7
1779 add w14,w14,w20
1780 sli v17.4s,v28.4s,#7
1781 eor w10,w10,w15
1782 sli v21.4s,v29.4s,#7
1783 eor w11,w11,w16
1784 ext v2.16b,v2.16b,v2.16b,#8
1785 eor w12,w12,w13
1786 ext v6.16b,v6.16b,v6.16b,#8
1787 eor w9,w9,w14
1788 ext v10.16b,v10.16b,v10.16b,#8
1789 ror w10,w10,#25
1790 ext v14.16b,v14.16b,v14.16b,#8
1791 ror w11,w11,#25
1792 ext v18.16b,v18.16b,v18.16b,#8
1793 ror w12,w12,#25
1794 ext v22.16b,v22.16b,v22.16b,#8
1795 ror w9,w9,#25
1796 ext v3.16b,v3.16b,v3.16b,#4
1797 ext v7.16b,v7.16b,v7.16b,#4
1798 ext v11.16b,v11.16b,v11.16b,#4
1799 ext v15.16b,v15.16b,v15.16b,#4
1800 ext v19.16b,v19.16b,v19.16b,#4
1801 ext v23.16b,v23.16b,v23.16b,#4
1802 ext v1.16b,v1.16b,v1.16b,#12
1803 ext v5.16b,v5.16b,v5.16b,#12
1804 ext v9.16b,v9.16b,v9.16b,#12
1805 ext v13.16b,v13.16b,v13.16b,#12
1806 ext v17.16b,v17.16b,v17.16b,#12
1807 ext v21.16b,v21.16b,v21.16b,#12
1808 cbnz x4,.Loop_lower_neon
1809
1810 add w5,w5,w22 // accumulate key block
1811 ldp q24,q25,[sp,#0]
1812 add x6,x6,x22,lsr#32
1813 ldp q26,q27,[sp,#32]
1814 add w7,w7,w23
1815 ldp q28,q29,[sp,#64]
1816 add x8,x8,x23,lsr#32
1817 add v0.4s,v0.4s,v24.4s
1818 add w9,w9,w24
1819 add v4.4s,v4.4s,v24.4s
1820 add x10,x10,x24,lsr#32
1821 add v8.4s,v8.4s,v24.4s
1822 add w11,w11,w25
1823 add v12.4s,v12.4s,v24.4s
1824 add x12,x12,x25,lsr#32
1825 add v16.4s,v16.4s,v24.4s
1826 add w13,w13,w26
1827 add v20.4s,v20.4s,v24.4s
1828 add x14,x14,x26,lsr#32
1829 add v2.4s,v2.4s,v26.4s
1830 add w15,w15,w27
1831 add v6.4s,v6.4s,v26.4s
1832 add x16,x16,x27,lsr#32
1833 add v10.4s,v10.4s,v26.4s
1834 add w17,w17,w28
1835 add v14.4s,v14.4s,v26.4s
1836 add x19,x19,x28,lsr#32
1837 add v18.4s,v18.4s,v26.4s
1838 add w20,w20,w30
1839 add v22.4s,v22.4s,v26.4s
1840 add x21,x21,x30,lsr#32
1841 add v19.4s,v19.4s,v31.4s // +4
1842 add x5,x5,x6,lsl#32 // pack
1843 add v23.4s,v23.4s,v31.4s // +4
1844 add x7,x7,x8,lsl#32
1845 add v3.4s,v3.4s,v27.4s
1846 ldp x6,x8,[x1,#0] // load input
1847 add v7.4s,v7.4s,v28.4s
1848 add x9,x9,x10,lsl#32
1849 add v11.4s,v11.4s,v29.4s
1850 add x11,x11,x12,lsl#32
1851 add v15.4s,v15.4s,v30.4s
1852 ldp x10,x12,[x1,#16]
1853 add v19.4s,v19.4s,v27.4s
1854 add x13,x13,x14,lsl#32
1855 add v23.4s,v23.4s,v28.4s
1856 add x15,x15,x16,lsl#32
1857 add v1.4s,v1.4s,v25.4s
1858 ldp x14,x16,[x1,#32]
1859 add v5.4s,v5.4s,v25.4s
1860 add x17,x17,x19,lsl#32
1861 add v9.4s,v9.4s,v25.4s
1862 add x20,x20,x21,lsl#32
1863 add v13.4s,v13.4s,v25.4s
1864 ldp x19,x21,[x1,#48]
1865 add v17.4s,v17.4s,v25.4s
1866 add x1,x1,#64
1867 add v21.4s,v21.4s,v25.4s
1868
1869#ifdef __ARMEB__
1870 rev x5,x5
1871 rev x7,x7
1872 rev x9,x9
1873 rev x11,x11
1874 rev x13,x13
1875 rev x15,x15
1876 rev x17,x17
1877 rev x20,x20
1878#endif
1879 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1880 eor x5,x5,x6
1881 eor x7,x7,x8
1882 eor x9,x9,x10
1883 eor x11,x11,x12
1884 eor x13,x13,x14
1885 eor v0.16b,v0.16b,v24.16b
1886 eor x15,x15,x16
1887 eor v1.16b,v1.16b,v25.16b
1888 eor x17,x17,x19
1889 eor v2.16b,v2.16b,v26.16b
1890 eor x20,x20,x21
1891 eor v3.16b,v3.16b,v27.16b
1892 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1893
1894 stp x5,x7,[x0,#0] // store output
1895 add x28,x28,#7 // increment counter
1896 stp x9,x11,[x0,#16]
1897 stp x13,x15,[x0,#32]
1898 stp x17,x20,[x0,#48]
1899 add x0,x0,#64
1900 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1901
1902 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1903 eor v4.16b,v4.16b,v24.16b
1904 eor v5.16b,v5.16b,v25.16b
1905 eor v6.16b,v6.16b,v26.16b
1906 eor v7.16b,v7.16b,v27.16b
1907 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1908
1909 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1910 eor v8.16b,v8.16b,v0.16b
1911 ldp q24,q25,[sp,#0]
1912 eor v9.16b,v9.16b,v1.16b
1913 ldp q26,q27,[sp,#32]
1914 eor v10.16b,v10.16b,v2.16b
1915 eor v11.16b,v11.16b,v3.16b
1916 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1917
1918 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1919 eor v12.16b,v12.16b,v4.16b
1920 eor v13.16b,v13.16b,v5.16b
1921 eor v14.16b,v14.16b,v6.16b
1922 eor v15.16b,v15.16b,v7.16b
1923 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1924
1925 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1926 eor v16.16b,v16.16b,v8.16b
1927 eor v17.16b,v17.16b,v9.16b
1928 eor v18.16b,v18.16b,v10.16b
1929 eor v19.16b,v19.16b,v11.16b
1930 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1931
1932 shl v0.4s,v31.4s,#1 // 4 -> 8
1933 eor v20.16b,v20.16b,v12.16b
1934 eor v21.16b,v21.16b,v13.16b
1935 eor v22.16b,v22.16b,v14.16b
1936 eor v23.16b,v23.16b,v15.16b
1937 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1938
1939 add v27.4s,v27.4s,v0.4s // += 8
1940 add v28.4s,v28.4s,v0.4s
1941 add v29.4s,v29.4s,v0.4s
1942 add v30.4s,v30.4s,v0.4s
1943
1944 b.hs .Loop_outer_512_neon
1945
1946 adds x2,x2,#512
1947 ushr v0.4s,v31.4s,#2 // 4 -> 1
1948
1949 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1950 ldp d10,d11,[sp,#128+16]
1951 ldp d12,d13,[sp,#128+32]
1952 ldp d14,d15,[sp,#128+48]
1953
1954 stp q24,q31,[sp,#0] // wipe off-load area
1955 stp q24,q31,[sp,#32]
1956 stp q24,q31,[sp,#64]
1957
1958 b.eq .Ldone_512_neon
1959
1960 cmp x2,#192
1961 sub v27.4s,v27.4s,v0.4s // -= 1
1962 sub v28.4s,v28.4s,v0.4s
1963 sub v29.4s,v29.4s,v0.4s
1964 add sp,sp,#128
1965 b.hs .Loop_outer_neon
1966
1967 eor v25.16b,v25.16b,v25.16b
1968 eor v26.16b,v26.16b,v26.16b
1969 eor v27.16b,v27.16b,v27.16b
1970 eor v28.16b,v28.16b,v28.16b
1971 eor v29.16b,v29.16b,v29.16b
1972 eor v30.16b,v30.16b,v30.16b
1973 b .Loop_outer
1974
1975.Ldone_512_neon:
1976 ldp x19,x20,[x29,#16]
1977 add sp,sp,#128+64
1978 ldp x21,x22,[x29,#32]
1979 ldp x23,x24,[x29,#48]
1980 ldp x25,x26,[x29,#64]
1981 ldp x27,x28,[x29,#80]
1982 ldp x29,x30,[sp],#96
1983 ret
1984.size ChaCha20_512_neon,.-ChaCha20_512_neon
1985#endif
Robert Sloan726e9d12018-09-11 11:45:04 -07001986#endif // !OPENSSL_NO_ASM