blob: b14466ddd77c9f5c41bc687f670fb52ab4a43adf [file] [log] [blame]
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
3
Pete Bentley0c61efe2019-08-13 09:32:23 +01004#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
Robert Sloan726e9d12018-09-11 11:45:04 -07007#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
Robert Sloan726e9d12018-09-11 11:45:04 -070010
11#if !defined(OPENSSL_NO_ASM)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
Robert Sloan8ff03552017-06-14 12:40:58 -070015#include <openssl/arm_arch.h>
16
Robert Sloan8ff03552017-06-14 12:40:58 -070017
18
// Read-only constants used by the ChaCha20 routines in this file.
Robert Sloanc9abfe42018-11-26 12:19:07 -080019.section __TEXT,__const
Robert Sloan8ff03552017-06-14 12:40:58 -070020
21.align 5
// Lsigma: 16 bytes that form the first row of the ChaCha state. Read as
// little-endian bytes the two quads spell the ASCII text "expand 32-byte k".
22Lsigma:
23.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
// Lone: the 32-bit vector {1,0,0,0}. It sits 16 bytes after Lsigma, so the
// NEON code reaches it by post-incrementing the Lsigma pointer and loads it
// into v31 as the block-counter increment.
24Lone:
25.long 1,0,0,0
// NUL-terminated ASCII credit string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
Robert Sloan8ff03552017-06-14 12:40:58 -070026.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
27.align 2
28
Robert Sloanc9abfe42018-11-26 12:19:07 -080029.text
30
// _ChaCha20_ctr32: XOR a buffer with the ChaCha20 keystream. This is the
// scalar (general-purpose register) implementation plus the dispatch to the
// NEON variant.
//
// Register roles on entry, as established by the loads and stores below:
//   x0 = output pointer                  (written with stp / strb)
//   x1 = input pointer                   (read with ldp / ldrb)
//   x2 = length in bytes                 (returns immediately when zero)
//   x3 = pointer to 32 bytes of key      (two ldp loads)
//   x4 = pointer to 16 bytes of counter and nonce (one ldp load)
// NOTE(review): this presumably matches the C prototype
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// confirm against the header that declares it.
//
// Lengths of 192 bytes or more are handed to ChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P. Everything else runs here,
// one 64-byte ChaCha block per Loop_outer iteration.
//
// Stack: a 96-byte frame holding x29/x30 and x19-x28, plus 64 bytes below it
// that are used only as keystream scratch for a partial final block.
// Working registers: x5-x17 and x19-x21 hold the 16 state words (x18 is not
// used); x22-x28 and x30 hold the input state packed two words per register.
Robert Sloan8ff03552017-06-14 12:40:58 -070031.globl _ChaCha20_ctr32
32.private_extern _ChaCha20_ctr32
33
34.align 5
35_ChaCha20_ctr32:
// Zero length: nothing to do, return without touching the stack.
36 cbz x2,Labort
// x5 = page address of the CPU capability word. The first form is selected
// for hwaddress_sanitizer builds with clang 10 or newer.
Pete Bentley0c61efe2019-08-13 09:32:23 +010037#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
38 adrp x5,:pg_hi21_nc:_OPENSSL_armcap_P
39#else
Robert Sloanc9abfe42018-11-26 12:19:07 -080040 adrp x5,_OPENSSL_armcap_P@PAGE
Pete Bentley0c61efe2019-08-13 09:32:23 +010041#endif
// Inputs shorter than 192 bytes always take the scalar path.
Robert Sloan8ff03552017-06-14 12:40:58 -070042 cmp x2,#192
43 b.lo Lshort
// Otherwise use NEON when the capability bit is set. This is a plain branch,
// so ChaCha20_neon builds its own frame and returns to our caller.
Pete Bentley0c61efe2019-08-13 09:32:23 +010044 ldr w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
Robert Sloan8ff03552017-06-14 12:40:58 -070045 tst w17,#ARMV7_NEON
46 b.ne ChaCha20_neon
47
48Lshort:
// Prologue: push the 96-byte frame and make x29 point at it. The epilogue
// restores the saved registers through x29.
49 stp x29,x30,[sp,#-96]!
50 add x29,sp,#0
51
// x5 = address of Lsigma (interleaved with the register saves).
Robert Sloanc9abfe42018-11-26 12:19:07 -080052 adrp x5,Lsigma@PAGE
53 add x5,x5,Lsigma@PAGEOFF
Robert Sloan8ff03552017-06-14 12:40:58 -070054 stp x19,x20,[sp,#16]
55 stp x21,x22,[sp,#32]
56 stp x23,x24,[sp,#48]
57 stp x25,x26,[sp,#64]
58 stp x27,x28,[sp,#80]
// 64 bytes of scratch, used only by the partial-block tail code.
59 sub sp,sp,#64
60
// Load the 64-byte input state, two 32-bit words per register:
//   x22,x23 = words 0-3, x24-x27 = words 4-11, x28,x30 = words 12-15.
// x30 (the link register) is used as a data register here; the return
// address was saved in the frame and is reloaded in the epilogue.
61 ldp x22,x23,[x5] // load sigma
62 ldp x24,x25,[x3] // load key
63 ldp x26,x27,[x3,#16]
64 ldp x28,x30,[x4] // load counter
// Big-endian builds swap the two 32-bit halves of each key/counter register.
65#ifdef __ARMEB__
66 ror x24,x24,#32
67 ror x25,x25,#32
68 ror x26,x26,#32
69 ror x27,x27,#32
70 ror x28,x28,#32
71 ror x30,x30,#32
72#endif
73
// One iteration per 64-byte block.
74Loop_outer:
// Split each packed register into two working words: mov wN takes the low
// half (zeroing the upper 32 bits), lsr #32 takes the high half.
// State words 0..15 land in w5..w17,w19,w20,w21 in order.
75 mov w5,w22 // unpack key block
76 lsr x6,x22,#32
77 mov w7,w23
78 lsr x8,x23,#32
79 mov w9,w24
80 lsr x10,x24,#32
81 mov w11,w25
82 lsr x12,x25,#32
83 mov w13,w26
84 lsr x14,x26,#32
85 mov w15,w27
86 lsr x16,x27,#32
87 mov w17,w28
88 lsr x19,x28,#32
89 mov w20,w30
90 lsr x21,x30,#32
91
// x4 is reused as the round counter: 10 double rounds = 20 rounds.
// The flags set by this subs are not consumed until after the round loop
// (b.lo Ltail) and after the store (b.hi Loop_outer); nothing in between
// writes the flags.
92 mov x4,#10
93 subs x2,x2,#64
// One double round per iteration: four column quarter-rounds followed by
// four diagonal quarter-rounds, each interleaved four-wide.
// ror by 16/20/24/25 is a left-rotate by 16/12/8/7.
94Loop:
95 sub x4,x4,#1
// Column round: quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
// (w7,w11,w15,w20), (w8,w12,w16,w21).
96 add w5,w5,w9
97 add w6,w6,w10
98 add w7,w7,w11
99 add w8,w8,w12
100 eor w17,w17,w5
101 eor w19,w19,w6
102 eor w20,w20,w7
103 eor w21,w21,w8
104 ror w17,w17,#16
105 ror w19,w19,#16
106 ror w20,w20,#16
107 ror w21,w21,#16
108 add w13,w13,w17
109 add w14,w14,w19
110 add w15,w15,w20
111 add w16,w16,w21
112 eor w9,w9,w13
113 eor w10,w10,w14
114 eor w11,w11,w15
115 eor w12,w12,w16
116 ror w9,w9,#20
117 ror w10,w10,#20
118 ror w11,w11,#20
119 ror w12,w12,#20
120 add w5,w5,w9
121 add w6,w6,w10
122 add w7,w7,w11
123 add w8,w8,w12
124 eor w17,w17,w5
125 eor w19,w19,w6
126 eor w20,w20,w7
127 eor w21,w21,w8
128 ror w17,w17,#24
129 ror w19,w19,#24
130 ror w20,w20,#24
131 ror w21,w21,#24
132 add w13,w13,w17
133 add w14,w14,w19
134 add w15,w15,w20
135 add w16,w16,w21
136 eor w9,w9,w13
137 eor w10,w10,w14
138 eor w11,w11,w15
139 eor w12,w12,w16
140 ror w9,w9,#25
141 ror w10,w10,#25
142 ror w11,w11,#25
143 ror w12,w12,#25
// Diagonal round: quarter-rounds on (w5,w10,w15,w21), (w6,w11,w16,w17),
// (w7,w12,w13,w19), (w8,w9,w14,w20).
144 add w5,w5,w10
145 add w6,w6,w11
146 add w7,w7,w12
147 add w8,w8,w9
148 eor w21,w21,w5
149 eor w17,w17,w6
150 eor w19,w19,w7
151 eor w20,w20,w8
152 ror w21,w21,#16
153 ror w17,w17,#16
154 ror w19,w19,#16
155 ror w20,w20,#16
156 add w15,w15,w21
157 add w16,w16,w17
158 add w13,w13,w19
159 add w14,w14,w20
160 eor w10,w10,w15
161 eor w11,w11,w16
162 eor w12,w12,w13
163 eor w9,w9,w14
164 ror w10,w10,#20
165 ror w11,w11,#20
166 ror w12,w12,#20
167 ror w9,w9,#20
168 add w5,w5,w10
169 add w6,w6,w11
170 add w7,w7,w12
171 add w8,w8,w9
172 eor w21,w21,w5
173 eor w17,w17,w6
174 eor w19,w19,w7
175 eor w20,w20,w8
176 ror w21,w21,#24
177 ror w17,w17,#24
178 ror w19,w19,#24
179 ror w20,w20,#24
180 add w15,w15,w21
181 add w16,w16,w17
182 add w13,w13,w19
183 add w14,w14,w20
184 eor w10,w10,w15
185 eor w11,w11,w16
186 eor w12,w12,w13
187 eor w9,w9,w14
188 ror w10,w10,#25
189 ror w11,w11,#25
190 ror w12,w12,#25
191 ror w9,w9,#25
192 cbnz x4,Loop
193
// Add the original input state back in. The odd-numbered words use a 64-bit
// add; any carry out of bit 31 is shifted away by the lsl#32 in the pack step.
194 add w5,w5,w22 // accumulate key block
195 add x6,x6,x22,lsr#32
196 add w7,w7,w23
197 add x8,x8,x23,lsr#32
198 add w9,w9,w24
199 add x10,x10,x24,lsr#32
200 add w11,w11,w25
201 add x12,x12,x25,lsr#32
202 add w13,w13,w26
203 add x14,x14,x26,lsr#32
204 add w15,w15,w27
205 add x16,x16,x27,lsr#32
206 add w17,w17,w28
207 add x19,x19,x28,lsr#32
208 add w20,w20,w30
209 add x21,x21,x30,lsr#32
210
// Flags are still those of the subs above: fewer than 64 bytes were left,
// so finish byte by byte in Ltail.
211 b.lo Ltail
212
// Full block: repack word pairs into eight 64-bit registers while loading
// 64 input bytes into the registers that the packing has just freed.
213 add x5,x5,x6,lsl#32 // pack
214 add x7,x7,x8,lsl#32
215 ldp x6,x8,[x1,#0] // load input
216 add x9,x9,x10,lsl#32
217 add x11,x11,x12,lsl#32
218 ldp x10,x12,[x1,#16]
219 add x13,x13,x14,lsl#32
220 add x15,x15,x16,lsl#32
221 ldp x14,x16,[x1,#32]
222 add x17,x17,x19,lsl#32
223 add x20,x20,x21,lsl#32
224 ldp x19,x21,[x1,#48]
225 add x1,x1,#64
// Big-endian builds byte-reverse the packed keystream before the XOR.
226#ifdef __ARMEB__
227 rev x5,x5
228 rev x7,x7
229 rev x9,x9
230 rev x11,x11
231 rev x13,x13
232 rev x15,x15
233 rev x17,x17
234 rev x20,x20
235#endif
// output = keystream XOR input
236 eor x5,x5,x6
237 eor x7,x7,x8
238 eor x9,x9,x10
239 eor x11,x11,x12
240 eor x13,x13,x14
241 eor x15,x15,x16
242 eor x17,x17,x19
243 eor x20,x20,x21
244
// NOTE(review): the counter increment below is a 64-bit add on the register
// holding state words 12 and 13, so a wrap of the low 32-bit word would carry
// into word 13. Callers presumably keep the counter from wrapping - confirm.
245 stp x5,x7,[x0,#0] // store output
246 add x28,x28,#1 // increment counter
247 stp x9,x11,[x0,#16]
248 stp x13,x15,[x0,#32]
249 stp x17,x20,[x0,#48]
250 add x0,x0,#64
251
// Flags still come from the subs at the top of this block: loop again while
// bytes remain; fall through when the length was consumed exactly.
252 b.hi Loop_outer
253
// Epilogue: drop the 64-byte scratch area and restore callee-saved registers
// through the frame pointer.
254 ldp x19,x20,[x29,#16]
255 add sp,sp,#64
256 ldp x21,x22,[x29,#32]
257 ldp x23,x24,[x29,#48]
258 ldp x25,x26,[x29,#64]
259 ldp x27,x28,[x29,#80]
260 ldp x29,x30,[sp],#96
261Labort:
262 ret
263
264.align 4
// Partial final block. The subs left x2 negative; adding 64 back gives the
// number of remaining bytes (1..63).
265Ltail:
266 add x2,x2,#64
// Also entered from Ltail_neon when fewer than 64 bytes remain. That path has
// built the same 96-byte frame plus 64-byte scratch area and leaves the
// scalar block unpacked in the same working registers.
// Set up end-relative pointers so one negative index, counting up to zero,
// walks input, keystream and output together:
//   x1 = input end, x4 = scratch end, x0 = output end - 1, x2 = -remaining.
// The extra -1 on x0 compensates for x2 being incremented before the store.
267Less_than_64:
268 sub x0,x0,#1
269 add x1,x1,x2
270 add x0,x0,x2
271 add x4,sp,x2
272 neg x2,x2
273
274 add x5,x5,x6,lsl#32 // pack
275 add x7,x7,x8,lsl#32
276 add x9,x9,x10,lsl#32
277 add x11,x11,x12,lsl#32
278 add x13,x13,x14,lsl#32
279 add x15,x15,x16,lsl#32
280 add x17,x17,x19,lsl#32
281 add x20,x20,x21,lsl#32
282#ifdef __ARMEB__
283 rev x5,x5
284 rev x7,x7
285 rev x9,x9
286 rev x11,x11
287 rev x13,x13
288 rev x15,x15
289 rev x17,x17
290 rev x20,x20
291#endif
// Spill the 64-byte keystream block to the stack scratch area so it can be
// consumed one byte at a time.
292 stp x5,x7,[sp,#0]
293 stp x9,x11,[sp,#16]
294 stp x13,x15,[sp,#32]
295 stp x17,x20,[sp,#48]
296
// Byte loop: out[i] = in[i] XOR keystream[i] for the remaining bytes.
297Loop_tail:
298 ldrb w10,[x1,x2]
299 ldrb w11,[x4,x2]
300 add x2,x2,#1
301 eor w10,w10,w11
302 strb w10,[x0,x2]
303 cbnz x2,Loop_tail
304
// Overwrite the keystream copy on the stack with zeros before returning.
305 stp xzr,xzr,[sp,#0]
306 stp xzr,xzr,[sp,#16]
307 stp xzr,xzr,[sp,#32]
308 stp xzr,xzr,[sp,#48]
309
// Epilogue, same as the full-block exit path.
310 ldp x19,x20,[x29,#16]
311 add sp,sp,#64
312 ldp x21,x22,[x29,#32]
313 ldp x23,x24,[x29,#48]
314 ldp x25,x26,[x29,#64]
315 ldp x27,x28,[x29,#80]
316 ldp x29,x30,[sp],#96
317 ret
318
319
320
// ChaCha20_neon: NEON-assisted ChaCha20, reached by a branch from
// _ChaCha20_ctr32 with the argument registers untouched:
//   x0 = output pointer, x1 = input pointer, x2 = length in bytes,
//   x3 = pointer to 32 bytes of key, x4 = pointer to 16 bytes of counter/nonce.
// Lengths of 512 bytes or more are redirected to L512_or_more_neon.
//
// Each Loop_outer_neon iteration produces 256 bytes: one block computed in
// general-purpose registers (same register layout as the scalar routine)
// interleaved with three blocks computed in NEON registers:
//   v0-v3, v4-v7, v16-v19 = rows 0..3 of the three NEON blocks
//   v20-v23               = temporaries for rotates and loaded input
//   v24 = sigma row, v25/v26 = key rows (kept as the invariant input state)
//   v27/v28/v29 = counter rows of the three NEON blocks
//   v31 = counter increment
// Only v0-v7 and v16-v31 are touched, so d8-d15 are not saved here.
// Stack layout matches the scalar routine: 96-byte frame plus 64 bytes of
// scratch, which lets the short-tail case reuse Less_than_64.
321.align 5
322ChaCha20_neon:
// Prologue: 96-byte frame, x29 = frame pointer.
323 stp x29,x30,[sp,#-96]!
324 add x29,sp,#0
325
// x5 = address of Lsigma (interleaved with the register saves).
Robert Sloanc9abfe42018-11-26 12:19:07 -0800326 adrp x5,Lsigma@PAGE
327 add x5,x5,Lsigma@PAGEOFF
Robert Sloan8ff03552017-06-14 12:40:58 -0700328 stp x19,x20,[sp,#16]
329 stp x21,x22,[sp,#32]
330 stp x23,x24,[sp,#48]
331 stp x25,x26,[sp,#64]
332 stp x27,x28,[sp,#80]
// 512 bytes or more: continue in the wider routine, which shares this frame.
333 cmp x2,#512
334 b.hs L512_or_more_neon
335
// 64 bytes of scratch for a partial final block.
336 sub sp,sp,#64
337
// Load the input state twice: packed into x22-x28,x30 for the scalar block
// and as vectors v24-v27 for the NEON blocks. The post-increment on x5 moves
// it from Lsigma to Lone, so v31 receives {1,0,0,0}.
// x30 (link register) is used as data; it is reloaded from the frame on exit.
338 ldp x22,x23,[x5] // load sigma
339 ld1 {v24.4s},[x5],#16
340 ldp x24,x25,[x3] // load key
341 ldp x26,x27,[x3,#16]
342 ld1 {v25.4s,v26.4s},[x3]
343 ldp x28,x30,[x4] // load counter
344 ld1 {v27.4s},[x4]
345 ld1 {v31.4s},[x5]
// Big-endian builds swap 32-bit halves of the packed scalar registers and of
// each 64-bit half of the sigma vector.
346#ifdef __ARMEB__
347 rev64 v24.4s,v24.4s
348 ror x24,x24,#32
349 ror x25,x25,#32
350 ror x26,x26,#32
351 ror x27,x27,#32
352 ror x28,x28,#32
353 ror x30,x30,#32
354#endif
// The scalar block uses the counter as loaded; the NEON blocks use
// counter+1, +2 and +3 (v27, v28, v29). v31 then becomes {4,0,0,0}, the
// per-iteration step.
355 add v27.4s,v27.4s,v31.4s // += 1
356 add v28.4s,v27.4s,v31.4s
357 add v29.4s,v28.4s,v31.4s
358 shl v31.4s,v31.4s,#2 // 1 -> 4
359
// One iteration per 256 bytes.
360Loop_outer_neon:
// Initialise working copies: scalar words in w5..w21 (x18 skipped) and the
// three NEON blocks from the invariant rows v24-v26 plus their counter rows.
361 mov w5,w22 // unpack key block
362 lsr x6,x22,#32
363 mov v0.16b,v24.16b
364 mov w7,w23
365 lsr x8,x23,#32
366 mov v4.16b,v24.16b
367 mov w9,w24
368 lsr x10,x24,#32
369 mov v16.16b,v24.16b
370 mov w11,w25
371 mov v1.16b,v25.16b
372 lsr x12,x25,#32
373 mov v5.16b,v25.16b
374 mov w13,w26
375 mov v17.16b,v25.16b
376 lsr x14,x26,#32
377 mov v3.16b,v27.16b
378 mov w15,w27
379 mov v7.16b,v28.16b
380 lsr x16,x27,#32
381 mov v19.16b,v29.16b
382 mov w17,w28
383 mov v2.16b,v26.16b
384 lsr x19,x28,#32
385 mov v6.16b,v26.16b
386 mov w20,w30
387 mov v18.16b,v26.16b
388 lsr x21,x30,#32
389
// x4 becomes the round counter (10 double rounds). The flags from this subs
// are consumed after the round loop (b.lo Ltail_neon) and after the stores
// (b.hi Loop_outer_neon); no instruction in between writes the flags.
390 mov x4,#10
391 subs x2,x2,#256
// One double round per iteration, scalar and NEON instructions interleaved.
// NEON rotates: rev32 on .8h lanes = rotate by 16; ushr #20 + sli #12 =
// rotate left 12; ushr #24 + sli #8 = rotate left 8; ushr #25 + sli #7 =
// rotate left 7. Scalar rotates use ror by 16/20/24/25.
392Loop_neon:
393 sub x4,x4,#1
// First half: column round.
394 add v0.4s,v0.4s,v1.4s
395 add w5,w5,w9
396 add v4.4s,v4.4s,v5.4s
397 add w6,w6,w10
398 add v16.4s,v16.4s,v17.4s
399 add w7,w7,w11
400 eor v3.16b,v3.16b,v0.16b
401 add w8,w8,w12
402 eor v7.16b,v7.16b,v4.16b
403 eor w17,w17,w5
404 eor v19.16b,v19.16b,v16.16b
405 eor w19,w19,w6
406 rev32 v3.8h,v3.8h
407 eor w20,w20,w7
408 rev32 v7.8h,v7.8h
409 eor w21,w21,w8
410 rev32 v19.8h,v19.8h
411 ror w17,w17,#16
412 add v2.4s,v2.4s,v3.4s
413 ror w19,w19,#16
414 add v6.4s,v6.4s,v7.4s
415 ror w20,w20,#16
416 add v18.4s,v18.4s,v19.4s
417 ror w21,w21,#16
418 eor v20.16b,v1.16b,v2.16b
419 add w13,w13,w17
420 eor v21.16b,v5.16b,v6.16b
421 add w14,w14,w19
422 eor v22.16b,v17.16b,v18.16b
423 add w15,w15,w20
424 ushr v1.4s,v20.4s,#20
425 add w16,w16,w21
426 ushr v5.4s,v21.4s,#20
427 eor w9,w9,w13
428 ushr v17.4s,v22.4s,#20
429 eor w10,w10,w14
430 sli v1.4s,v20.4s,#12
431 eor w11,w11,w15
432 sli v5.4s,v21.4s,#12
433 eor w12,w12,w16
434 sli v17.4s,v22.4s,#12
435 ror w9,w9,#20
436 add v0.4s,v0.4s,v1.4s
437 ror w10,w10,#20
438 add v4.4s,v4.4s,v5.4s
439 ror w11,w11,#20
440 add v16.4s,v16.4s,v17.4s
441 ror w12,w12,#20
442 eor v20.16b,v3.16b,v0.16b
443 add w5,w5,w9
444 eor v21.16b,v7.16b,v4.16b
445 add w6,w6,w10
446 eor v22.16b,v19.16b,v16.16b
447 add w7,w7,w11
448 ushr v3.4s,v20.4s,#24
449 add w8,w8,w12
450 ushr v7.4s,v21.4s,#24
451 eor w17,w17,w5
452 ushr v19.4s,v22.4s,#24
453 eor w19,w19,w6
454 sli v3.4s,v20.4s,#8
455 eor w20,w20,w7
456 sli v7.4s,v21.4s,#8
457 eor w21,w21,w8
458 sli v19.4s,v22.4s,#8
459 ror w17,w17,#24
460 add v2.4s,v2.4s,v3.4s
461 ror w19,w19,#24
462 add v6.4s,v6.4s,v7.4s
463 ror w20,w20,#24
464 add v18.4s,v18.4s,v19.4s
465 ror w21,w21,#24
466 eor v20.16b,v1.16b,v2.16b
467 add w13,w13,w17
468 eor v21.16b,v5.16b,v6.16b
469 add w14,w14,w19
470 eor v22.16b,v17.16b,v18.16b
471 add w15,w15,w20
472 ushr v1.4s,v20.4s,#25
473 add w16,w16,w21
474 ushr v5.4s,v21.4s,#25
475 eor w9,w9,w13
476 ushr v17.4s,v22.4s,#25
477 eor w10,w10,w14
478 sli v1.4s,v20.4s,#7
479 eor w11,w11,w15
480 sli v5.4s,v21.4s,#7
481 eor w12,w12,w16
482 sli v17.4s,v22.4s,#7
483 ror w9,w9,#25
// Rotate NEON rows 2, 3 and 1 by 8, 12 and 4 bytes so that the next
// lane-wise pass operates on the diagonals.
484 ext v2.16b,v2.16b,v2.16b,#8
485 ror w10,w10,#25
486 ext v6.16b,v6.16b,v6.16b,#8
487 ror w11,w11,#25
488 ext v18.16b,v18.16b,v18.16b,#8
489 ror w12,w12,#25
490 ext v3.16b,v3.16b,v3.16b,#12
491 ext v7.16b,v7.16b,v7.16b,#12
492 ext v19.16b,v19.16b,v19.16b,#12
493 ext v1.16b,v1.16b,v1.16b,#4
494 ext v5.16b,v5.16b,v5.16b,#4
495 ext v17.16b,v17.16b,v17.16b,#4
// Second half: diagonal round. The scalar side selects the diagonals by
// register choice instead of by rotating rows.
496 add v0.4s,v0.4s,v1.4s
497 add w5,w5,w10
498 add v4.4s,v4.4s,v5.4s
499 add w6,w6,w11
500 add v16.4s,v16.4s,v17.4s
501 add w7,w7,w12
502 eor v3.16b,v3.16b,v0.16b
503 add w8,w8,w9
504 eor v7.16b,v7.16b,v4.16b
505 eor w21,w21,w5
506 eor v19.16b,v19.16b,v16.16b
507 eor w17,w17,w6
508 rev32 v3.8h,v3.8h
509 eor w19,w19,w7
510 rev32 v7.8h,v7.8h
511 eor w20,w20,w8
512 rev32 v19.8h,v19.8h
513 ror w21,w21,#16
514 add v2.4s,v2.4s,v3.4s
515 ror w17,w17,#16
516 add v6.4s,v6.4s,v7.4s
517 ror w19,w19,#16
518 add v18.4s,v18.4s,v19.4s
519 ror w20,w20,#16
520 eor v20.16b,v1.16b,v2.16b
521 add w15,w15,w21
522 eor v21.16b,v5.16b,v6.16b
523 add w16,w16,w17
524 eor v22.16b,v17.16b,v18.16b
525 add w13,w13,w19
526 ushr v1.4s,v20.4s,#20
527 add w14,w14,w20
528 ushr v5.4s,v21.4s,#20
529 eor w10,w10,w15
530 ushr v17.4s,v22.4s,#20
531 eor w11,w11,w16
532 sli v1.4s,v20.4s,#12
533 eor w12,w12,w13
534 sli v5.4s,v21.4s,#12
535 eor w9,w9,w14
536 sli v17.4s,v22.4s,#12
537 ror w10,w10,#20
538 add v0.4s,v0.4s,v1.4s
539 ror w11,w11,#20
540 add v4.4s,v4.4s,v5.4s
541 ror w12,w12,#20
542 add v16.4s,v16.4s,v17.4s
543 ror w9,w9,#20
544 eor v20.16b,v3.16b,v0.16b
545 add w5,w5,w10
546 eor v21.16b,v7.16b,v4.16b
547 add w6,w6,w11
548 eor v22.16b,v19.16b,v16.16b
549 add w7,w7,w12
550 ushr v3.4s,v20.4s,#24
551 add w8,w8,w9
552 ushr v7.4s,v21.4s,#24
553 eor w21,w21,w5
554 ushr v19.4s,v22.4s,#24
555 eor w17,w17,w6
556 sli v3.4s,v20.4s,#8
557 eor w19,w19,w7
558 sli v7.4s,v21.4s,#8
559 eor w20,w20,w8
560 sli v19.4s,v22.4s,#8
561 ror w21,w21,#24
562 add v2.4s,v2.4s,v3.4s
563 ror w17,w17,#24
564 add v6.4s,v6.4s,v7.4s
565 ror w19,w19,#24
566 add v18.4s,v18.4s,v19.4s
567 ror w20,w20,#24
568 eor v20.16b,v1.16b,v2.16b
569 add w15,w15,w21
570 eor v21.16b,v5.16b,v6.16b
571 add w16,w16,w17
572 eor v22.16b,v17.16b,v18.16b
573 add w13,w13,w19
574 ushr v1.4s,v20.4s,#25
575 add w14,w14,w20
576 ushr v5.4s,v21.4s,#25
577 eor w10,w10,w15
578 ushr v17.4s,v22.4s,#25
579 eor w11,w11,w16
580 sli v1.4s,v20.4s,#7
581 eor w12,w12,w13
582 sli v5.4s,v21.4s,#7
583 eor w9,w9,w14
584 sli v17.4s,v22.4s,#7
585 ror w10,w10,#25
// Undo the row rotation (8, 4 and 12 bytes) to return to column order.
586 ext v2.16b,v2.16b,v2.16b,#8
587 ror w11,w11,#25
588 ext v6.16b,v6.16b,v6.16b,#8
589 ror w12,w12,#25
590 ext v18.16b,v18.16b,v18.16b,#8
591 ror w9,w9,#25
592 ext v3.16b,v3.16b,v3.16b,#4
593 ext v7.16b,v7.16b,v7.16b,#4
594 ext v19.16b,v19.16b,v19.16b,#4
595 ext v1.16b,v1.16b,v1.16b,#12
596 ext v5.16b,v5.16b,v5.16b,#12
597 ext v17.16b,v17.16b,v17.16b,#12
598 cbnz x4,Loop_neon
599
// Add the input state back into all four blocks. Each NEON block adds its
// own counter row (v27, v28, v29).
600 add w5,w5,w22 // accumulate key block
601 add v0.4s,v0.4s,v24.4s
602 add x6,x6,x22,lsr#32
603 add v4.4s,v4.4s,v24.4s
604 add w7,w7,w23
605 add v16.4s,v16.4s,v24.4s
606 add x8,x8,x23,lsr#32
607 add v2.4s,v2.4s,v26.4s
608 add w9,w9,w24
609 add v6.4s,v6.4s,v26.4s
610 add x10,x10,x24,lsr#32
611 add v18.4s,v18.4s,v26.4s
612 add w11,w11,w25
613 add v3.4s,v3.4s,v27.4s
614 add x12,x12,x25,lsr#32
615 add w13,w13,w26
616 add v7.4s,v7.4s,v28.4s
617 add x14,x14,x26,lsr#32
618 add w15,w15,w27
619 add v19.4s,v19.4s,v29.4s
620 add x16,x16,x27,lsr#32
621 add w17,w17,w28
622 add v1.4s,v1.4s,v25.4s
623 add x19,x19,x28,lsr#32
624 add w20,w20,w30
625 add v5.4s,v5.4s,v25.4s
626 add x21,x21,x30,lsr#32
627 add v17.4s,v17.4s,v25.4s
628
// Flags from the subs above: fewer than 256 bytes were left.
629 b.lo Ltail_neon
630
// Full 256 bytes. Output order is the scalar block first, then the NEON
// blocks v0-v3, v4-v7, v16-v19. Pack the scalar block while loading its
// 64 input bytes into the freed registers.
631 add x5,x5,x6,lsl#32 // pack
632 add x7,x7,x8,lsl#32
633 ldp x6,x8,[x1,#0] // load input
634 add x9,x9,x10,lsl#32
635 add x11,x11,x12,lsl#32
636 ldp x10,x12,[x1,#16]
637 add x13,x13,x14,lsl#32
638 add x15,x15,x16,lsl#32
639 ldp x14,x16,[x1,#32]
640 add x17,x17,x19,lsl#32
641 add x20,x20,x21,lsl#32
642 ldp x19,x21,[x1,#48]
643 add x1,x1,#64
644#ifdef __ARMEB__
645 rev x5,x5
646 rev x7,x7
647 rev x9,x9
648 rev x11,x11
649 rev x13,x13
650 rev x15,x15
651 rev x17,x17
652 rev x20,x20
653#endif
// v20-v23 = input for the first NEON block; the XORs are interleaved.
654 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
655 eor x5,x5,x6
656 eor x7,x7,x8
657 eor x9,x9,x10
658 eor x11,x11,x12
659 eor x13,x13,x14
660 eor v0.16b,v0.16b,v20.16b
661 eor x15,x15,x16
662 eor v1.16b,v1.16b,v21.16b
663 eor x17,x17,x19
664 eor v2.16b,v2.16b,v22.16b
665 eor x20,x20,x21
666 eor v3.16b,v3.16b,v23.16b
// v20-v23 = input for the second NEON block.
667 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
668
// Store the scalar block and advance every counter by 4 for the next
// iteration (x28 for the scalar block, v27-v29 for the NEON blocks).
669 stp x5,x7,[x0,#0] // store output
670 add x28,x28,#4 // increment counter
671 stp x9,x11,[x0,#16]
672 add v27.4s,v27.4s,v31.4s // += 4
673 stp x13,x15,[x0,#32]
674 add v28.4s,v28.4s,v31.4s
675 stp x17,x20,[x0,#48]
676 add v29.4s,v29.4s,v31.4s
677 add x0,x0,#64
678
// Store the first NEON block, then reuse v0-v3 to hold the input for the
// third NEON block.
679 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
680 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
681
682 eor v4.16b,v4.16b,v20.16b
683 eor v5.16b,v5.16b,v21.16b
684 eor v6.16b,v6.16b,v22.16b
685 eor v7.16b,v7.16b,v23.16b
686 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
687
688 eor v16.16b,v16.16b,v0.16b
689 eor v17.16b,v17.16b,v1.16b
690 eor v18.16b,v18.16b,v2.16b
691 eor v19.16b,v19.16b,v3.16b
692 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
693
// Flags still from the subs: loop while bytes remain, fall through when the
// length was consumed exactly.
694 b.hi Loop_outer_neon
695
// Epilogue: drop the scratch area and restore callee-saved registers.
696 ldp x19,x20,[x29,#16]
697 add sp,sp,#64
698 ldp x21,x22,[x29,#32]
699 ldp x23,x24,[x29,#48]
700 ldp x25,x26,[x29,#64]
701 ldp x27,x28,[x29,#80]
702 ldp x29,x30,[sp],#96
703 ret
704
// Fewer than 256 bytes remain. Undo the subtraction so that x2 is the
// remaining byte count (1..255), then emit whole blocks in output order and
// finish any partial block byte by byte.
705Ltail_neon:
706 add x2,x2,#256
707 cmp x2,#64
// Under 64 bytes: only the scalar block is needed; reuse the scalar tail
// code in _ChaCha20_ctr32, which also performs the epilogue.
708 b.lo Less_than_64
709
// At least 64 bytes: emit the scalar block in full.
710 add x5,x5,x6,lsl#32 // pack
711 add x7,x7,x8,lsl#32
712 ldp x6,x8,[x1,#0] // load input
713 add x9,x9,x10,lsl#32
714 add x11,x11,x12,lsl#32
715 ldp x10,x12,[x1,#16]
716 add x13,x13,x14,lsl#32
717 add x15,x15,x16,lsl#32
718 ldp x14,x16,[x1,#32]
719 add x17,x17,x19,lsl#32
720 add x20,x20,x21,lsl#32
721 ldp x19,x21,[x1,#48]
722 add x1,x1,#64
723#ifdef __ARMEB__
724 rev x5,x5
725 rev x7,x7
726 rev x9,x9
727 rev x11,x11
728 rev x13,x13
729 rev x15,x15
730 rev x17,x17
731 rev x20,x20
732#endif
733 eor x5,x5,x6
734 eor x7,x7,x8
735 eor x9,x9,x10
736 eor x11,x11,x12
737 eor x13,x13,x14
738 eor x15,x15,x16
739 eor x17,x17,x19
740 eor x20,x20,x21
741
742 stp x5,x7,[x0,#0] // store output
743 add x28,x28,#4 // increment counter
744 stp x9,x11,[x0,#16]
745 stp x13,x15,[x0,#32]
746 stp x17,x20,[x0,#48]
747 add x0,x0,#64
// Flags are still those of the cmp x2,#64 above (nothing since has written
// them): equal means exactly 64 bytes remained, so we are done.
748 b.eq Ldone_neon
749 sub x2,x2,#64
750 cmp x2,#64
// Under 64 bytes left: the partial block uses keystream from v0-v3.
751 b.lo Less_than_128
752
// Second whole block, from the first NEON block v0-v3.
753 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
754 eor v0.16b,v0.16b,v20.16b
755 eor v1.16b,v1.16b,v21.16b
756 eor v2.16b,v2.16b,v22.16b
757 eor v3.16b,v3.16b,v23.16b
758 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
// Flags from the most recent cmp x2,#64.
759 b.eq Ldone_neon
760 sub x2,x2,#64
761 cmp x2,#64
// Under 64 bytes left: the partial block uses keystream from v4-v7.
762 b.lo Less_than_192
763
// Third whole block, from the second NEON block v4-v7.
764 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
765 eor v4.16b,v4.16b,v20.16b
766 eor v5.16b,v5.16b,v21.16b
767 eor v6.16b,v6.16b,v22.16b
768 eor v7.16b,v7.16b,v23.16b
769 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
770 b.eq Ldone_neon
771 sub x2,x2,#64
772
// 1..63 bytes left: they use keystream from the third NEON block v16-v19.
773 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
774 b Last_neon
775
// Each of these spills the keystream block that covers the partial tail into
// the 64-byte stack scratch area.
776Less_than_128:
777 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
778 b Last_neon
779Less_than_192:
780 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
781 b Last_neon
782
783.align 4
// Byte-wise tail. End-relative pointers with a negative index counting up to
// zero: x1 = input end, x4 = scratch end, x0 = output end - 1, x2 = -remaining.
// The -1 on x0 compensates for x2 being incremented before the store.
784Last_neon:
785 sub x0,x0,#1
786 add x1,x1,x2
787 add x0,x0,x2
788 add x4,sp,x2
789 neg x2,x2
790
791Loop_tail_neon:
792 ldrb w10,[x1,x2]
793 ldrb w11,[x4,x2]
794 add x2,x2,#1
795 eor w10,w10,w11
796 strb w10,[x0,x2]
797 cbnz x2,Loop_tail_neon
798
// Overwrite the keystream copy on the stack with zeros before returning.
799 stp xzr,xzr,[sp,#0]
800 stp xzr,xzr,[sp,#16]
801 stp xzr,xzr,[sp,#32]
802 stp xzr,xzr,[sp,#48]
803
// Common exit for the tail paths. The paths that branch here directly never
// wrote keystream to the scratch area, so they skip the wipe.
804Ldone_neon:
805 ldp x19,x20,[x29,#16]
806 add sp,sp,#64
807 ldp x21,x22,[x29,#32]
808 ldp x23,x24,[x29,#48]
809 ldp x25,x26,[x29,#64]
810 ldp x27,x28,[x29,#80]
811 ldp x29,x30,[sp],#96
812 ret
813
814
815.align 5
816ChaCha20_512_neon:
817 stp x29,x30,[sp,#-96]!
818 add x29,sp,#0
819
Robert Sloanc9abfe42018-11-26 12:19:07 -0800820 adrp x5,Lsigma@PAGE
821 add x5,x5,Lsigma@PAGEOFF
Robert Sloan8ff03552017-06-14 12:40:58 -0700822 stp x19,x20,[sp,#16]
823 stp x21,x22,[sp,#32]
824 stp x23,x24,[sp,#48]
825 stp x25,x26,[sp,#64]
826 stp x27,x28,[sp,#80]
827
828L512_or_more_neon:
829 sub sp,sp,#128+64
830
831 ldp x22,x23,[x5] // load sigma
832 ld1 {v24.4s},[x5],#16
833 ldp x24,x25,[x3] // load key
834 ldp x26,x27,[x3,#16]
835 ld1 {v25.4s,v26.4s},[x3]
836 ldp x28,x30,[x4] // load counter
837 ld1 {v27.4s},[x4]
838 ld1 {v31.4s},[x5]
839#ifdef __ARMEB__
840 rev64 v24.4s,v24.4s
841 ror x24,x24,#32
842 ror x25,x25,#32
843 ror x26,x26,#32
844 ror x27,x27,#32
845 ror x28,x28,#32
846 ror x30,x30,#32
847#endif
848 add v27.4s,v27.4s,v31.4s // += 1
849 stp q24,q25,[sp,#0] // off-load key block, invariant part
850 add v27.4s,v27.4s,v31.4s // not typo
851 str q26,[sp,#32]
852 add v28.4s,v27.4s,v31.4s
853 add v29.4s,v28.4s,v31.4s
854 add v30.4s,v29.4s,v31.4s
855 shl v31.4s,v31.4s,#2 // 1 -> 4
856
857 stp d8,d9,[sp,#128+0] // meet ABI requirements
858 stp d10,d11,[sp,#128+16]
859 stp d12,d13,[sp,#128+32]
860 stp d14,d15,[sp,#128+48]
861
862 sub x2,x2,#512 // not typo
863
864Loop_outer_512_neon:
865 mov v0.16b,v24.16b
866 mov v4.16b,v24.16b
867 mov v8.16b,v24.16b
868 mov v12.16b,v24.16b
869 mov v16.16b,v24.16b
870 mov v20.16b,v24.16b
871 mov v1.16b,v25.16b
872 mov w5,w22 // unpack key block
873 mov v5.16b,v25.16b
874 lsr x6,x22,#32
875 mov v9.16b,v25.16b
876 mov w7,w23
877 mov v13.16b,v25.16b
878 lsr x8,x23,#32
879 mov v17.16b,v25.16b
880 mov w9,w24
881 mov v21.16b,v25.16b
882 lsr x10,x24,#32
883 mov v3.16b,v27.16b
884 mov w11,w25
885 mov v7.16b,v28.16b
886 lsr x12,x25,#32
887 mov v11.16b,v29.16b
888 mov w13,w26
889 mov v15.16b,v30.16b
890 lsr x14,x26,#32
891 mov v2.16b,v26.16b
892 mov w15,w27
893 mov v6.16b,v26.16b
894 lsr x16,x27,#32
895 add v19.4s,v3.4s,v31.4s // +4
896 mov w17,w28
897 add v23.4s,v7.4s,v31.4s // +4
898 lsr x19,x28,#32
899 mov v10.16b,v26.16b
900 mov w20,w30
901 mov v14.16b,v26.16b
902 lsr x21,x30,#32
903 mov v18.16b,v26.16b
904 stp q27,q28,[sp,#48] // off-load key block, variable part
905 mov v22.16b,v26.16b
906 str q29,[sp,#80]
907
908 mov x4,#5
909 subs x2,x2,#512
910Loop_upper_neon:
911 sub x4,x4,#1
912 add v0.4s,v0.4s,v1.4s
913 add w5,w5,w9
914 add v4.4s,v4.4s,v5.4s
915 add w6,w6,w10
916 add v8.4s,v8.4s,v9.4s
917 add w7,w7,w11
918 add v12.4s,v12.4s,v13.4s
919 add w8,w8,w12
920 add v16.4s,v16.4s,v17.4s
921 eor w17,w17,w5
922 add v20.4s,v20.4s,v21.4s
923 eor w19,w19,w6
924 eor v3.16b,v3.16b,v0.16b
925 eor w20,w20,w7
926 eor v7.16b,v7.16b,v4.16b
927 eor w21,w21,w8
928 eor v11.16b,v11.16b,v8.16b
929 ror w17,w17,#16
930 eor v15.16b,v15.16b,v12.16b
931 ror w19,w19,#16
932 eor v19.16b,v19.16b,v16.16b
933 ror w20,w20,#16
934 eor v23.16b,v23.16b,v20.16b
935 ror w21,w21,#16
936 rev32 v3.8h,v3.8h
937 add w13,w13,w17
938 rev32 v7.8h,v7.8h
939 add w14,w14,w19
940 rev32 v11.8h,v11.8h
941 add w15,w15,w20
942 rev32 v15.8h,v15.8h
943 add w16,w16,w21
944 rev32 v19.8h,v19.8h
945 eor w9,w9,w13
946 rev32 v23.8h,v23.8h
947 eor w10,w10,w14
948 add v2.4s,v2.4s,v3.4s
949 eor w11,w11,w15
950 add v6.4s,v6.4s,v7.4s
951 eor w12,w12,w16
952 add v10.4s,v10.4s,v11.4s
953 ror w9,w9,#20
954 add v14.4s,v14.4s,v15.4s
955 ror w10,w10,#20
956 add v18.4s,v18.4s,v19.4s
957 ror w11,w11,#20
958 add v22.4s,v22.4s,v23.4s
959 ror w12,w12,#20
960 eor v24.16b,v1.16b,v2.16b
961 add w5,w5,w9
962 eor v25.16b,v5.16b,v6.16b
963 add w6,w6,w10
964 eor v26.16b,v9.16b,v10.16b
965 add w7,w7,w11
966 eor v27.16b,v13.16b,v14.16b
967 add w8,w8,w12
968 eor v28.16b,v17.16b,v18.16b
969 eor w17,w17,w5
970 eor v29.16b,v21.16b,v22.16b
971 eor w19,w19,w6
972 ushr v1.4s,v24.4s,#20
973 eor w20,w20,w7
974 ushr v5.4s,v25.4s,#20
975 eor w21,w21,w8
976 ushr v9.4s,v26.4s,#20
977 ror w17,w17,#24
978 ushr v13.4s,v27.4s,#20
979 ror w19,w19,#24
980 ushr v17.4s,v28.4s,#20
981 ror w20,w20,#24
982 ushr v21.4s,v29.4s,#20
983 ror w21,w21,#24
984 sli v1.4s,v24.4s,#12
985 add w13,w13,w17
986 sli v5.4s,v25.4s,#12
987 add w14,w14,w19
988 sli v9.4s,v26.4s,#12
989 add w15,w15,w20
990 sli v13.4s,v27.4s,#12
991 add w16,w16,w21
992 sli v17.4s,v28.4s,#12
993 eor w9,w9,w13
994 sli v21.4s,v29.4s,#12
995 eor w10,w10,w14
996 add v0.4s,v0.4s,v1.4s
997 eor w11,w11,w15
998 add v4.4s,v4.4s,v5.4s
999 eor w12,w12,w16
1000 add v8.4s,v8.4s,v9.4s
1001 ror w9,w9,#25
1002 add v12.4s,v12.4s,v13.4s
1003 ror w10,w10,#25
1004 add v16.4s,v16.4s,v17.4s
1005 ror w11,w11,#25
1006 add v20.4s,v20.4s,v21.4s
1007 ror w12,w12,#25
1008 eor v24.16b,v3.16b,v0.16b
1009 add w5,w5,w10
1010 eor v25.16b,v7.16b,v4.16b
1011 add w6,w6,w11
1012 eor v26.16b,v11.16b,v8.16b
1013 add w7,w7,w12
1014 eor v27.16b,v15.16b,v12.16b
1015 add w8,w8,w9
1016 eor v28.16b,v19.16b,v16.16b
1017 eor w21,w21,w5
1018 eor v29.16b,v23.16b,v20.16b
1019 eor w17,w17,w6
1020 ushr v3.4s,v24.4s,#24
1021 eor w19,w19,w7
1022 ushr v7.4s,v25.4s,#24
1023 eor w20,w20,w8
1024 ushr v11.4s,v26.4s,#24
1025 ror w21,w21,#16
1026 ushr v15.4s,v27.4s,#24
1027 ror w17,w17,#16
1028 ushr v19.4s,v28.4s,#24
1029 ror w19,w19,#16
1030 ushr v23.4s,v29.4s,#24
1031 ror w20,w20,#16
1032 sli v3.4s,v24.4s,#8
1033 add w15,w15,w21
1034 sli v7.4s,v25.4s,#8
1035 add w16,w16,w17
1036 sli v11.4s,v26.4s,#8
1037 add w13,w13,w19
1038 sli v15.4s,v27.4s,#8
1039 add w14,w14,w20
1040 sli v19.4s,v28.4s,#8
1041 eor w10,w10,w15
1042 sli v23.4s,v29.4s,#8
1043 eor w11,w11,w16
1044 add v2.4s,v2.4s,v3.4s
1045 eor w12,w12,w13
1046 add v6.4s,v6.4s,v7.4s
1047 eor w9,w9,w14
1048 add v10.4s,v10.4s,v11.4s
1049 ror w10,w10,#20
1050 add v14.4s,v14.4s,v15.4s
1051 ror w11,w11,#20
1052 add v18.4s,v18.4s,v19.4s
1053 ror w12,w12,#20
1054 add v22.4s,v22.4s,v23.4s
1055 ror w9,w9,#20
1056 eor v24.16b,v1.16b,v2.16b
1057 add w5,w5,w10
1058 eor v25.16b,v5.16b,v6.16b
1059 add w6,w6,w11
1060 eor v26.16b,v9.16b,v10.16b
1061 add w7,w7,w12
1062 eor v27.16b,v13.16b,v14.16b
1063 add w8,w8,w9
1064 eor v28.16b,v17.16b,v18.16b
1065 eor w21,w21,w5
1066 eor v29.16b,v21.16b,v22.16b
1067 eor w17,w17,w6
1068 ushr v1.4s,v24.4s,#25
1069 eor w19,w19,w7
1070 ushr v5.4s,v25.4s,#25
1071 eor w20,w20,w8
1072 ushr v9.4s,v26.4s,#25
1073 ror w21,w21,#24
1074 ushr v13.4s,v27.4s,#25
1075 ror w17,w17,#24
1076 ushr v17.4s,v28.4s,#25
1077 ror w19,w19,#24
1078 ushr v21.4s,v29.4s,#25
1079 ror w20,w20,#24
1080 sli v1.4s,v24.4s,#7
1081 add w15,w15,w21
1082 sli v5.4s,v25.4s,#7
1083 add w16,w16,w17
1084 sli v9.4s,v26.4s,#7
1085 add w13,w13,w19
1086 sli v13.4s,v27.4s,#7
1087 add w14,w14,w20
1088 sli v17.4s,v28.4s,#7
1089 eor w10,w10,w15
1090 sli v21.4s,v29.4s,#7
1091 eor w11,w11,w16
1092 ext v2.16b,v2.16b,v2.16b,#8
1093 eor w12,w12,w13
1094 ext v6.16b,v6.16b,v6.16b,#8
1095 eor w9,w9,w14
1096 ext v10.16b,v10.16b,v10.16b,#8
1097 ror w10,w10,#25
1098 ext v14.16b,v14.16b,v14.16b,#8
1099 ror w11,w11,#25
1100 ext v18.16b,v18.16b,v18.16b,#8
1101 ror w12,w12,#25
1102 ext v22.16b,v22.16b,v22.16b,#8
1103 ror w9,w9,#25
1104 ext v3.16b,v3.16b,v3.16b,#12
1105 ext v7.16b,v7.16b,v7.16b,#12
1106 ext v11.16b,v11.16b,v11.16b,#12
1107 ext v15.16b,v15.16b,v15.16b,#12
1108 ext v19.16b,v19.16b,v19.16b,#12
1109 ext v23.16b,v23.16b,v23.16b,#12
1110 ext v1.16b,v1.16b,v1.16b,#4
1111 ext v5.16b,v5.16b,v5.16b,#4
1112 ext v9.16b,v9.16b,v9.16b,#4
1113 ext v13.16b,v13.16b,v13.16b,#4
1114 ext v17.16b,v17.16b,v17.16b,#4
1115 ext v21.16b,v21.16b,v21.16b,#4
1116 add v0.4s,v0.4s,v1.4s
1117 add w5,w5,w9
1118 add v4.4s,v4.4s,v5.4s
1119 add w6,w6,w10
1120 add v8.4s,v8.4s,v9.4s
1121 add w7,w7,w11
1122 add v12.4s,v12.4s,v13.4s
1123 add w8,w8,w12
1124 add v16.4s,v16.4s,v17.4s
1125 eor w17,w17,w5
1126 add v20.4s,v20.4s,v21.4s
1127 eor w19,w19,w6
1128 eor v3.16b,v3.16b,v0.16b
1129 eor w20,w20,w7
1130 eor v7.16b,v7.16b,v4.16b
1131 eor w21,w21,w8
1132 eor v11.16b,v11.16b,v8.16b
1133 ror w17,w17,#16
1134 eor v15.16b,v15.16b,v12.16b
1135 ror w19,w19,#16
1136 eor v19.16b,v19.16b,v16.16b
1137 ror w20,w20,#16
1138 eor v23.16b,v23.16b,v20.16b
1139 ror w21,w21,#16
1140 rev32 v3.8h,v3.8h
1141 add w13,w13,w17
1142 rev32 v7.8h,v7.8h
1143 add w14,w14,w19
1144 rev32 v11.8h,v11.8h
1145 add w15,w15,w20
1146 rev32 v15.8h,v15.8h
1147 add w16,w16,w21
1148 rev32 v19.8h,v19.8h
1149 eor w9,w9,w13
1150 rev32 v23.8h,v23.8h
1151 eor w10,w10,w14
1152 add v2.4s,v2.4s,v3.4s
1153 eor w11,w11,w15
1154 add v6.4s,v6.4s,v7.4s
1155 eor w12,w12,w16
1156 add v10.4s,v10.4s,v11.4s
1157 ror w9,w9,#20
1158 add v14.4s,v14.4s,v15.4s
1159 ror w10,w10,#20
1160 add v18.4s,v18.4s,v19.4s
1161 ror w11,w11,#20
1162 add v22.4s,v22.4s,v23.4s
1163 ror w12,w12,#20
1164 eor v24.16b,v1.16b,v2.16b
1165 add w5,w5,w9
1166 eor v25.16b,v5.16b,v6.16b
1167 add w6,w6,w10
1168 eor v26.16b,v9.16b,v10.16b
1169 add w7,w7,w11
1170 eor v27.16b,v13.16b,v14.16b
1171 add w8,w8,w12
1172 eor v28.16b,v17.16b,v18.16b
1173 eor w17,w17,w5
1174 eor v29.16b,v21.16b,v22.16b
1175 eor w19,w19,w6
1176 ushr v1.4s,v24.4s,#20
1177 eor w20,w20,w7
1178 ushr v5.4s,v25.4s,#20
1179 eor w21,w21,w8
1180 ushr v9.4s,v26.4s,#20
1181 ror w17,w17,#24
1182 ushr v13.4s,v27.4s,#20
1183 ror w19,w19,#24
1184 ushr v17.4s,v28.4s,#20
1185 ror w20,w20,#24
1186 ushr v21.4s,v29.4s,#20
1187 ror w21,w21,#24
1188 sli v1.4s,v24.4s,#12
1189 add w13,w13,w17
1190 sli v5.4s,v25.4s,#12
1191 add w14,w14,w19
1192 sli v9.4s,v26.4s,#12
1193 add w15,w15,w20
1194 sli v13.4s,v27.4s,#12
1195 add w16,w16,w21
1196 sli v17.4s,v28.4s,#12
1197 eor w9,w9,w13
1198 sli v21.4s,v29.4s,#12
1199 eor w10,w10,w14
1200 add v0.4s,v0.4s,v1.4s
1201 eor w11,w11,w15
1202 add v4.4s,v4.4s,v5.4s
1203 eor w12,w12,w16
1204 add v8.4s,v8.4s,v9.4s
1205 ror w9,w9,#25
1206 add v12.4s,v12.4s,v13.4s
1207 ror w10,w10,#25
1208 add v16.4s,v16.4s,v17.4s
1209 ror w11,w11,#25
1210 add v20.4s,v20.4s,v21.4s
1211 ror w12,w12,#25
1212 eor v24.16b,v3.16b,v0.16b
1213 add w5,w5,w10
1214 eor v25.16b,v7.16b,v4.16b
1215 add w6,w6,w11
1216 eor v26.16b,v11.16b,v8.16b
1217 add w7,w7,w12
1218 eor v27.16b,v15.16b,v12.16b
1219 add w8,w8,w9
1220 eor v28.16b,v19.16b,v16.16b
1221 eor w21,w21,w5
1222 eor v29.16b,v23.16b,v20.16b
1223 eor w17,w17,w6
1224 ushr v3.4s,v24.4s,#24
1225 eor w19,w19,w7
1226 ushr v7.4s,v25.4s,#24
1227 eor w20,w20,w8
1228 ushr v11.4s,v26.4s,#24
1229 ror w21,w21,#16
1230 ushr v15.4s,v27.4s,#24
1231 ror w17,w17,#16
1232 ushr v19.4s,v28.4s,#24
1233 ror w19,w19,#16
1234 ushr v23.4s,v29.4s,#24
1235 ror w20,w20,#16
1236 sli v3.4s,v24.4s,#8
1237 add w15,w15,w21
1238 sli v7.4s,v25.4s,#8
1239 add w16,w16,w17
1240 sli v11.4s,v26.4s,#8
1241 add w13,w13,w19
1242 sli v15.4s,v27.4s,#8
1243 add w14,w14,w20
1244 sli v19.4s,v28.4s,#8
1245 eor w10,w10,w15
1246 sli v23.4s,v29.4s,#8
1247 eor w11,w11,w16
1248 add v2.4s,v2.4s,v3.4s
1249 eor w12,w12,w13
1250 add v6.4s,v6.4s,v7.4s
1251 eor w9,w9,w14
1252 add v10.4s,v10.4s,v11.4s
1253 ror w10,w10,#20
1254 add v14.4s,v14.4s,v15.4s
1255 ror w11,w11,#20
1256 add v18.4s,v18.4s,v19.4s
1257 ror w12,w12,#20
1258 add v22.4s,v22.4s,v23.4s
1259 ror w9,w9,#20
1260 eor v24.16b,v1.16b,v2.16b
1261 add w5,w5,w10
1262 eor v25.16b,v5.16b,v6.16b
1263 add w6,w6,w11
1264 eor v26.16b,v9.16b,v10.16b
1265 add w7,w7,w12
1266 eor v27.16b,v13.16b,v14.16b
1267 add w8,w8,w9
1268 eor v28.16b,v17.16b,v18.16b
1269 eor w21,w21,w5
1270 eor v29.16b,v21.16b,v22.16b
1271 eor w17,w17,w6
1272 ushr v1.4s,v24.4s,#25
1273 eor w19,w19,w7
1274 ushr v5.4s,v25.4s,#25
1275 eor w20,w20,w8
1276 ushr v9.4s,v26.4s,#25
1277 ror w21,w21,#24
1278 ushr v13.4s,v27.4s,#25
1279 ror w17,w17,#24
1280 ushr v17.4s,v28.4s,#25
1281 ror w19,w19,#24
1282 ushr v21.4s,v29.4s,#25
1283 ror w20,w20,#24
1284 sli v1.4s,v24.4s,#7
1285 add w15,w15,w21
1286 sli v5.4s,v25.4s,#7
1287 add w16,w16,w17
1288 sli v9.4s,v26.4s,#7
1289 add w13,w13,w19
1290 sli v13.4s,v27.4s,#7
1291 add w14,w14,w20
1292 sli v17.4s,v28.4s,#7
1293 eor w10,w10,w15
1294 sli v21.4s,v29.4s,#7
1295 eor w11,w11,w16
1296 ext v2.16b,v2.16b,v2.16b,#8
1297 eor w12,w12,w13
1298 ext v6.16b,v6.16b,v6.16b,#8
1299 eor w9,w9,w14
1300 ext v10.16b,v10.16b,v10.16b,#8
1301 ror w10,w10,#25
1302 ext v14.16b,v14.16b,v14.16b,#8
1303 ror w11,w11,#25
1304 ext v18.16b,v18.16b,v18.16b,#8
1305 ror w12,w12,#25
1306 ext v22.16b,v22.16b,v22.16b,#8
1307 ror w9,w9,#25
1308 ext v3.16b,v3.16b,v3.16b,#4
1309 ext v7.16b,v7.16b,v7.16b,#4
1310 ext v11.16b,v11.16b,v11.16b,#4
1311 ext v15.16b,v15.16b,v15.16b,#4
1312 ext v19.16b,v19.16b,v19.16b,#4
1313 ext v23.16b,v23.16b,v23.16b,#4
1314 ext v1.16b,v1.16b,v1.16b,#12
1315 ext v5.16b,v5.16b,v5.16b,#12
1316 ext v9.16b,v9.16b,v9.16b,#12
1317 ext v13.16b,v13.16b,v13.16b,#12
1318 ext v17.16b,v17.16b,v17.16b,#12
1319 ext v21.16b,v21.16b,v21.16b,#12
1320 cbnz x4,Loop_upper_neon
1321
// Finish the scalar block produced by the loop above.
// Working state: sixteen 32-bit words in w5-w17,w19-w21.
// Saved input state: x22-x28,x30, two 32-bit words per register
// (low half pairs with the even word, high half with the odd word).
1322 add w5,w5,w22 // accumulate key block
1323 add x6,x6,x22,lsr#32
1324 add w7,w7,w23
1325 add x8,x8,x23,lsr#32
1326 add w9,w9,w24
1327 add x10,x10,x24,lsr#32
1328 add w11,w11,w25
1329 add x12,x12,x25,lsr#32
1330 add w13,w13,w26
1331 add x14,x14,x26,lsr#32
1332 add w15,w15,w27
1333 add x16,x16,x27,lsr#32
1334 add w17,w17,w28
1335 add x19,x19,x28,lsr#32
1336 add w20,w20,w30
1337 add x21,x21,x30,lsr#32
1338
// Pack word pairs into 64-bit registers: even word in the low half, odd word
// shifted into the high half. The lsl#32 discards anything above bit 31 of the
// odd word, so a carry out of the 64-bit adds above does not survive.
// The 64 input bytes are loaded from [x1] in between, reusing x6..x21.
1339 add x5,x5,x6,lsl#32 // pack
1340 add x7,x7,x8,lsl#32
1341 ldp x6,x8,[x1,#0] // load input
1342 add x9,x9,x10,lsl#32
1343 add x11,x11,x12,lsl#32
1344 ldp x10,x12,[x1,#16]
1345 add x13,x13,x14,lsl#32
1346 add x15,x15,x16,lsl#32
1347 ldp x14,x16,[x1,#32]
1348 add x17,x17,x19,lsl#32
1349 add x20,x20,x21,lsl#32
1350 ldp x19,x21,[x1,#48]
1351 add x1,x1,#64
// Big-endian builds byte-reverse each packed doubleword before the XOR.
1352#ifdef __ARMEB__
1353 rev x5,x5
1354 rev x7,x7
1355 rev x9,x9
1356 rev x11,x11
1357 rev x13,x13
1358 rev x15,x15
1359 rev x17,x17
1360 rev x20,x20
1361#endif
// output = packed block XOR input
1362 eor x5,x5,x6
1363 eor x7,x7,x8
1364 eor x9,x9,x10
1365 eor x11,x11,x12
1366 eor x13,x13,x14
1367 eor x15,x15,x16
1368 eor x17,x17,x19
1369 eor x20,x20,x21
1370
// Store the 64 output bytes to [x0] while reloading the working registers
// from the saved state for the next scalar block. x28 is bumped first, so
// the w17 copy below picks up the incremented counter word.
1371 stp x5,x7,[x0,#0] // store output
1372 add x28,x28,#1 // increment counter
1373 mov w5,w22 // unpack key block
1374 lsr x6,x22,#32
1375 stp x9,x11,[x0,#16]
1376 mov w7,w23
1377 lsr x8,x23,#32
1378 stp x13,x15,[x0,#32]
1379 mov w9,w24
1380 lsr x10,x24,#32
1381 stp x17,x20,[x0,#48]
1382 add x0,x0,#64
1383 mov w11,w25
1384 lsr x12,x25,#32
1385 mov w13,w26
1386 lsr x14,x26,#32
1387 mov w15,w27
1388 lsr x16,x27,#32
1389 mov w17,w28
1390 lsr x19,x28,#32
1391 mov w20,w30
1392 lsr x21,x30,#32
1393
1394 mov x4,#5 // trip count for Loop_lower_neon
// Loop_lower_neon: repeats until x4 (decremented at the top, tested by the
// cbnz at the bottom) reaches zero. Two independent instruction streams are
// interleaved line by line:
//
//   NEON   - six states, one row per vector: (v0,v1,v2,v3), (v4..v7),
//            (v8..v11), (v12..v15), (v16..v19), (v20..v23), with v24-v29 as
//            temporaries. One pass = one column round + one diagonal round.
//   Scalar - one state in w5-w8 (a), w9-w12 (b), w13-w16 (c),
//            w17,w19,w20,w21 (d). One pass = two column + two diagonal rounds.
//
// Rotate-left amounts are 16, 12, 8, 7:
//   scalar: ror #16 / #20 / #24 / #25
//   NEON:   rev32 on .8h (16); ushr #20 + sli #12; ushr #24 + sli #8;
//           ushr #25 + sli #7
1395Loop_lower_neon:
1396 sub x4,x4,#1
// NEON: column round starts (a += b; d ^= a; d <<<= 16; c += d; ...).
// Scalar: column round #1 on (w5,w9,w13,w17) (w6,w10,w14,w19)
//         (w7,w11,w15,w20) (w8,w12,w16,w21).
1397 add v0.4s,v0.4s,v1.4s
1398 add w5,w5,w9
1399 add v4.4s,v4.4s,v5.4s
1400 add w6,w6,w10
1401 add v8.4s,v8.4s,v9.4s
1402 add w7,w7,w11
1403 add v12.4s,v12.4s,v13.4s
1404 add w8,w8,w12
1405 add v16.4s,v16.4s,v17.4s
1406 eor w17,w17,w5
1407 add v20.4s,v20.4s,v21.4s
1408 eor w19,w19,w6
1409 eor v3.16b,v3.16b,v0.16b
1410 eor w20,w20,w7
1411 eor v7.16b,v7.16b,v4.16b
1412 eor w21,w21,w8
1413 eor v11.16b,v11.16b,v8.16b
1414 ror w17,w17,#16
1415 eor v15.16b,v15.16b,v12.16b
1416 ror w19,w19,#16
1417 eor v19.16b,v19.16b,v16.16b
1418 ror w20,w20,#16
1419 eor v23.16b,v23.16b,v20.16b
1420 ror w21,w21,#16
1421 rev32 v3.8h,v3.8h
1422 add w13,w13,w17
1423 rev32 v7.8h,v7.8h
1424 add w14,w14,w19
1425 rev32 v11.8h,v11.8h
1426 add w15,w15,w20
1427 rev32 v15.8h,v15.8h
1428 add w16,w16,w21
1429 rev32 v19.8h,v19.8h
1430 eor w9,w9,w13
1431 rev32 v23.8h,v23.8h
1432 eor w10,w10,w14
1433 add v2.4s,v2.4s,v3.4s
1434 eor w11,w11,w15
1435 add v6.4s,v6.4s,v7.4s
1436 eor w12,w12,w16
1437 add v10.4s,v10.4s,v11.4s
1438 ror w9,w9,#20
1439 add v14.4s,v14.4s,v15.4s
1440 ror w10,w10,#20
1441 add v18.4s,v18.4s,v19.4s
1442 ror w11,w11,#20
1443 add v22.4s,v22.4s,v23.4s
1444 ror w12,w12,#20
1445 eor v24.16b,v1.16b,v2.16b
1446 add w5,w5,w9
1447 eor v25.16b,v5.16b,v6.16b
1448 add w6,w6,w10
1449 eor v26.16b,v9.16b,v10.16b
1450 add w7,w7,w11
1451 eor v27.16b,v13.16b,v14.16b
1452 add w8,w8,w12
1453 eor v28.16b,v17.16b,v18.16b
1454 eor w17,w17,w5
1455 eor v29.16b,v21.16b,v22.16b
1456 eor w19,w19,w6
1457 ushr v1.4s,v24.4s,#20
1458 eor w20,w20,w7
1459 ushr v5.4s,v25.4s,#20
1460 eor w21,w21,w8
1461 ushr v9.4s,v26.4s,#20
1462 ror w17,w17,#24
1463 ushr v13.4s,v27.4s,#20
1464 ror w19,w19,#24
1465 ushr v17.4s,v28.4s,#20
1466 ror w20,w20,#24
1467 ushr v21.4s,v29.4s,#20
1468 ror w21,w21,#24
1469 sli v1.4s,v24.4s,#12
1470 add w13,w13,w17
1471 sli v5.4s,v25.4s,#12
1472 add w14,w14,w19
1473 sli v9.4s,v26.4s,#12
1474 add w15,w15,w20
1475 sli v13.4s,v27.4s,#12
1476 add w16,w16,w21
1477 sli v17.4s,v28.4s,#12
1478 eor w9,w9,w13
1479 sli v21.4s,v29.4s,#12
1480 eor w10,w10,w14
1481 add v0.4s,v0.4s,v1.4s
1482 eor w11,w11,w15
1483 add v4.4s,v4.4s,v5.4s
1484 eor w12,w12,w16
1485 add v8.4s,v8.4s,v9.4s
1486 ror w9,w9,#25
1487 add v12.4s,v12.4s,v13.4s
1488 ror w10,w10,#25
1489 add v16.4s,v16.4s,v17.4s
1490 ror w11,w11,#25
1491 add v20.4s,v20.4s,v21.4s
1492 ror w12,w12,#25
1493 eor v24.16b,v3.16b,v0.16b
// Scalar: diagonal round #1 starts with the next scalar add, on
//         (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
// NEON is still in the second half of its column round.
1494 add w5,w5,w10
1495 eor v25.16b,v7.16b,v4.16b
1496 add w6,w6,w11
1497 eor v26.16b,v11.16b,v8.16b
1498 add w7,w7,w12
1499 eor v27.16b,v15.16b,v12.16b
1500 add w8,w8,w9
1501 eor v28.16b,v19.16b,v16.16b
1502 eor w21,w21,w5
1503 eor v29.16b,v23.16b,v20.16b
1504 eor w17,w17,w6
1505 ushr v3.4s,v24.4s,#24
1506 eor w19,w19,w7
1507 ushr v7.4s,v25.4s,#24
1508 eor w20,w20,w8
1509 ushr v11.4s,v26.4s,#24
1510 ror w21,w21,#16
1511 ushr v15.4s,v27.4s,#24
1512 ror w17,w17,#16
1513 ushr v19.4s,v28.4s,#24
1514 ror w19,w19,#16
1515 ushr v23.4s,v29.4s,#24
1516 ror w20,w20,#16
1517 sli v3.4s,v24.4s,#8
1518 add w15,w15,w21
1519 sli v7.4s,v25.4s,#8
1520 add w16,w16,w17
1521 sli v11.4s,v26.4s,#8
1522 add w13,w13,w19
1523 sli v15.4s,v27.4s,#8
1524 add w14,w14,w20
1525 sli v19.4s,v28.4s,#8
1526 eor w10,w10,w15
1527 sli v23.4s,v29.4s,#8
1528 eor w11,w11,w16
1529 add v2.4s,v2.4s,v3.4s
1530 eor w12,w12,w13
1531 add v6.4s,v6.4s,v7.4s
1532 eor w9,w9,w14
1533 add v10.4s,v10.4s,v11.4s
1534 ror w10,w10,#20
1535 add v14.4s,v14.4s,v15.4s
1536 ror w11,w11,#20
1537 add v18.4s,v18.4s,v19.4s
1538 ror w12,w12,#20
1539 add v22.4s,v22.4s,v23.4s
1540 ror w9,w9,#20
1541 eor v24.16b,v1.16b,v2.16b
1542 add w5,w5,w10
1543 eor v25.16b,v5.16b,v6.16b
1544 add w6,w6,w11
1545 eor v26.16b,v9.16b,v10.16b
1546 add w7,w7,w12
1547 eor v27.16b,v13.16b,v14.16b
1548 add w8,w8,w9
1549 eor v28.16b,v17.16b,v18.16b
1550 eor w21,w21,w5
1551 eor v29.16b,v21.16b,v22.16b
1552 eor w17,w17,w6
1553 ushr v1.4s,v24.4s,#25
1554 eor w19,w19,w7
1555 ushr v5.4s,v25.4s,#25
1556 eor w20,w20,w8
1557 ushr v9.4s,v26.4s,#25
1558 ror w21,w21,#24
1559 ushr v13.4s,v27.4s,#25
1560 ror w17,w17,#24
1561 ushr v17.4s,v28.4s,#25
1562 ror w19,w19,#24
1563 ushr v21.4s,v29.4s,#25
1564 ror w20,w20,#24
1565 sli v1.4s,v24.4s,#7
1566 add w15,w15,w21
1567 sli v5.4s,v25.4s,#7
1568 add w16,w16,w17
1569 sli v9.4s,v26.4s,#7
1570 add w13,w13,w19
1571 sli v13.4s,v27.4s,#7
1572 add w14,w14,w20
1573 sli v17.4s,v28.4s,#7
1574 eor w10,w10,w15
1575 sli v21.4s,v29.4s,#7
1576 eor w11,w11,w16
// NEON: rotate rows within each vector so the next round works on diagonals:
//       c by 8 bytes, d by 12 bytes, b by 4 bytes.
1577 ext v2.16b,v2.16b,v2.16b,#8
1578 eor w12,w12,w13
1579 ext v6.16b,v6.16b,v6.16b,#8
1580 eor w9,w9,w14
1581 ext v10.16b,v10.16b,v10.16b,#8
1582 ror w10,w10,#25
1583 ext v14.16b,v14.16b,v14.16b,#8
1584 ror w11,w11,#25
1585 ext v18.16b,v18.16b,v18.16b,#8
1586 ror w12,w12,#25
1587 ext v22.16b,v22.16b,v22.16b,#8
1588 ror w9,w9,#25
1589 ext v3.16b,v3.16b,v3.16b,#12
1590 ext v7.16b,v7.16b,v7.16b,#12
1591 ext v11.16b,v11.16b,v11.16b,#12
1592 ext v15.16b,v15.16b,v15.16b,#12
1593 ext v19.16b,v19.16b,v19.16b,#12
1594 ext v23.16b,v23.16b,v23.16b,#12
1595 ext v1.16b,v1.16b,v1.16b,#4
1596 ext v5.16b,v5.16b,v5.16b,#4
1597 ext v9.16b,v9.16b,v9.16b,#4
1598 ext v13.16b,v13.16b,v13.16b,#4
1599 ext v17.16b,v17.16b,v17.16b,#4
1600 ext v21.16b,v21.16b,v21.16b,#4
// NEON: diagonal round (same operation sequence on the rotated rows).
// Scalar: column round #2.
1601 add v0.4s,v0.4s,v1.4s
1602 add w5,w5,w9
1603 add v4.4s,v4.4s,v5.4s
1604 add w6,w6,w10
1605 add v8.4s,v8.4s,v9.4s
1606 add w7,w7,w11
1607 add v12.4s,v12.4s,v13.4s
1608 add w8,w8,w12
1609 add v16.4s,v16.4s,v17.4s
1610 eor w17,w17,w5
1611 add v20.4s,v20.4s,v21.4s
1612 eor w19,w19,w6
1613 eor v3.16b,v3.16b,v0.16b
1614 eor w20,w20,w7
1615 eor v7.16b,v7.16b,v4.16b
1616 eor w21,w21,w8
1617 eor v11.16b,v11.16b,v8.16b
1618 ror w17,w17,#16
1619 eor v15.16b,v15.16b,v12.16b
1620 ror w19,w19,#16
1621 eor v19.16b,v19.16b,v16.16b
1622 ror w20,w20,#16
1623 eor v23.16b,v23.16b,v20.16b
1624 ror w21,w21,#16
1625 rev32 v3.8h,v3.8h
1626 add w13,w13,w17
1627 rev32 v7.8h,v7.8h
1628 add w14,w14,w19
1629 rev32 v11.8h,v11.8h
1630 add w15,w15,w20
1631 rev32 v15.8h,v15.8h
1632 add w16,w16,w21
1633 rev32 v19.8h,v19.8h
1634 eor w9,w9,w13
1635 rev32 v23.8h,v23.8h
1636 eor w10,w10,w14
1637 add v2.4s,v2.4s,v3.4s
1638 eor w11,w11,w15
1639 add v6.4s,v6.4s,v7.4s
1640 eor w12,w12,w16
1641 add v10.4s,v10.4s,v11.4s
1642 ror w9,w9,#20
1643 add v14.4s,v14.4s,v15.4s
1644 ror w10,w10,#20
1645 add v18.4s,v18.4s,v19.4s
1646 ror w11,w11,#20
1647 add v22.4s,v22.4s,v23.4s
1648 ror w12,w12,#20
1649 eor v24.16b,v1.16b,v2.16b
1650 add w5,w5,w9
1651 eor v25.16b,v5.16b,v6.16b
1652 add w6,w6,w10
1653 eor v26.16b,v9.16b,v10.16b
1654 add w7,w7,w11
1655 eor v27.16b,v13.16b,v14.16b
1656 add w8,w8,w12
1657 eor v28.16b,v17.16b,v18.16b
1658 eor w17,w17,w5
1659 eor v29.16b,v21.16b,v22.16b
1660 eor w19,w19,w6
1661 ushr v1.4s,v24.4s,#20
1662 eor w20,w20,w7
1663 ushr v5.4s,v25.4s,#20
1664 eor w21,w21,w8
1665 ushr v9.4s,v26.4s,#20
1666 ror w17,w17,#24
1667 ushr v13.4s,v27.4s,#20
1668 ror w19,w19,#24
1669 ushr v17.4s,v28.4s,#20
1670 ror w20,w20,#24
1671 ushr v21.4s,v29.4s,#20
1672 ror w21,w21,#24
1673 sli v1.4s,v24.4s,#12
1674 add w13,w13,w17
1675 sli v5.4s,v25.4s,#12
1676 add w14,w14,w19
1677 sli v9.4s,v26.4s,#12
1678 add w15,w15,w20
1679 sli v13.4s,v27.4s,#12
1680 add w16,w16,w21
1681 sli v17.4s,v28.4s,#12
1682 eor w9,w9,w13
1683 sli v21.4s,v29.4s,#12
1684 eor w10,w10,w14
1685 add v0.4s,v0.4s,v1.4s
1686 eor w11,w11,w15
1687 add v4.4s,v4.4s,v5.4s
1688 eor w12,w12,w16
1689 add v8.4s,v8.4s,v9.4s
1690 ror w9,w9,#25
1691 add v12.4s,v12.4s,v13.4s
1692 ror w10,w10,#25
1693 add v16.4s,v16.4s,v17.4s
1694 ror w11,w11,#25
1695 add v20.4s,v20.4s,v21.4s
1696 ror w12,w12,#25
1697 eor v24.16b,v3.16b,v0.16b
// Scalar: diagonal round #2 starts with the next scalar add.
1698 add w5,w5,w10
1699 eor v25.16b,v7.16b,v4.16b
1700 add w6,w6,w11
1701 eor v26.16b,v11.16b,v8.16b
1702 add w7,w7,w12
1703 eor v27.16b,v15.16b,v12.16b
1704 add w8,w8,w9
1705 eor v28.16b,v19.16b,v16.16b
1706 eor w21,w21,w5
1707 eor v29.16b,v23.16b,v20.16b
1708 eor w17,w17,w6
1709 ushr v3.4s,v24.4s,#24
1710 eor w19,w19,w7
1711 ushr v7.4s,v25.4s,#24
1712 eor w20,w20,w8
1713 ushr v11.4s,v26.4s,#24
1714 ror w21,w21,#16
1715 ushr v15.4s,v27.4s,#24
1716 ror w17,w17,#16
1717 ushr v19.4s,v28.4s,#24
1718 ror w19,w19,#16
1719 ushr v23.4s,v29.4s,#24
1720 ror w20,w20,#16
1721 sli v3.4s,v24.4s,#8
1722 add w15,w15,w21
1723 sli v7.4s,v25.4s,#8
1724 add w16,w16,w17
1725 sli v11.4s,v26.4s,#8
1726 add w13,w13,w19
1727 sli v15.4s,v27.4s,#8
1728 add w14,w14,w20
1729 sli v19.4s,v28.4s,#8
1730 eor w10,w10,w15
1731 sli v23.4s,v29.4s,#8
1732 eor w11,w11,w16
1733 add v2.4s,v2.4s,v3.4s
1734 eor w12,w12,w13
1735 add v6.4s,v6.4s,v7.4s
1736 eor w9,w9,w14
1737 add v10.4s,v10.4s,v11.4s
1738 ror w10,w10,#20
1739 add v14.4s,v14.4s,v15.4s
1740 ror w11,w11,#20
1741 add v18.4s,v18.4s,v19.4s
1742 ror w12,w12,#20
1743 add v22.4s,v22.4s,v23.4s
1744 ror w9,w9,#20
1745 eor v24.16b,v1.16b,v2.16b
1746 add w5,w5,w10
1747 eor v25.16b,v5.16b,v6.16b
1748 add w6,w6,w11
1749 eor v26.16b,v9.16b,v10.16b
1750 add w7,w7,w12
1751 eor v27.16b,v13.16b,v14.16b
1752 add w8,w8,w9
1753 eor v28.16b,v17.16b,v18.16b
1754 eor w21,w21,w5
1755 eor v29.16b,v21.16b,v22.16b
1756 eor w17,w17,w6
1757 ushr v1.4s,v24.4s,#25
1758 eor w19,w19,w7
1759 ushr v5.4s,v25.4s,#25
1760 eor w20,w20,w8
1761 ushr v9.4s,v26.4s,#25
1762 ror w21,w21,#24
1763 ushr v13.4s,v27.4s,#25
1764 ror w17,w17,#24
1765 ushr v17.4s,v28.4s,#25
1766 ror w19,w19,#24
1767 ushr v21.4s,v29.4s,#25
1768 ror w20,w20,#24
1769 sli v1.4s,v24.4s,#7
1770 add w15,w15,w21
1771 sli v5.4s,v25.4s,#7
1772 add w16,w16,w17
1773 sli v9.4s,v26.4s,#7
1774 add w13,w13,w19
1775 sli v13.4s,v27.4s,#7
1776 add w14,w14,w20
1777 sli v17.4s,v28.4s,#7
1778 eor w10,w10,w15
1779 sli v21.4s,v29.4s,#7
1780 eor w11,w11,w16
// NEON: rotate rows back (c by 8, d by 4, b by 12 bytes); together with the
//       8/12/4 rotation earlier in this pass each row is back in place.
1781 ext v2.16b,v2.16b,v2.16b,#8
1782 eor w12,w12,w13
1783 ext v6.16b,v6.16b,v6.16b,#8
1784 eor w9,w9,w14
1785 ext v10.16b,v10.16b,v10.16b,#8
1786 ror w10,w10,#25
1787 ext v14.16b,v14.16b,v14.16b,#8
1788 ror w11,w11,#25
1789 ext v18.16b,v18.16b,v18.16b,#8
1790 ror w12,w12,#25
1791 ext v22.16b,v22.16b,v22.16b,#8
1792 ror w9,w9,#25
1793 ext v3.16b,v3.16b,v3.16b,#4
1794 ext v7.16b,v7.16b,v7.16b,#4
1795 ext v11.16b,v11.16b,v11.16b,#4
1796 ext v15.16b,v15.16b,v15.16b,#4
1797 ext v19.16b,v19.16b,v19.16b,#4
1798 ext v23.16b,v23.16b,v23.16b,#4
1799 ext v1.16b,v1.16b,v1.16b,#12
1800 ext v5.16b,v5.16b,v5.16b,#12
1801 ext v9.16b,v9.16b,v9.16b,#12
1802 ext v13.16b,v13.16b,v13.16b,#12
1803 ext v17.16b,v17.16b,v17.16b,#12
1804 ext v21.16b,v21.16b,v21.16b,#12
1805 cbnz x4,Loop_lower_neon
1806
// Loop_lower_neon has finished. Finalize the second scalar block
// (w5-w17,w19-w21 with saved state in x22-x28,x30) and the six NEON blocks
// (v0-v23), again interleaving the two streams.
// v24-v29 served as round temporaries, so they are reloaded from [sp,#0..95]
// first. They are then added as follows: v24 to every block's first row,
// v25 to the second, v26 to the third, and v27..v30 (plus v31 for the last
// two blocks) to the fourth.
1807 add w5,w5,w22 // accumulate key block
1808 ldp q24,q25,[sp,#0]
1809 add x6,x6,x22,lsr#32
1810 ldp q26,q27,[sp,#32]
1811 add w7,w7,w23
1812 ldp q28,q29,[sp,#64]
1813 add x8,x8,x23,lsr#32
1814 add v0.4s,v0.4s,v24.4s
1815 add w9,w9,w24
1816 add v4.4s,v4.4s,v24.4s
1817 add x10,x10,x24,lsr#32
1818 add v8.4s,v8.4s,v24.4s
1819 add w11,w11,w25
1820 add v12.4s,v12.4s,v24.4s
1821 add x12,x12,x25,lsr#32
1822 add v16.4s,v16.4s,v24.4s
1823 add w13,w13,w26
1824 add v20.4s,v20.4s,v24.4s
1825 add x14,x14,x26,lsr#32
1826 add v2.4s,v2.4s,v26.4s
1827 add w15,w15,w27
1828 add v6.4s,v6.4s,v26.4s
1829 add x16,x16,x27,lsr#32
1830 add v10.4s,v10.4s,v26.4s
1831 add w17,w17,w28
1832 add v14.4s,v14.4s,v26.4s
1833 add x19,x19,x28,lsr#32
1834 add v18.4s,v18.4s,v26.4s
1835 add w20,w20,w30
1836 add v22.4s,v22.4s,v26.4s
1837 add x21,x21,x30,lsr#32
// Fourth rows: blocks 0-3 take v27, v28, v29, v30; blocks 4 and 5 take
// v27+v31 and v28+v31 (the v31 part is added first, the v27/v28 part below).
1838 add v19.4s,v19.4s,v31.4s // +4
1839 add x5,x5,x6,lsl#32 // pack
1840 add v23.4s,v23.4s,v31.4s // +4
1841 add x7,x7,x8,lsl#32
1842 add v3.4s,v3.4s,v27.4s
1843 ldp x6,x8,[x1,#0] // load input
1844 add v7.4s,v7.4s,v28.4s
1845 add x9,x9,x10,lsl#32
1846 add v11.4s,v11.4s,v29.4s
1847 add x11,x11,x12,lsl#32
1848 add v15.4s,v15.4s,v30.4s
1849 ldp x10,x12,[x1,#16]
1850 add v19.4s,v19.4s,v27.4s
1851 add x13,x13,x14,lsl#32
1852 add v23.4s,v23.4s,v28.4s
1853 add x15,x15,x16,lsl#32
1854 add v1.4s,v1.4s,v25.4s
1855 ldp x14,x16,[x1,#32]
1856 add v5.4s,v5.4s,v25.4s
1857 add x17,x17,x19,lsl#32
1858 add v9.4s,v9.4s,v25.4s
1859 add x20,x20,x21,lsl#32
1860 add v13.4s,v13.4s,v25.4s
1861 ldp x19,x21,[x1,#48]
1862 add v17.4s,v17.4s,v25.4s
1863 add x1,x1,#64
1864 add v21.4s,v21.4s,v25.4s
1865
// Big-endian builds byte-reverse the packed scalar block before the XOR.
1866#ifdef __ARMEB__
1867 rev x5,x5
1868 rev x7,x7
1869 rev x9,x9
1870 rev x11,x11
1871 rev x13,x13
1872 rev x15,x15
1873 rev x17,x17
1874 rev x20,x20
1875#endif
// From here on v24-v27 (and later v0-v15, once their blocks are stored) are
// reused as input buffers. Each ld1 reads 64 bytes and post-increments x1;
// each st1 writes 64 bytes and post-increments x0.
1876 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1877 eor x5,x5,x6
1878 eor x7,x7,x8
1879 eor x9,x9,x10
1880 eor x11,x11,x12
1881 eor x13,x13,x14
1882 eor v0.16b,v0.16b,v24.16b
1883 eor x15,x15,x16
1884 eor v1.16b,v1.16b,v25.16b
1885 eor x17,x17,x19
1886 eor v2.16b,v2.16b,v26.16b
1887 eor x20,x20,x21
1888 eor v3.16b,v3.16b,v27.16b
1889 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1890
// The scalar counter in x28 advances by 7 here; the first scalar block
// earlier in this loop body already advanced it by 1.
1891 stp x5,x7,[x0,#0] // store output
1892 add x28,x28,#7 // increment counter
1893 stp x9,x11,[x0,#16]
1894 stp x13,x15,[x0,#32]
1895 stp x17,x20,[x0,#48]
1896 add x0,x0,#64
1897 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1898
1899 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1900 eor v4.16b,v4.16b,v24.16b
1901 eor v5.16b,v5.16b,v25.16b
1902 eor v6.16b,v6.16b,v26.16b
1903 eor v7.16b,v7.16b,v27.16b
1904 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1905
// v24-v27 are no longer needed as input buffers: restore them from the stack.
1906 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1907 eor v8.16b,v8.16b,v0.16b
1908 ldp q24,q25,[sp,#0]
1909 eor v9.16b,v9.16b,v1.16b
1910 ldp q26,q27,[sp,#32]
1911 eor v10.16b,v10.16b,v2.16b
1912 eor v11.16b,v11.16b,v3.16b
1913 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1914
1915 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1916 eor v12.16b,v12.16b,v4.16b
1917 eor v13.16b,v13.16b,v5.16b
1918 eor v14.16b,v14.16b,v6.16b
1919 eor v15.16b,v15.16b,v7.16b
1920 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1921
1922 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1923 eor v16.16b,v16.16b,v8.16b
1924 eor v17.16b,v17.16b,v9.16b
1925 eor v18.16b,v18.16b,v10.16b
1926 eor v19.16b,v19.16b,v11.16b
1927 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1928
1929 shl v0.4s,v31.4s,#1 // 4 -> 8
1930 eor v20.16b,v20.16b,v12.16b
1931 eor v21.16b,v21.16b,v13.16b
1932 eor v22.16b,v22.16b,v14.16b
1933 eor v23.16b,v23.16b,v15.16b
1934 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1935
1936 add v27.4s,v27.4s,v0.4s // += 8
1937 add v28.4s,v28.4s,v0.4s
1938 add v29.4s,v29.4s,v0.4s
1939 add v30.4s,v30.4s,v0.4s
1940
// NOTE(review): no instruction from Loop_lower_neon down to here sets the
// condition flags, so this branch tests flags produced earlier in the loop
// body - presumably a `subs` on x2 (the adds below adds 512 back); confirm.
1941 b.hs Loop_outer_512_neon
1942
// Loop exit. The adds sets the flags used by the b.eq further down; the
// ushr/ldp/stp in between do not modify flags.
1943 adds x2,x2,#512
1944 ushr v0.4s,v31.4s,#2 // 4 -> 1
1945
1946 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1947 ldp d10,d11,[sp,#128+16]
1948 ldp d12,d13,[sp,#128+32]
1949 ldp d14,d15,[sp,#128+48]
1950
// Overwrite the 96 bytes at [sp,#0..95] with the current v24/v31 contents.
1951 stp q24,q31,[sp,#0] // wipe off-load area
1952 stp q24,q31,[sp,#32]
1953 stp q24,q31,[sp,#64]
1954
1955 b.eq Ldone_512_neon // x2 == 0 after the adds: nothing left
1956
// Bytes remain. With x2 >= 192 (unsigned) continue at Loop_outer_neon,
// otherwise at Loop_outer; both labels are defined earlier in the file.
// The vector subs and the add to sp do not affect the flags set by cmp.
1957 cmp x2,#192
1958 sub v27.4s,v27.4s,v0.4s // -= 1
1959 sub v28.4s,v28.4s,v0.4s
1960 sub v29.4s,v29.4s,v0.4s
1961 add sp,sp,#128 // release the 128 bytes below the d8-d15 slots
1962 b.hs Loop_outer_neon
1963
// Zero v25-v30 before handing over to Loop_outer.
1964 eor v25.16b,v25.16b,v25.16b
1965 eor v26.16b,v26.16b,v26.16b
1966 eor v27.16b,v27.16b,v27.16b
1967 eor v28.16b,v28.16b,v28.16b
1968 eor v29.16b,v29.16b,v29.16b
1969 eor v30.16b,v30.16b,v30.16b
1970 b Loop_outer
1971
// Ldone_512_neon: function exit, reached once no input bytes remain.
// Reloads x19-x28 from the frame at [x29,#16..#95] and releases the 128+64
// bytes of local stack (off-load area plus the d8-d15 slots, which were
// already reloaded before branching here). It then pops x29/x30, frees the
// 96-byte frame with a post-indexed load, and returns.
1972Ldone_512_neon:
1973 ldp x19,x20,[x29,#16]
1974 add sp,sp,#128+64
1975 ldp x21,x22,[x29,#32]
1976 ldp x23,x24,[x29,#48]
1977 ldp x25,x26,[x29,#64]
1978 ldp x27,x28,[x29,#80]
1979 ldp x29,x30,[sp],#96
1980 ret
1981
Robert Sloan726e9d12018-09-11 11:45:04 -07001982#endif // !OPENSSL_NO_ASM