#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9	3xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	3.33		1.70
# Cortex-A53		8.40/+80%	4.72		4.72(*)
# Cortex-A57		8.06/+43%	4.90		4.43(**)
# Denver		4.50/+82%	2.63		2.67(*)
# X-Gene		9.50/+46%	8.82		8.89(*)
# Mongoose		8.00/+44%	3.64		3.25
# Kryo			8.17/+50%	4.83		4.65
#
# (*)	doubling the interleave factor is not expected to help all
#	processors, only those with higher NEON latency and higher
#	instruction issue rate;
# (**)	the expected improvement was actually higher;

$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

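# AUTOLOAD below is a catch-all for otherwise-undefined "&opcode_suffix()"
# calls: it turns the underscore into a dot and emits the arguments as one
# assembly line, prefixing a numeric last argument with "#" so it reads as
# an immediate. For example, &add_32("x5","x5","x6") emits "add.32 x5,x5,x6",
# which the post-processing loop at the bottom of this file rewrites to the
# 32-bit form "add w5,w5,w6".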
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

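# Register layout: x0..x4 carry the ChaCha20_ctr32 arguments; the 16-word
# state is unpacked into the 32-bit halves of @x[0..15] (x18 is skipped as
# the platform register), while @d[0..7] keep the key block packed two
# words per 64-bit register so it can be reloaded and accumulated cheaply.
#
# ROUND emits four ChaCha20 quarter-rounds interleaved four-ways. Each
# quarter-round on (a,b,c,d) is the standard sequence
#
#	a += b; d ^= a; d = ror(d,16);
#	c += d; b ^= c; b = ror(b,20);
#	a += b; d ^= a; d = ror(d,24);
#	c += d; b ^= c; b = ror(b,25);
#
# i.e. right-rotations by 16/20/24/25, equivalent to the left-rotations
# by 16/12/8/7 in RFC 7539.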
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	"&ror_32	(@x[$d1],@x[$d1],16)",
	"&ror_32	(@x[$d2],@x[$d2],16)",
	"&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	"&ror_32	(@x[$b1],@x[$b1],20)",
	"&ror_32	(@x[$b2],@x[$b2],20)",
	"&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	"&ror_32	(@x[$d1],@x[$d1],24)",
	"&ror_32	(@x[$d2],@x[$d2],24)",
	"&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	"&ror_32	(@x[$b1],@x[$b1],25)",
	"&ror_32	(@x[$b2],@x[$b2],25)",
	"&ror_32	(@x[$b3],@x[$b3],25)"
    );
}

$code.=<<___;
#include <openssl/arm_arch.h>

.text

.extern	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

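// ChaCha20_ctr32(out,inp,len,key,counter): buffers shorter than 192 bytes
// take the scalar path below; for longer inputs OPENSSL_armcap_P is
// consulted and execution branches to ChaCha20_neon when NEON is available.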
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

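// Twenty rounds, as ten iterations of one column round (0,4,8,12)
// followed by one diagonal round (0,5,10,15).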
	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

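// The input, output and keystream (at sp) pointers were biased above by
// the remaining length, which was then negated, so one counter register
// indexes all three and reaches zero exactly at the end of the partial
// block; the output pointer was pre-decremented because strb below
// indexes with the already-incremented counter.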
.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";

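# NEONROUND emits one quarter-round on four-lane vectors. NEON has no
# rotate instruction, so rotation by 16 is done with rev32 on .8h lanes,
# and rotations by 12/8/7 as ushr-by-(32-n) into a temporary followed by
# sli (shift-left-and-insert) by n. The trailing ext instructions rotate
# the b/c/d rows across lanes to switch between column and diagonal
# rounds ($odd selects the direction).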
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}

$code.=<<___;

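// ChaCha20_neon processes 256 bytes per outer iteration: three blocks in
// NEON registers interleaved with a fourth in the integer ALU (the
// 3xNEON+1xIALU configuration from the performance table above).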
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
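// ChaCha20_512_neon keeps eight blocks in flight per 512-byte iteration:
// six in NEON registers and two in the integer ALU. The scalar block runs
// at twice the round rate, so .Loop_upper_neon retires one IALU block
// while the NEON threads are halfway through their rounds, and
// .Loop_lower_neon retires the second as they finish.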
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	mov	$C5,@K[2]
	str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]

#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

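# Map the intermediate dialect above onto real AArch64 assembly, line by
# line: ".32" IALU ops drop the suffix and switch x-registers to their
# w-register halves; vector eor/ext/mov use the .16b arrangement; ld1.8
# and st1.8 become plain ld1/st1 on .16b lanes; ldr/str/ldp/stp of .4s
# vectors use q-register names; and rev32.16 becomes rev32 on .8h lanes.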
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;	# flush