#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte, measured on large buffers.
#
#		IALU/gcc-4.9	3xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7	5.50/+49%	3.33		1.70
# Cortex-A53	8.40/+80%	4.72		4.72(*)
# Cortex-A57	8.06/+43%	4.90		4.43(**)
# Denver	4.50/+82%	2.63		2.67(*)
# X-Gene	9.50/+46%	8.82		8.89(*)
#
# (*)	doubling the interleave factor is not expected to help all
#	processors, only those with higher NEON latency and higher
#	instruction issue rate;
# (**)	the expected improvement was actually higher;

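# The interleave factors in the table above refer, roughly, to how many
# ChaCha20 blocks are in flight per iteration of the main loops below:
# "3xNEON+1xIALU" computes three blocks in vector registers interleaved
# with one block in general-purpose registers (256 bytes per
# .Loop_outer_neon iteration), while "6xNEON+2xIALU", taken for inputs of
# 512 bytes and more, interleaves six vector blocks with two integer
# blocks.
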
$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

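# A call such as &add_32("x5","x5","x10") has no explicit definition, so it
# falls through to AUTOLOAD and appends "\tadd.32\tx5,x5,x10\n" to $code;
# &ror_32("x8","x8",16) likewise appends "\tror.32\tx8,x8,#16\n" (a numeric
# final argument gets the immediate '#' prefix).  The ".32" pseudo-suffix
# is rewritten into plain instructions on w-registers by the output loop at
# the bottom of this file.
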
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	"&ror_32	(@x[$d1],@x[$d1],16)",
	"&ror_32	(@x[$d2],@x[$d2],16)",
	"&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	"&ror_32	(@x[$b1],@x[$b1],20)",
	"&ror_32	(@x[$b2],@x[$b2],20)",
	"&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	"&ror_32	(@x[$d1],@x[$d1],24)",
	"&ror_32	(@x[$d2],@x[$d2],24)",
	"&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	"&ror_32	(@x[$b1],@x[$b1],25)",
	"&ror_32	(@x[$b2],@x[$b2],25)",
	"&ror_32	(@x[$b3],@x[$b3],25)"
    );
}

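# ROUND() emits four interleaved ChaCha20 quarter-rounds.  On state words
# (a,b,c,d) each quarter-round is
#
#	a += b; d ^= a; d = ror(d,16);	(i.e. rotl by 16)
#	c += d; b ^= c; b = ror(b,20);	(rotl by 12)
#	a += b; d ^= a; d = ror(d,24);	(rotl by  8)
#	c += d; b ^= c; b = ror(b,25);	(rotl by  7)
#
# The map() expressions above rotate the indices within each group of
# four, so ROUND(0,4,8,12) covers all four columns of the 4x4 state
# matrix, (0,4,8,12) through (3,7,11,15), and ROUND(0,5,10,15) covers the
# four diagonals.
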
$code.=<<___;
#include <openssl/arm_arch.h>

.text

.extern	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
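						// ("expand 32-byte k", the ChaCha20 sigma constant, as little-endian bytes)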
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
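# Each .Loop iteration is one ChaCha20 "double round": a column round over
# words (0,4,8,12) followed by a diagonal round over (0,5,10,15); ten
# iterations give the full twenty rounds.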
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}

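# NEON has no vector rotate instruction, so NEONROUND() synthesizes each
# rotation from shifts: rotl(x,12) is ushr into a temporary by 20 followed
# by sli (shift left and insert) by 12, and similarly for the 8- and 7-bit
# rotations.  The 16-bit rotation is cheaper: rev32 on .8h elements swaps
# the two 16-bit halves of every 32-bit lane.  The trailing ext's rotate
# the lanes of b, c and d so that the same code alternates between column
# and diagonal rounds; $odd selects the diagonalizing or undiagonalizing
# shuffle.
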
$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
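# One iteration of .Loop_neon is a full double round for four blocks at
# once: the three vector quarter-round streams and the scalar double round
# (@thread3) are interleaved instruction by instruction, so three blocks
# progress in NEON registers while a fourth progresses in the integer
# registers.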
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	mov	$C5,@K[2]
	str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]

#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

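# The loop below rewrites the accumulated pseudo-assembler in $code into
# real AArch64 syntax.  For example, "add.32 x5,x5,x10" becomes
# "add w5,w5,w10" (the .32 suffix selects 32-bit register operands);
# "eor v0.4s,v0.4s,v1.4s" becomes "eor v0.16b,v0.16b,v1.16b", since
# logical and byte-shuffle instructions take the .16b arrangement;
# "ld1.8"/"st1.8" drop their suffix and take .16b operands; "v24.4s"
# operands of ldr/ldp/str/stp become "q24"; and "rev32.16" becomes rev32
# on .8h elements.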
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;	# flush