#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
# other assembly modules. Just like aesv8-armx.pl this module
# supports both AArch32 and AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
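# As a rough sketch of what the 2x aggregation buys (the same identity
# is spelled out again just before .Loop_mod2x_v8 below): two input
# blocks are folded per reduction,
#
#	Xi+2 = [H*(Ii+1 + Xi+1)] mod P
#	     = [(H*Ii+1) + H^2*(Ii + Xi)] mod P
#
# so the main loop pays for one reduction per 32 bytes of input rather
# than one per 16-byte block.
#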
# Current performance in cycles per processed byte:
#
#		PMULL[2]	32-bit NEON(*)
# Apple A7	0.92		5.62
# Cortex-A53	1.01		8.39
# Cortex-A57	1.17		7.61
# Denver	0.71		6.02
#
# (*)	presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include <openssl/arm_arch.h>

.text
___
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch	armv8-a+crypto
#endif
___
$code.=".fpu	neon\n.code	32\n" if ($flavour !~ /64/);
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with degrees of twisted H;
#		H is twisted to handle reverse bitness of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		data is opaque to the outside world (which allows the
#		code to be optimized independently);
#
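# A minimal caller sketch (in C, following the prototypes documented in
# this file; the sequence below is illustrative only - the real driver
# is the surrounding GCM code, and `inp'/`len' are placeholders):
#
#	u128 Htable[16];
#	u64  Xi[2] = { 0, 0 };
#	gcm_init_v8(Htable, H);			/* H = E(K,0^128)	 */
#	gcm_ghash_v8(Xi, Htable, inp, len);	/* len multiple of 16	 */
#	gcm_gmult_v8(Xi, Htable);		/* single-block multiply */
#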
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0]		@ store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value Xi;
#
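# In broad strokes (a sketch of the Karatsuba split used throughout this
# file), each GF(2^128) multiply below is built from three 64x64
# carry-less multiplies plus the two-phase reduction by the 0xc2 constant:
#
#	Xl  = H.lo·Xi.lo
#	Xh  = H.hi·Xi.hi
#	Xm  = (H.lo+H.hi)·(Xi.lo+Xi.hi) + Xl + Xh	(middle 64x64 term)
#	Xi' = reduce(Xh:Xm:Xl)				(fold 256 bits mod P)
#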
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	table precomputed in gcm_init_v8;
#		current hash value Xi;
#		pointer to input data;
#		length of input data in bytes, which must be divisible by
#		the block size;
# output:	next hash value Xi;
#
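# In outline (a reading aid, not extra code): the input is consumed 32
# bytes at a time in .Loop_mod2x_v8 using H^2 and the aggregation
# identity above; $inc is zeroed just before the final iteration so the
# pointer never runs past inp[len], and a single trailing 16-byte block,
# if any, is handled by .Lodd_tail_v8.
#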
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
___
$code.=<<___ if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ loaded value would have
						@ to be rotated in order to
						@ make it appear as in
						@ algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ $inc is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ last block[s] are actually
						@ loaded twice, but last
						@ copy is not processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?

	vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64		{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$In,$t1,$t1,#8
	vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___ if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___
}
$code.=<<___;
.asciz	"GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

if ($flavour =~ /64/) {				######## 64-bit code
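    # The substitutions below mechanically lower the unified 32-bit NEON
    # syntax used above to AArch64 assembly. As a worked example (assuming
    # the q0..q14 assignments from the top of this file, i.e. $Xl=q0,
    # $H=q12, $IN=q3), the line
    #
    #	vpmull.p64	$Xl,$H,$IN
    #
    # should come out as
    #
    #	pmull	v0.1q,v20.1d,v3.1d
    #
    # once registers are renamed (q8-q14 map to v16-v22) and the vector
    # arrangement suffixes are fixed up.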
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o			or
	s/vshr\.s/sshr\.s/o		or
	s/vshr/ushr/o			or
	s/^(\s+)v/$1/o			or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;					# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {					######## 32-bit code
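    # The 32-bit path converts the code back to traditional ARM NEON
    # syntax. The 64-bit polynomial multiply is hand-assembled into .byte
    # directives by unvpmullp64 below, since (as its comment notes) older
    # assemblers can't be relied on to accept the mnemonic or the .inst
    # directive.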
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
	    # Emit the raw encoding byte by byte; ARMv7 instructions are
	    # always encoded little-endian. The correct solution would be
	    # the .inst directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }

    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o						or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT; # enforce flush