Blame - arch/arm/crypto/bsaes-armv7.pl - kernel/msm-4.9

blob: a4d3856e7d2477ec6379f1f10a9121e32f36a974 [file] [log] [blame]

Ard Biesheuvel	e4e7f10	2013-09-16 18:31:38 +0200	[diff] [blame]	1	#!/usr/bin/env perl
				2
				3	# ====================================================================
				4	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
				5	# project. The module is, however, dual licensed under OpenSSL and
				6	# CRYPTOGAMS licenses depending on where you obtain it. For further
				7	# details see http://www.openssl.org/~appro/cryptogams/.
				8	#
				9	# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
				10	# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
				11	# granted.
				12	# ====================================================================
				13
				14	# Bit-sliced AES for ARM NEON
				15	#
				16	# February 2012.
				17	#
				18	# This implementation is direct adaptation of bsaes-x86_64 module for
				19	# ARM NEON. Except that this module is endian-neutral [in sense that
				20	# it can be compiled for either endianness] by courtesy of vld1.8's
				21	# neutrality. Initial version doesn't implement interface to OpenSSL,
				22	# only low-level primitives and unsupported entry points, just enough
				23	# to collect performance results, which for Cortex-A8 core are:
				24	#
				25	# encrypt 19.5 cycles per byte processed with 128-bit key
				26	# decrypt 22.1 cycles per byte processed with 128-bit key
				27	# key conv. 440 cycles per 128-bit key/0.18 of 8x block
				28	#
				29	# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
				30	# which is [much] worse than anticipated (for further details see
				31	# http://www.openssl.org/~appro/Snapdragon-S4.html).
				32	#
				33	# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
				34	# manages in 20.0 cycles].
				35	#
				36	# When comparing to x86_64 results keep in mind that NEON unit is
				37	# [mostly] single-issue and thus can't [fully] benefit from
				38	# instruction-level parallelism. And when comparing to aes-armv4
				39	# results keep in mind key schedule conversion overhead (see
				40	# bsaes-x86_64.pl for further details)...
				41	#
				42	# <appro@openssl.org>
				43
				44	# April-August 2013
				45	#
				46	# Add CBC, CTR and XTS subroutines, adapt for kernel use.
				47	#
				48	# <ard.biesheuvel@linaro.org>
				49
				50	while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
					# $output is now the first argument that looks like "name.ext", or a
					# false value when no such argument was given.  Redirect STDOUT only
					# when there is a file name to open; otherwise keep the STDOUT the
					# caller provided (e.g. a shell redirection) instead of issuing an open
					# that can only fail.  The three-argument form keeps the name from being
					# parsed as part of the mode, and a genuine failure is now reported.
				51	if (defined($output) && $output ne "") { open(STDOUT,'>',$output) or die "can't open $output: $!"; }
				52
				53	my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
				54	my @XMM=map("q$_",(0..15));
				55
				56	{
				57	my ($key,$rounds,$const)=("r4","r5","r6");
				58
				59	sub Dlo() {
					# Map a NEON quad-register name to its lower-numbered D register:
					# the first "qN" found in the argument (N matched by [1]?[0-9]) yields
					# "d" . (2*N); an argument without such a name yields "".
					# The sub keeps its empty prototype, so it has to be called with a
					# leading ampersand, &Dlo(...), as the `...` templates in this file do.
					shift=~m|q([1]?[0-9])|?"d".($1*2):"";
					}
				60	sub Dhi() {
					# Map a NEON quad-register name to its higher-numbered D register:
					# the first "qN" found in the argument (N matched by [1]?[0-9]) yields
					# "d" . (2*N+1); an argument without such a name yields "".
					# The sub keeps its empty prototype, so it has to be called with a
					# leading ampersand, &Dhi(...), as the `...` templates in this file do.
					shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";
					}
				61
				62	sub Sbox {
					# S-box step used by the encrypt path: basis change in, inversion via
					# Inv_GF256, basis change out.  All arguments are register names:
					# @_[0..7] are b0-b7, @_[8..15] are the scratch registers forwarded
					# unchanged to Inv_GF256 (its t0-t3 followed by its s0-s3).
				63	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
				64	# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
				65	my @bits=@_[0..7];
				66	my @scratch=@_[8..15];
				68	InBasisChange(@bits);
				69	Inv_GF256(@bits[6,5,0,3,7,1,4,2],@scratch);
				70	OutBasisChange(@bits[7,1,4,2,6,5,0,3]);
				71	}
				72
				73	sub InBasisChange {
					# Appends to $code a fixed run of 13 veor instructions over the eight
					# register names passed in @_[0..7].  Every instruction writes one of
					# those same registers, so no scratch register is needed.  Sbox calls
					# this first, with the registers in natural order.
				74	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
				75	# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
				76	my @b=@_[0..7];
				77	$code.=<<___;
				78	veor @b[2], @b[2], @b[1]
				79	veor @b[5], @b[5], @b[6]
				80	veor @b[3], @b[3], @b[0]
				81	veor @b[6], @b[6], @b[2]
				82	veor @b[5], @b[5], @b[0]
				83
				84	veor @b[6], @b[6], @b[3]
				85	veor @b[3], @b[3], @b[7]
				86	veor @b[7], @b[7], @b[5]
				87	veor @b[3], @b[3], @b[4]
				88	veor @b[4], @b[4], @b[5]
				89
				90	veor @b[2], @b[2], @b[7]
				91	veor @b[3], @b[3], @b[1]
				92	veor @b[1], @b[1], @b[5]
				93	___
				94	}
				95
				96	sub OutBasisChange {
					# Appends to $code a fixed run of 11 veor instructions over the eight
					# register names passed in @_[0..7]; only those registers are written.
					# Sbox calls this last, passing its registers in a permuted order, so
					# the b0-b7 below are positions in the argument list.
				97	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
				98	# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
				99	my @b=@_[0..7];
				100	$code.=<<___;
				101	veor @b[0], @b[0], @b[6]
				102	veor @b[1], @b[1], @b[4]
				103	veor @b[4], @b[4], @b[6]
				104	veor @b[2], @b[2], @b[0]
				105	veor @b[6], @b[6], @b[1]
				106
				107	veor @b[1], @b[1], @b[5]
				108	veor @b[5], @b[5], @b[3]
				109	veor @b[3], @b[3], @b[7]
				110	veor @b[7], @b[7], @b[5]
				111	veor @b[2], @b[2], @b[5]
				112
				113	veor @b[4], @b[4], @b[7]
				114	___
				115	}
				116
				117	sub InvSbox {
					# Inverse S-box step used by the decrypt path: inverse basis change in,
					# inversion via Inv_GF256, inverse basis change out.  All arguments are
					# register names: @_[0..7] are b0-b7, @_[8..15] are the scratch
					# registers forwarded unchanged to Inv_GF256 (its t0-t3, then s0-s3).
				118	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
				119	# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
				120	my @bits=@_[0..7];
				121	my @scratch=@_[8..15];
				123	InvInBasisChange(@bits);
				124	Inv_GF256(@bits[5,1,2,6,3,7,0,4],@scratch);
				125	InvOutBasisChange(@bits[3,7,0,4,5,1,2,6]);
				126	}
				127
				128	sub InvInBasisChange { # OutBasisChange in reverse (with twist)
					# Appends to $code the 12 veor instructions below.  The eight register
					# names in @_[0..7] are first permuted by the slice on the next line, so
					# the @b[i] indices in the template refer to the permuted order.  Only
					# those eight registers are written.
				129	my @b=@_[5,1,2,6,3,7,0,4];
				130	$code.=<<___;
				131	veor @b[1], @b[1], @b[7]
				132	veor @b[4], @b[4], @b[7]
				133
				134	veor @b[7], @b[7], @b[5]
				135	veor @b[1], @b[1], @b[3]
				136	veor @b[2], @b[2], @b[5]
				137	veor @b[3], @b[3], @b[7]
				138
				139	veor @b[6], @b[6], @b[1]
				140	veor @b[2], @b[2], @b[0]
				141	veor @b[5], @b[5], @b[3]
				142	veor @b[4], @b[4], @b[6]
				143	veor @b[0], @b[0], @b[6]
				144	veor @b[1], @b[1], @b[4]
				145	___
				146	}
				147
				148	sub InvOutBasisChange { # InBasisChange in reverse
					# Appends to $code the 13 veor instructions below.  The eight register
					# names in @_[0..7] are first permuted by the slice on the next line, so
					# the @b[i] indices in the template refer to the permuted order.  Only
					# those eight registers are written.
				149	my @b=@_[2,5,7,3,6,1,0,4];
				150	$code.=<<___;
				151	veor @b[1], @b[1], @b[5]
				152	veor @b[2], @b[2], @b[7]
				153
				154	veor @b[3], @b[3], @b[1]
				155	veor @b[4], @b[4], @b[5]
				156	veor @b[7], @b[7], @b[5]
				157	veor @b[3], @b[3], @b[4]
				158	veor @b[5], @b[5], @b[0]
				159	veor @b[3], @b[3], @b[7]
				160	veor @b[6], @b[6], @b[2]
				161	veor @b[2], @b[2], @b[1]
				162	veor @b[6], @b[6], @b[3]
				163
				164	veor @b[3], @b[3], @b[0]
				165	veor @b[5], @b[5], @b[6]
				166	___
				167	}
				168
				169	sub Mul_GF4 {
					# Appends 7 instructions (4 veor, 3 vand) to $code.  Arguments are six
					# register names: $x0,$x1 (overwritten with the result), $y0,$y1 (read
					# only) and two temporaries $t0,$t1 -- the banner below mentions only
					# t0, but the template writes both.
				170	#;*************************************************************
				171	#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
				172	#;*************************************************************
				173	my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
				174	$code.=<<___;
				175	veor $t0, $y0, $y1
				176	vand $t0, $t0, $x0
				177	veor $x0, $x0, $x1
				178	vand $t1, $x1, $y0
				179	vand $x0, $x0, $y1
				180	veor $x1, $t1, $t0
				181	veor $x0, $x0, $t1
				182	___
				183	}
				184
				185	sub Mul_GF4_N { # not used, see next subroutine
					# Appends 7 instructions (4 veor, 3 vand) to $code.  Arguments are five
					# register names: $x0,$x1 (overwritten with the result), $y0,$y1 (read
					# only) and a single temporary $t0.
				186	# multiply and scale by N
				187	my ($x0,$x1,$y0,$y1,$t0)=@_;
				188	$code.=<<___;
				189	veor $t0, $y0, $y1
				190	vand $t0, $t0, $x0
				191	veor $x0, $x0, $x1
				192	vand $x1, $x1, $y0
				193	vand $x0, $x0, $y1
				194	veor $x1, $x1, $x0
				195	veor $x0, $x0, $t0
				196	___
				197	}
				198
				199	sub Mul_GF4_N_GF4 {
					# Appends 14 instructions to $code: two independent operand sets,
					# ($x0,$x1,$y0,$y1,$t0) and ($x2,$x3,$y2,$y3,$t1), processed in
					# lock-step with their instructions alternating.  The x registers of
					# each set are overwritten, the y registers are only read, and each set
					# uses one temporary.
				200	# interleaved Mul_GF4_N and Mul_GF4
				201	my ($x0,$x1,$y0,$y1,$t0,
				202	$x2,$x3,$y2,$y3,$t1)=@_;
				203	$code.=<<___;
				204	veor $t0, $y0, $y1
				205	veor $t1, $y2, $y3
				206	vand $t0, $t0, $x0
				207	vand $t1, $t1, $x2
				208	veor $x0, $x0, $x1
				209	veor $x2, $x2, $x3
				210	vand $x1, $x1, $y0
				211	vand $x3, $x3, $y2
				212	vand $x0, $x0, $y1
				213	vand $x2, $x2, $y3
				214	veor $x1, $x1, $x0
				215	veor $x2, $x2, $x3
				216	veor $x0, $x0, $t0
				217	veor $x3, $x3, $t1
				218	___
				219	}
				220	sub Mul_GF16_2 {
					# Arguments are 16 register names: @_[0..7] = x0-x7 (overwritten with
					# the result), @_[8..11] = y0-y3 and @_[12..15] = t0-t3 (scratch).
					# The x registers are handled as two groups of four, x0-x3 and x4-x7,
					# each combined with the same y0-y3 through Mul_GF4 and Mul_GF4_N_GF4.
					# y0 and y1 are XORed with y2 and y3 twice over the course of the sub
					# (once per group), and nothing else writes a y register, so all four
					# y registers hold their original values again on exit.
				221	my @x=@_[0..7];
				222	my @y=@_[8..11];
				223	my @t=@_[12..15];
				224	$code.=<<___;
				225	veor @t[0], @x[0], @x[2]
				226	veor @t[1], @x[1], @x[3]
				227	___
				228	&Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
				229	$code.=<<___;
				230	veor @y[0], @y[0], @y[2]
				231	veor @y[1], @y[1], @y[3]
				232	___
				233	Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
				234	@x[2], @x[3], @y[2], @y[3], @t[2]);
				235	$code.=<<___;
				236	veor @x[0], @x[0], @t[0]
				237	veor @x[2], @x[2], @t[0]
				238	veor @x[1], @x[1], @t[1]
				239	veor @x[3], @x[3], @t[1]
				240
				241	veor @t[0], @x[4], @x[6]
				242	veor @t[1], @x[5], @x[7]
				243	___
				244	&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
				245	@x[6], @x[7], @y[2], @y[3], @t[2]);
				246	$code.=<<___;
				247	veor @y[0], @y[0], @y[2]
				248	veor @y[1], @y[1], @y[3]
				249	___
				250	&Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
				251	$code.=<<___;
				252	veor @x[4], @x[4], @t[0]
				253	veor @x[6], @x[6], @t[0]
				254	veor @x[5], @x[5], @t[1]
				255	veor @x[7], @x[7], @t[1]
				256	___
				257	}
				258	sub Inv_GF256 {
					# Arguments are 16 register names: @_[0..7] = x0-x7 (in/out),
					# @_[8..11] = t0-t3 and @_[12..15] = s0-s3 (both scratch).
					# The template below reads the x registers without writing them and
					# leaves four intermediate values in s3, s2, s1 and t1 (see the note
					# after it).  Those four are then handed to Mul_GF16_2 as its y
					# operands, with s0, t0, t2, t3 as its temporaries; that call is what
					# writes the result into the x registers.
				259	#;********************************************************************
				260	#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
				261	#;********************************************************************
				262	my @x=@_[0..7];
				263	my @t=@_[8..11];
				264	my @s=@_[12..15];
				265	# direct optimizations from hardware
				266	$code.=<<___;
				267	veor @t[3], @x[4], @x[6]
				268	veor @t[2], @x[5], @x[7]
				269	veor @t[1], @x[1], @x[3]
				270	veor @s[1], @x[7], @x[6]
				271	vmov @t[0], @t[2]
				272	veor @s[0], @x[0], @x[2]
				273
				274	vorr @t[2], @t[2], @t[1]
				275	veor @s[3], @t[3], @t[0]
				276	vand @s[2], @t[3], @s[0]
				277	vorr @t[3], @t[3], @s[0]
				278	veor @s[0], @s[0], @t[1]
				279	vand @t[0], @t[0], @t[1]
				280	veor @t[1], @x[3], @x[2]
				281	vand @s[3], @s[3], @s[0]
				282	vand @s[1], @s[1], @t[1]
				283	veor @t[1], @x[4], @x[5]
				284	veor @s[0], @x[1], @x[0]
				285	veor @t[3], @t[3], @s[1]
				286	veor @t[2], @t[2], @s[1]
				287	vand @s[1], @t[1], @s[0]
				288	vorr @t[1], @t[1], @s[0]
				289	veor @t[3], @t[3], @s[3]
				290	veor @t[0], @t[0], @s[1]
				291	veor @t[2], @t[2], @s[2]
				292	veor @t[1], @t[1], @s[3]
				293	veor @t[0], @t[0], @s[2]
				294	vand @s[0], @x[7], @x[3]
				295	veor @t[1], @t[1], @s[2]
				296	vand @s[1], @x[6], @x[2]
				297	vand @s[2], @x[5], @x[1]
				298	vorr @s[3], @x[4], @x[0]
				299	veor @t[3], @t[3], @s[0]
				300	veor @t[1], @t[1], @s[2]
				301	veor @t[0], @t[0], @s[3]
				302	veor @t[2], @t[2], @s[1]
				303
				304	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
				305
				306	@ new smaller inversion
				307
				308	vand @s[2], @t[3], @t[1]
				309	vmov @s[0], @t[0]
				310
				311	veor @s[1], @t[2], @s[2]
				312	veor @s[3], @t[0], @s[2]
				313	veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
				314
				315	vbsl @s[1], @t[1], @t[0]
				316	vbsl @s[3], @t[3], @t[2]
				317	veor @t[3], @t[3], @t[2]
				318
				319	vbsl @s[0], @s[1], @s[2]
				320	vbsl @t[0], @s[2], @s[1]
				321
				322	vand @s[2], @s[0], @s[3]
				323	veor @t[1], @t[1], @t[0]
				324
				325	veor @s[2], @s[2], @t[3]
				326	___
				327	# output in s3, s2, s1, t1
				328
				329	# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
				330
				331	# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
				332	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
				333
				334	### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
				335	}
				336
				337	# AES linear components
				338
				339	sub ShiftRows {
					# Arguments are register names: @_[0..7] = x0-x7 (in/out),
					# @_[8..11] = t0-t3 (scratch) and, as the LAST argument, the register
					# holding the byte-permutation mask (taken with pop, so callers pass it
					# after the scratch registers).
					# Despite the name, the emitted code does two things per x register:
					# it XORs the register with a 128-bit value loaded from [$key] -- eight
					# loads in total, all with write-back, so $key advances by eight
					# q-register slots -- and then permutes the bytes of that XOR result
					# with a pair of vtbl.8 lookups indexed by the mask.  $key is the
					# lexical from the enclosing scope (r4).
				340	my @x=@_[0..7];
				341	my @t=@_[8..11];
				342	my $mask=pop;
				343	$code.=<<___;
				344	vldmia $key!, {@t[0]-@t[3]}
				345	veor @t[0], @t[0], @x[0]
				346	veor @t[1], @t[1], @x[1]
				347	vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
				348	vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
				349	vldmia $key!, {@t[0]}
				350	veor @t[2], @t[2], @x[2]
				351	vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
				352	vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
				353	vldmia $key!, {@t[1]}
				354	veor @t[3], @t[3], @x[3]
				355	vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
				356	vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
				357	vldmia $key!, {@t[2]}
				358	vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
				359	vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
				360	vldmia $key!, {@t[3]}
				361	veor @t[0], @t[0], @x[4]
				362	veor @t[1], @t[1], @x[5]
				363	vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
				364	vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
				365	veor @t[2], @t[2], @x[6]
				366	vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
				367	vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
				368	veor @t[3], @t[3], @x[7]
				369	vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
				370	vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
				371	vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
				372	vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
				373	___
				374	}
				375
				376	sub MixColumns {
					# Arguments are register names plus one optional flag:
					#   @_[0..7]   x0-x7, state registers (in/out)
					#   @_[8..15]  t0-t7, scratch
					#   @_[16]     optional; when true, the tail guarded by "if ($inv)" is
					#              emitted instead of the default one (InvMixColumns
					#              passes 1).  The common first part is emitted either way.
					# "@_[16]" is a one-element slice; assigned to a scalar it yields that
					# element, i.e. the same value as $_[16].
				377	# modified to emit output in order suitable for feeding back to aesenc[last]
				378	my @x=@_[0..7];
				379	my @t=@_[8..15];
				380	my $inv=@_[16]; # optional
				381	$code.=<<___;
				382	vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
				383	vext.8 @t[1], @x[1], @x[1], #12
				384	veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
				385	vext.8 @t[2], @x[2], @x[2], #12
				386	veor @x[1], @x[1], @t[1]
				387	vext.8 @t[3], @x[3], @x[3], #12
				388	veor @x[2], @x[2], @t[2]
				389	vext.8 @t[4], @x[4], @x[4], #12
				390	veor @x[3], @x[3], @t[3]
				391	vext.8 @t[5], @x[5], @x[5], #12
				392	veor @x[4], @x[4], @t[4]
				393	vext.8 @t[6], @x[6], @x[6], #12
				394	veor @x[5], @x[5], @t[5]
				395	vext.8 @t[7], @x[7], @x[7], #12
				396	veor @x[6], @x[6], @t[6]
				397
				398	veor @t[1], @t[1], @x[0]
				399	veor @x[7], @x[7], @t[7]
				400	vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
				401	veor @t[2], @t[2], @x[1]
				402	veor @t[0], @t[0], @x[7]
				403	veor @t[1], @t[1], @x[7]
				404	vext.8 @x[1], @x[1], @x[1], #8
				405	veor @t[5], @t[5], @x[4]
				406	veor @x[0], @x[0], @t[0]
				407	veor @t[6], @t[6], @x[5]
				408	veor @x[1], @x[1], @t[1]
				409	vext.8 @t[0], @x[4], @x[4], #8
				410	veor @t[4], @t[4], @x[3]
				411	vext.8 @t[1], @x[5], @x[5], #8
				412	veor @t[7], @t[7], @x[6]
				413	vext.8 @x[4], @x[3], @x[3], #8
				414	veor @t[3], @t[3], @x[2]
				415	vext.8 @x[5], @x[7], @x[7], #8
				416	veor @t[4], @t[4], @x[7]
				417	vext.8 @x[3], @x[6], @x[6], #8
				418	veor @t[3], @t[3], @x[7]
				419	vext.8 @x[6], @x[2], @x[2], #8
				420	veor @x[7], @t[1], @t[5]
				421	___
				422	$code.=<<___ if (!$inv);
				423	veor @x[2], @t[0], @t[4]
				424	veor @x[4], @x[4], @t[3]
				425	veor @x[5], @x[5], @t[7]
				426	veor @x[3], @x[3], @t[6]
				427	@ vmov @x[2], @t[0]
				428	veor @x[6], @x[6], @t[2]
				429	@ vmov @x[7], @t[1]
				430	___
				431	$code.=<<___ if ($inv);
				432	veor @t[3], @t[3], @x[4]
				433	veor @x[5], @x[5], @t[7]
				434	veor @x[2], @x[3], @t[6]
				435	veor @x[3], @t[0], @t[4]
				436	veor @x[4], @x[6], @t[2]
				437	vmov @x[6], @t[3]
				438	@ vmov @x[7], @t[1]
				439	___
				440	}
				441
				442	sub InvMixColumns_orig {
					# Earlier InvMixColumns variant, emitted as four sections labelled
					# "multiplication by 0x0e / 0x0b / 0x0d / 0x09".  No call to this sub
					# appears in the visible part of the file; the decrypt path calls
					# InvMixColumns instead.
					# Arguments are register names: @_[0..7] = x0-x7, @_[8..15] = t0-t7.
					# After the first section the x registers are re-indexed as @y (see the
					# slice between the two templates).  Note that the final instructions
					# write to @XMM[0..7] directly rather than to the registers passed in.
				443	my @x=@_[0..7];
				444	my @t=@_[8..15];
				445
				446	$code.=<<___;
				447	@ multiplication by 0x0e
				448	vext.8 @t[7], @x[7], @x[7], #12
				449	vmov @t[2], @x[2]
				450	veor @x[2], @x[2], @x[5] @ 2 5
				451	veor @x[7], @x[7], @x[5] @ 7 5
				452	vext.8 @t[0], @x[0], @x[0], #12
				453	vmov @t[5], @x[5]
				454	veor @x[5], @x[5], @x[0] @ 5 0 [1]
				455	veor @x[0], @x[0], @x[1] @ 0 1
				456	vext.8 @t[1], @x[1], @x[1], #12
				457	veor @x[1], @x[1], @x[2] @ 1 25
				458	veor @x[0], @x[0], @x[6] @ 01 6 [2]
				459	vext.8 @t[3], @x[3], @x[3], #12
				460	veor @x[1], @x[1], @x[3] @ 125 3 [4]
				461	veor @x[2], @x[2], @x[0] @ 25 016 [3]
				462	veor @x[3], @x[3], @x[7] @ 3 75
				463	veor @x[7], @x[7], @x[6] @ 75 6 [0]
				464	vext.8 @t[6], @x[6], @x[6], #12
				465	vmov @t[4], @x[4]
				466	veor @x[6], @x[6], @x[4] @ 6 4
				467	veor @x[4], @x[4], @x[3] @ 4 375 [6]
				468	veor @x[3], @x[3], @x[7] @ 375 756=36
				469	veor @x[6], @x[6], @t[5] @ 64 5 [7]
				470	veor @x[3], @x[3], @t[2] @ 36 2
				471	vext.8 @t[5], @t[5], @t[5], #12
				472	veor @x[3], @x[3], @t[4] @ 362 4 [5]
				473	___
				474	my @y = @x[7,5,0,2,1,3,4,6];
				475	$code.=<<___;
				476	@ multiplication by 0x0b
				477	veor @y[1], @y[1], @y[0]
				478	veor @y[0], @y[0], @t[0]
				479	vext.8 @t[2], @t[2], @t[2], #12
				480	veor @y[1], @y[1], @t[1]
				481	veor @y[0], @y[0], @t[5]
				482	vext.8 @t[4], @t[4], @t[4], #12
				483	veor @y[1], @y[1], @t[6]
				484	veor @y[0], @y[0], @t[7]
				485	veor @t[7], @t[7], @t[6] @ clobber t[7]
				486
				487	veor @y[3], @y[3], @t[0]
				488	veor @y[1], @y[1], @y[0]
				489	vext.8 @t[0], @t[0], @t[0], #12
				490	veor @y[2], @y[2], @t[1]
				491	veor @y[4], @y[4], @t[1]
				492	vext.8 @t[1], @t[1], @t[1], #12
				493	veor @y[2], @y[2], @t[2]
				494	veor @y[3], @y[3], @t[2]
				495	veor @y[5], @y[5], @t[2]
				496	veor @y[2], @y[2], @t[7]
				497	vext.8 @t[2], @t[2], @t[2], #12
				498	veor @y[3], @y[3], @t[3]
				499	veor @y[6], @y[6], @t[3]
				500	veor @y[4], @y[4], @t[3]
				501	veor @y[7], @y[7], @t[4]
				502	vext.8 @t[3], @t[3], @t[3], #12
				503	veor @y[5], @y[5], @t[4]
				504	veor @y[7], @y[7], @t[7]
				505	veor @t[7], @t[7], @t[5] @ clobber t[7] even more
				506	veor @y[3], @y[3], @t[5]
				507	veor @y[4], @y[4], @t[4]
				508
				509	veor @y[5], @y[5], @t[7]
				510	vext.8 @t[4], @t[4], @t[4], #12
				511	veor @y[6], @y[6], @t[7]
				512	veor @y[4], @y[4], @t[7]
				513
				514	veor @t[7], @t[7], @t[5]
				515	vext.8 @t[5], @t[5], @t[5], #12
				516
				517	@ multiplication by 0x0d
				518	veor @y[4], @y[4], @y[7]
				519	veor @t[7], @t[7], @t[6] @ restore t[7]
				520	veor @y[7], @y[7], @t[4]
				521	vext.8 @t[6], @t[6], @t[6], #12
				522	veor @y[2], @y[2], @t[0]
				523	veor @y[7], @y[7], @t[5]
				524	vext.8 @t[7], @t[7], @t[7], #12
				525	veor @y[2], @y[2], @t[2]
				526
				527	veor @y[3], @y[3], @y[1]
				528	veor @y[1], @y[1], @t[1]
				529	veor @y[0], @y[0], @t[0]
				530	veor @y[3], @y[3], @t[0]
				531	veor @y[1], @y[1], @t[5]
				532	veor @y[0], @y[0], @t[5]
				533	vext.8 @t[0], @t[0], @t[0], #12
				534	veor @y[1], @y[1], @t[7]
				535	veor @y[0], @y[0], @t[6]
				536	veor @y[3], @y[3], @y[1]
				537	veor @y[4], @y[4], @t[1]
				538	vext.8 @t[1], @t[1], @t[1], #12
				539
				540	veor @y[7], @y[7], @t[7]
				541	veor @y[4], @y[4], @t[2]
				542	veor @y[5], @y[5], @t[2]
				543	veor @y[2], @y[2], @t[6]
				544	veor @t[6], @t[6], @t[3] @ clobber t[6]
				545	vext.8 @t[2], @t[2], @t[2], #12
				546	veor @y[4], @y[4], @y[7]
				547	veor @y[3], @y[3], @t[6]
				548
				549	veor @y[6], @y[6], @t[6]
				550	veor @y[5], @y[5], @t[5]
				551	vext.8 @t[5], @t[5], @t[5], #12
				552	veor @y[6], @y[6], @t[4]
				553	vext.8 @t[4], @t[4], @t[4], #12
				554	veor @y[5], @y[5], @t[6]
				555	veor @y[6], @y[6], @t[7]
				556	vext.8 @t[7], @t[7], @t[7], #12
				557	veor @t[6], @t[6], @t[3] @ restore t[6]
				558	vext.8 @t[3], @t[3], @t[3], #12
				559
				560	@ multiplication by 0x09
				561	veor @y[4], @y[4], @y[1]
				562	veor @t[1], @t[1], @y[1] @ t[1]=y[1]
				563	veor @t[0], @t[0], @t[5] @ clobber t[0]
				564	vext.8 @t[6], @t[6], @t[6], #12
				565	veor @t[1], @t[1], @t[5]
				566	veor @y[3], @y[3], @t[0]
				567	veor @t[0], @t[0], @y[0] @ t[0]=y[0]
				568	veor @t[1], @t[1], @t[6]
				569	veor @t[6], @t[6], @t[7] @ clobber t[6]
				570	veor @y[4], @y[4], @t[1]
				571	veor @y[7], @y[7], @t[4]
				572	veor @y[6], @y[6], @t[3]
				573	veor @y[5], @y[5], @t[2]
				574	veor @t[4], @t[4], @y[4] @ t[4]=y[4]
				575	veor @t[3], @t[3], @y[3] @ t[3]=y[3]
				576	veor @t[5], @t[5], @y[5] @ t[5]=y[5]
				577	veor @t[2], @t[2], @y[2] @ t[2]=y[2]
				578	veor @t[3], @t[3], @t[7]
				579	veor @XMM[5], @t[5], @t[6]
				580	veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
				581	veor @XMM[2], @t[2], @t[6]
				582	veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
				583
				584	vmov @XMM[0], @t[0]
				585	vmov @XMM[1], @t[1]
				586	@ vmov @XMM[2], @t[2]
				587	vmov @XMM[3], @t[3]
				588	vmov @XMM[4], @t[4]
				589	@ vmov @XMM[5], @t[5]
				590	@ vmov @XMM[6], @t[6]
				591	@ vmov @XMM[7], @t[7]
				592	___
				593	}
				594
				595	sub InvMixColumns {
					# Arguments are register names: @_[0..7] = x0-x7 (in/out) and
					# @_[8..15] = t0-t7 (scratch).  Works in two stages, following the
					# matrix factorisation quoted below: the template emitted here applies
					# the right-hand (05-00-04-00) factor, then MixColumns is called with
					# its optional third argument set to 1 to emit the remaining factor.
				596	my @x=@_[0..7];
				597	my @t=@_[8..15];
				598
				599	# Thanks to Jussi Kivilinna for providing pointer to
				600	#
				601	# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
				602	# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
				603	# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
				604	# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
				605
				606	$code.=<<___;
				607	@ multiplication by 0x05-0x00-0x04-0x00
				608	vext.8 @t[0], @x[0], @x[0], #8
				609	vext.8 @t[6], @x[6], @x[6], #8
				610	vext.8 @t[7], @x[7], @x[7], #8
				611	veor @t[0], @t[0], @x[0]
				612	vext.8 @t[1], @x[1], @x[1], #8
				613	veor @t[6], @t[6], @x[6]
				614	vext.8 @t[2], @x[2], @x[2], #8
				615	veor @t[7], @t[7], @x[7]
				616	vext.8 @t[3], @x[3], @x[3], #8
				617	veor @t[1], @t[1], @x[1]
				618	vext.8 @t[4], @x[4], @x[4], #8
				619	veor @t[2], @t[2], @x[2]
				620	vext.8 @t[5], @x[5], @x[5], #8
				621	veor @t[3], @t[3], @x[3]
				622	veor @t[4], @t[4], @x[4]
				623	veor @t[5], @t[5], @x[5]
				624
				625	veor @x[0], @x[0], @t[6]
				626	veor @x[1], @x[1], @t[6]
				627	veor @x[2], @x[2], @t[0]
				628	veor @x[4], @x[4], @t[2]
				629	veor @x[3], @x[3], @t[1]
				630	veor @x[1], @x[1], @t[7]
				631	veor @x[2], @x[2], @t[7]
				632	veor @x[4], @x[4], @t[6]
				633	veor @x[5], @x[5], @t[3]
				634	veor @x[3], @x[3], @t[6]
				635	veor @x[6], @x[6], @t[4]
				636	veor @x[4], @x[4], @t[7]
				637	veor @x[5], @x[5], @t[7]
				638	veor @x[7], @x[7], @t[5]
				639	___
				640	&MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
				641	}
				642
				643	sub swapmove {
					# Emit the six-instruction masked exchange between two registers:
					#   tmp = ((hi >> shift) ^ lo) & mask;  lo ^= tmp;  hi ^= tmp << shift
					# Positional arguments: first register, second register, shift count,
					# mask register, temporary register (all register names except the
					# count).  The lexicals avoid the names $a/$b, which Perl reserves for
					# sort comparators; the emitted text is unaffected by the naming.
				644	my ($lo,$hi,$shift,$mask,$tmp)=@_;
				645	$code.=<<___;
				646	vshr.u64 $tmp, $hi, #$shift
				647	veor $tmp, $tmp, $lo
				648	vand $tmp, $tmp, $mask
				649	veor $lo, $lo, $tmp
				650	vshl.u64 $tmp, $tmp, #$shift
				651	veor $hi, $hi, $tmp
				652	___
				653	}
				654	sub swapmove2x {
					# Two swapmove exchanges, on the pairs ($a0,$b0) and ($a1,$b1), sharing
					# one shift count $n and one mask register, with their instructions
					# interleaved.  Needs one temporary per pair ($t0 and $t1); emits 12
					# instructions.
				655	my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
				656	$code.=<<___;
				657	vshr.u64 $t0, $b0, #$n
				658	vshr.u64 $t1, $b1, #$n
				659	veor $t0, $t0, $a0
				660	veor $t1, $t1, $a1
				661	vand $t0, $t0, $mask
				662	vand $t1, $t1, $mask
				663	veor $a0, $a0, $t0
				664	vshl.u64 $t0, $t0, #$n
				665	veor $a1, $a1, $t1
				666	vshl.u64 $t1, $t1, #$n
				667	veor $b0, $b0, $t0
				668	veor $b1, $b1, $t1
				669	___
				670	}
				671
				672	sub bitslice {
					# Arguments are 12 register names.  @_[0..7] are taken in REVERSE order
					# as @x; @_[8..11] are scratch: $t0 and $t1 hold the masks (0x55 and
					# 0x33 first, then 0x0f is loaded into $t0 for the last stage) and
					# $t2/$t3 are the swapmove2x temporaries.
					# Three stages of swapmove2x are emitted, with shift counts 1, 2 and 4,
					# two calls per stage so that all eight registers take part in each.
				673	my @x=reverse(@_[0..7]);
				674	my ($t0,$t1,$t2,$t3)=@_[8..11];
				675	$code.=<<___;
				676	vmov.i8 $t0,#0x55 @ compose .LBS0
				677	vmov.i8 $t1,#0x33 @ compose .LBS1
				678	___
				679	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
				680	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
				681	$code.=<<___;
				682	vmov.i8 $t0,#0x0f @ compose .LBS2
				683	___
				684	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
				685	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
				686
				687	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
				688	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
				689	}
				690
					# Assembly preamble (ABI/arch macros and the .arch/.fpu/.syntax
					# directives) followed by the head of _bsaes_decrypt8: each of q0-q7 is
					# XORed with the round-0 key loaded from [$key] and byte-permuted with
					# the .LM0ISR table.  Register roles come from the enclosing scope:
					# $key is r4, $rounds is r5, $const is r6.
				691	$code.=<<___;
				692	#ifndef __KERNEL__
				693	# include "arm_arch.h"
				694
				695	# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
				696	# define VFP_ABI_POP vldmia sp!,{d8-d15}
				697	# define VFP_ABI_FRAME 0x40
				698	#else
				699	# define VFP_ABI_PUSH
				700	# define VFP_ABI_POP
				701	# define VFP_ABI_FRAME 0
				702	# define BSAES_ASM_EXTENDED_KEY
				703	# define XTS_CHAIN_TWEAK
Ard Biesheuvel	001eabf	2015-02-26 07:22:05 +0000	[diff] [blame]	704	# define __ARM_ARCH__ __LINUX_ARM_ARCH__
				705	# define __ARM_MAX_ARCH__ 7
Ard Biesheuvel	e4e7f10	2013-09-16 18:31:38 +0200	[diff] [blame]	706	#endif
				707
				708	#ifdef __thumb__
				709	# define adrl adr
				710	#endif
				711
Ard Biesheuvel	001eabf	2015-02-26 07:22:05 +0000	[diff] [blame]	712	#if __ARM_MAX_ARCH__>=7
				713	.arch armv7-a
				714	.fpu neon
				715
Ard Biesheuvel	e4e7f10	2013-09-16 18:31:38 +0200	[diff] [blame]	716	.text
				717	.syntax unified @ ARMv7-capable assembler is expected to handle this
				718	#ifdef __thumb2__
				719	.thumb
				720	#else
				721	.code 32
				722	#endif
				723
Ard Biesheuvel	e4e7f10	2013-09-16 18:31:38 +0200	[diff] [blame]	724	.type _bsaes_decrypt8,%function
				725	.align 4
				726	_bsaes_decrypt8:
				727	adr $const,_bsaes_decrypt8
				728	vldmia $key!, {@XMM[9]} @ round 0 key
				729	add $const,$const,#.LM0ISR-_bsaes_decrypt8
				730
				731	vldmia $const!, {@XMM[8]} @ .LM0ISR
				732	veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
				733	veor @XMM[11], @XMM[1], @XMM[9]
				734	vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				735	vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				736	veor @XMM[12], @XMM[2], @XMM[9]
				737	vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				738	vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				739	veor @XMM[13], @XMM[3], @XMM[9]
				740	vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
				741	vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
				742	veor @XMM[14], @XMM[4], @XMM[9]
				743	vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
				744	vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
				745	veor @XMM[15], @XMM[5], @XMM[9]
				746	vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
				747	vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
				748	veor @XMM[10], @XMM[6], @XMM[9]
				749	vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
				750	vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
				751	veor @XMM[11], @XMM[7], @XMM[9]
				752	vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				753	vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				754	vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				755	vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				756	___
					# Bit-slice the state in q0-q7, using q8-q11 as scratch.
				757	&bitslice (@XMM[0..7, 8..11]);
				758	$code.=<<___;
				759	sub $rounds,$rounds,#1
				760	b .Ldec_sbox
				761	.align 4
				762	.Ldec_loop:
				763	___
					# Loop body.  The first pass enters at .Ldec_sbox and so skips the
					# ShiftRows template; q12 is the permutation mask (last argument).
				764	&ShiftRows (@XMM[0..7, 8..12]);
				765	$code.=".Ldec_sbox:\n";
				766	&InvSbox (@XMM[0..7, 8..15]);
				767	$code.=<<___;
				768	subs $rounds,$rounds,#1
				769	bcc .Ldec_done
				770	___
					# The register order 0,1,6,4,2,7,3,5 matches the output order documented
					# on InvSbox.
				771	&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
				772	$code.=<<___;
				773	vldmia $const, {@XMM[12]} @ .LISR
				774	ite eq @ Thumb2 thing, sanity check in ARM
				775	addeq $const,$const,#0x10
				776	bne .Ldec_loop
				777	vldmia $const, {@XMM[12]} @ .LISRM0
				778	b .Ldec_loop
				779	.align 4
				780	.Ldec_done:
				781	___
					# Run the bitslice transposition again, on the permuted register order,
					# before the last round key is applied.
				782	&bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
				783	$code.=<<___;
				784	vldmia $key, {@XMM[8]} @ last round key
				785	veor @XMM[6], @XMM[6], @XMM[8]
				786	veor @XMM[4], @XMM[4], @XMM[8]
				787	veor @XMM[2], @XMM[2], @XMM[8]
				788	veor @XMM[7], @XMM[7], @XMM[8]
				789	veor @XMM[3], @XMM[3], @XMM[8]
				790	veor @XMM[5], @XMM[5], @XMM[8]
				791	veor @XMM[0], @XMM[0], @XMM[8]
				792	veor @XMM[1], @XMM[1], @XMM[8]
				793	bx lr
				794	.size _bsaes_decrypt8,.-_bsaes_decrypt8
				795
				796	.type _bsaes_const,%object
				797	.align 6
				798	_bsaes_const:
				799	.LM0ISR: @ InvShiftRows constants
				800	.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
				801	.LISR:
				802	.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
				803	.LISRM0:
				804	.quad 0x01040b0e0205080f, 0x0306090c00070a0d
				805	.LM0SR: @ ShiftRows constants
				806	.quad 0x0a0e02060f03070b, 0x0004080c05090d01
				807	.LSR:
				808	.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
				809	.LSRM0:
				810	.quad 0x0304090e00050a0f, 0x01060b0c0207080d
				811	.LM0:
				812	.quad 0x02060a0e03070b0f, 0x0004080c0105090d
				813	.LREVM0SR:
				814	.quad 0x090d01050c000408, 0x03070b0f060a0e02
				815	.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
				816	.align 6
				817	.size _bsaes_const,.-_bsaes_const
				818
				819	.type _bsaes_encrypt8,%function
				820	.align 4
				821	_bsaes_encrypt8:
				822	adr $const,_bsaes_encrypt8
				823	vldmia $key!, {@XMM[9]} @ round 0 key
				824	sub $const,$const,#_bsaes_encrypt8-.LM0SR
				825
				826	vldmia $const!, {@XMM[8]} @ .LM0SR
				827	_bsaes_encrypt8_alt:
				828	veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
				829	veor @XMM[11], @XMM[1], @XMM[9]
				830	vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				831	vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				832	veor @XMM[12], @XMM[2], @XMM[9]
				833	vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				834	vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				835	veor @XMM[13], @XMM[3], @XMM[9]
				836	vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
				837	vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
				838	veor @XMM[14], @XMM[4], @XMM[9]
				839	vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
				840	vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
				841	veor @XMM[15], @XMM[5], @XMM[9]
				842	vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
				843	vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
				844	veor @XMM[10], @XMM[6], @XMM[9]
				845	vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
				846	vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
				847	veor @XMM[11], @XMM[7], @XMM[9]
				848	vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				849	vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				850	vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				851	vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				852	_bsaes_encrypt8_bitslice:
				853	___
					# Bit-slice the state in q0-q7, using q8-q11 as scratch.
				854	&bitslice (@XMM[0..7, 8..11]);
				855	$code.=<<___;
				856	sub $rounds,$rounds,#1
				857	b .Lenc_sbox
				858	.align 4
				859	.Lenc_loop:
				860	___
					# Loop body.  The first pass enters at .Lenc_sbox and so skips the
					# ShiftRows template; q12 is the permutation mask (last argument).
				861	&ShiftRows (@XMM[0..7, 8..12]);
				862	$code.=".Lenc_sbox:\n";
				863	&Sbox (@XMM[0..7, 8..15]);
				864	$code.=<<___;
				865	subs $rounds,$rounds,#1
				866	bcc .Lenc_done
				867	___
					# The register order 0,1,4,6,3,7,2,5 matches the output order documented
					# on Sbox.
				868	&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
				869	$code.=<<___;
				870	vldmia $const, {@XMM[12]} @ .LSR
				871	ite eq @ Thumb2 thing, samity check in ARM
				872	addeq $const,$const,#0x10
				873	bne .Lenc_loop
				874	vldmia $const, {@XMM[12]} @ .LSRM0
				875	b .Lenc_loop
				876	.align 4
				877	.Lenc_done:
				878	___
				879	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
				880	&bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
				881	$code.=<<___;
				882	vldmia $key, {@XMM[8]} @ last round key
				883	veor @XMM[4], @XMM[4], @XMM[8]
				884	veor @XMM[6], @XMM[6], @XMM[8]
				885	veor @XMM[3], @XMM[3], @XMM[8]
				886	veor @XMM[7], @XMM[7], @XMM[8]
				887	veor @XMM[2], @XMM[2], @XMM[8]
				888	veor @XMM[5], @XMM[5], @XMM[8]
				889	veor @XMM[0], @XMM[0], @XMM[8]
				890	veor @XMM[1], @XMM[1], @XMM[8]
				891	bx lr
				892	.size _bsaes_encrypt8,.-_bsaes_encrypt8
				893	___
				894	}
				895	{
				896	my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
				897
				898	sub bitslice_key {
					# Variant of bitslice in which several swapmove steps are replaced by
					# plain register copies (the replaced calls are kept as comments next
					# to the vmov lines).  No caller appears in the visible part of the
					# file.
					# Arguments are 13 register names: @_[0..7] are taken in REVERSE order
					# as @x; @_[8..10] are the mask registers for the shift-by-1, -2 and -4
					# stages; @_[11..12] are temporaries.  Unlike bitslice, this sub emits
					# no code to load the masks, so the caller has to supply them loaded.
				899	my @x=reverse(@_[0..7]);
				900	my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
				901
				902	&swapmove (@x[0,1],1,$bs0,$t2,$t3);
				903	$code.=<<___;
				904	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
				905	vmov @x[2], @x[0]
				906	vmov @x[3], @x[1]
				907	___
				908	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
				909
				910	&swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
				911	$code.=<<___;
				912	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
				913	vmov @x[4], @x[0]
				914	vmov @x[6], @x[2]
				915	vmov @x[5], @x[1]
				916	vmov @x[7], @x[3]
				917	___
				918	&swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
				919	&swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
				920	}
				921
					# Emits _bsaes_key_convert.  Register roles in this scope: $inp is r4
					# (source key), $out is r12 (destination), $rounds is r5, $const is r6.
					# As emitted: the round-0 key is stored to [$out] as loaded (after the
					# __ARMEL__ byte reversal); every later round key inside the loop is
					# permuted through the .LM0 table and expanded with vtst.8 against the
					# single-bit masks 0x01..0x80 into eight q registers, four of which
					# (q0, q1, q5, q6) are inverted, and all eight are stored.  On return
					# q7 holds the constant 0x63 in every byte and q15 holds the last
					# round key loaded, which is NOT stored here.
				922	$code.=<<___;
				923	.type _bsaes_key_convert,%function
				924	.align 4
				925	_bsaes_key_convert:
				926	adr $const,_bsaes_key_convert
				927	vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
				928	sub $const,$const,#_bsaes_key_convert-.LM0
				929	vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
				930
				931	vmov.i8 @XMM[8], #0x01 @ bit masks
				932	vmov.i8 @XMM[9], #0x02
				933	vmov.i8 @XMM[10], #0x04
				934	vmov.i8 @XMM[11], #0x08
				935	vmov.i8 @XMM[12], #0x10
				936	vmov.i8 @XMM[13], #0x20
				937	vldmia $const, {@XMM[14]} @ .LM0
				938
				939	#ifdef __ARMEL__
				940	vrev32.8 @XMM[7], @XMM[7]
				941	vrev32.8 @XMM[15], @XMM[15]
				942	#endif
				943	sub $rounds,$rounds,#1
				944	vstmia $out!, {@XMM[7]} @ save round 0 key
				945	b .Lkey_loop
				946
				947	.align 4
				948	.Lkey_loop:
				949	vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
				950	vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
				951	vmov.i8 @XMM[6], #0x40
				952	vmov.i8 @XMM[15], #0x80
				953
				954	vtst.8 @XMM[0], @XMM[7], @XMM[8]
				955	vtst.8 @XMM[1], @XMM[7], @XMM[9]
				956	vtst.8 @XMM[2], @XMM[7], @XMM[10]
				957	vtst.8 @XMM[3], @XMM[7], @XMM[11]
				958	vtst.8 @XMM[4], @XMM[7], @XMM[12]
				959	vtst.8 @XMM[5], @XMM[7], @XMM[13]
				960	vtst.8 @XMM[6], @XMM[7], @XMM[6]
				961	vtst.8 @XMM[7], @XMM[7], @XMM[15]
				962	vld1.8 {@XMM[15]}, [$inp]! @ load next round key
				963	vmvn @XMM[0], @XMM[0] @ "pnot"
				964	vmvn @XMM[1], @XMM[1]
				965	vmvn @XMM[5], @XMM[5]
				966	vmvn @XMM[6], @XMM[6]
				967	#ifdef __ARMEL__
				968	vrev32.8 @XMM[15], @XMM[15]
				969	#endif
				970	subs $rounds,$rounds,#1
				971	vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
				972	bne .Lkey_loop
				973
				974	vmov.i8 @XMM[7],#0x63 @ compose .L63
				975	@ don't save last round key
				976	bx lr
				977	.size _bsaes_key_convert,.-_bsaes_key_convert
				978	___
				979	}
				980
				981	if (0) { # following four functions are unsupported interface
				982	# used for benchmarking...
					# The condition is the constant 0, so nothing in this block is ever
					# appended to $code.
					#
					# What the disabled code would emit:
					#   bsaes_enc_key_convert - calls _bsaes_key_convert, then XORs the
					#       0x63 constant (@XMM[7]) into the last round key (@XMM[15])
					#       and stores the result at r12.
					#   bsaes_dec_key_convert - calls _bsaes_key_convert, stores the last
					#       round key unmodified at r12 and XORs the 0x63 constant into
					#       the round 0 key instead.
					#   bsaes_encrypt_128 / bsaes_decrypt_128 - loop over the input 0x80
					#       bytes at a time with a hard-coded round count of 10, calling
					#       _bsaes_encrypt8 / _bsaes_decrypt8. Results are stored in
					#       register order 0,1,4,6,3,7,2,5 (encrypt) and 0,1,6,4,2,7,3,5
					#       (decrypt).
					#
					# NOTE(review): the heredoc interpolates $inp, $out, $key and $len,
					# but no declaration of them is visible at this scope in this section
					# -- confirm where they come from before re-enabling this block.
				983	$code.=<<___;
				984	.globl bsaes_enc_key_convert
				985	.type bsaes_enc_key_convert,%function
				986	.align 4
				987	bsaes_enc_key_convert:
				988	stmdb sp!,{r4-r6,lr}
				989	vstmdb sp!,{d8-d15} @ ABI specification says so
				990
				991	ldr r5,[$inp,#240] @ pass rounds
				992	mov r4,$inp @ pass key
				993	mov r12,$out @ pass key schedule
				994	bl _bsaes_key_convert
				995	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
				996	vstmia r12, {@XMM[7]} @ save last round key
				997
				998	vldmia sp!,{d8-d15}
				999	ldmia sp!,{r4-r6,pc}
				1000	.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
				1001
				1002	.globl bsaes_encrypt_128
				1003	.type bsaes_encrypt_128,%function
				1004	.align 4
				1005	bsaes_encrypt_128:
				1006	stmdb sp!,{r4-r6,lr}
				1007	vstmdb sp!,{d8-d15} @ ABI specification says so
				1008	.Lenc128_loop:
				1009	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
				1010	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
				1011	mov r4,$key @ pass the key
				1012	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
				1013	mov r5,#10 @ pass rounds
				1014	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
				1015
				1016	bl _bsaes_encrypt8
				1017
				1018	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1019	vst1.8 {@XMM[4]}, [$out]!
				1020	vst1.8 {@XMM[6]}, [$out]!
				1021	vst1.8 {@XMM[3]}, [$out]!
				1022	vst1.8 {@XMM[7]}, [$out]!
				1023	vst1.8 {@XMM[2]}, [$out]!
				1024	subs $len,$len,#0x80
				1025	vst1.8 {@XMM[5]}, [$out]!
				1026	bhi .Lenc128_loop
				1027
				1028	vldmia sp!,{d8-d15}
				1029	ldmia sp!,{r4-r6,pc}
				1030	.size bsaes_encrypt_128,.-bsaes_encrypt_128
				1031
				1032	.globl bsaes_dec_key_convert
				1033	.type bsaes_dec_key_convert,%function
				1034	.align 4
				1035	bsaes_dec_key_convert:
				1036	stmdb sp!,{r4-r6,lr}
				1037	vstmdb sp!,{d8-d15} @ ABI specification says so
				1038
				1039	ldr r5,[$inp,#240] @ pass rounds
				1040	mov r4,$inp @ pass key
				1041	mov r12,$out @ pass key schedule
				1042	bl _bsaes_key_convert
				1043	vldmia $out, {@XMM[6]}
				1044	vstmia r12, {@XMM[15]} @ save last round key
				1045	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				1046	vstmia $out, {@XMM[7]}
				1047
				1048	vldmia sp!,{d8-d15}
				1049	ldmia sp!,{r4-r6,pc}
				1050	.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
				1051
				1052	.globl bsaes_decrypt_128
				1053	.type bsaes_decrypt_128,%function
				1054	.align 4
				1055	bsaes_decrypt_128:
				1056	stmdb sp!,{r4-r6,lr}
				1057	vstmdb sp!,{d8-d15} @ ABI specification says so
				1058	.Ldec128_loop:
				1059	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
				1060	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
				1061	mov r4,$key @ pass the key
				1062	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
				1063	mov r5,#10 @ pass rounds
				1064	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
				1065
				1066	bl _bsaes_decrypt8
				1067
				1068	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1069	vst1.8 {@XMM[6]}, [$out]!
				1070	vst1.8 {@XMM[4]}, [$out]!
				1071	vst1.8 {@XMM[2]}, [$out]!
				1072	vst1.8 {@XMM[7]}, [$out]!
				1073	vst1.8 {@XMM[3]}, [$out]!
				1074	subs $len,$len,#0x80
				1075	vst1.8 {@XMM[5]}, [$out]!
				1076	bhi .Ldec128_loop
				1077
				1078	vldmia sp!,{d8-d15}
				1079	ldmia sp!,{r4-r6,pc}
				1080	.size bsaes_decrypt_128,.-bsaes_decrypt_128
				1081	___
				1082	}
				1083	{
				1084	my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
				1085	my ($keysched)=("sp");
					# bsaes_cbc_encrypt -- despite the name, only the decrypt direction
					# is implemented here; the assembly notes that the caller must make
					# sure it is called with enc == 0.
					#
					# Registers: r0 = input, r1 = output, r2 = length in bytes (shifted
					# right by 4 to become a count of 16-byte blocks), r3 = key. The IV
					# pointer is read from the first stack slot into r8 ($ivp). r9 ($fp)
					# marks a 16-byte scratch area that carries the IV across calls, and
					# r10 ($rounds) is loaded from offset 240 of the key.
					#
					# Key schedule: without BSAES_ASM_EXTENDED_KEY it is built on the
					# stack below $fp by _bsaes_key_convert and zeroed again at
					# .Lcbc_dec_done. With BSAES_ASM_EXTENDED_KEY it lives at key+248,
					# and the word at key+244 decides whether the conversion runs (it is
					# skipped when that word XOR 1 is zero).
					#
					# Data flow: full groups of eight blocks go through _bsaes_decrypt8
					# in .Lcbc_dec_loop; a tail of two to seven blocks uses the
					# .Lcbc_dec_two .. .Lcbc_dec_six labels or the fall-through
					# seven-block path; a single trailing block is handed to AES_decrypt.
					# Decrypted blocks are XORed and stored in register order
					# 0,1,6,4,2,7,3,5. The last ciphertext block is kept in @XMM[15] and
					# written back through $ivp on exit.
					#
					# When __KERNEL__ is not defined, lengths below 128 bytes are
					# forwarded to AES_cbc_encrypt before any of the above happens.
				1086
				1087	$code.=<<___;
				1088	.extern AES_cbc_encrypt
				1089	.extern AES_decrypt
				1090
				1091	.global bsaes_cbc_encrypt
				1092	.type bsaes_cbc_encrypt,%function
				1093	.align 5
				1094	bsaes_cbc_encrypt:
				1095	#ifndef __KERNEL__
				1096	cmp $len, #128
				1097	#ifndef __thumb__
				1098	blo AES_cbc_encrypt
				1099	#else
				1100	bhs 1f
				1101	b AES_cbc_encrypt
				1102	1:
				1103	#endif
				1104	#endif
				1105
				1106	@ it is up to the caller to make sure we are called with enc == 0
				1107
				1108	mov ip, sp
				1109	stmdb sp!, {r4-r10, lr}
				1110	VFP_ABI_PUSH
				1111	ldr $ivp, [ip] @ IV is 1st arg on the stack
				1112	mov $len, $len, lsr#4 @ len in 16 byte blocks
				1113	sub sp, #0x10 @ scratch space to carry over the IV
				1114	mov $fp, sp @ save sp
				1115
				1116	ldr $rounds, [$key, #240] @ get # of rounds
				1117	#ifndef BSAES_ASM_EXTENDED_KEY
				1118	@ allocate the key schedule on the stack
				1119	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				1120	add r12, #`128-32` @ sifze of bit-slices key schedule
				1121
				1122	@ populate the key schedule
				1123	mov r4, $key @ pass key
				1124	mov r5, $rounds @ pass # of rounds
				1125	mov sp, r12 @ sp is $keysched
				1126	bl _bsaes_key_convert
				1127	vldmia $keysched, {@XMM[6]}
				1128	vstmia r12, {@XMM[15]} @ save last round key
				1129	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				1130	vstmia $keysched, {@XMM[7]}
				1131	#else
				1132	ldr r12, [$key, #244]
				1133	eors r12, #1
				1134	beq 0f
				1135
				1136	@ populate the key schedule
				1137	str r12, [$key, #244]
				1138	mov r4, $key @ pass key
				1139	mov r5, $rounds @ pass # of rounds
				1140	add r12, $key, #248 @ pass key schedule
				1141	bl _bsaes_key_convert
				1142	add r4, $key, #248
				1143	vldmia r4, {@XMM[6]}
				1144	vstmia r12, {@XMM[15]} @ save last round key
				1145	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				1146	vstmia r4, {@XMM[7]}
				1147
				1148	.align 2
				1149	0:
				1150	#endif
				1151
				1152	vld1.8 {@XMM[15]}, [$ivp] @ load IV
				1153	b .Lcbc_dec_loop
				1154
				1155	.align 4
				1156	.Lcbc_dec_loop:
				1157	subs $len, $len, #0x8
				1158	bmi .Lcbc_dec_loop_finish
				1159
				1160	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
				1161	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
				1162	#ifndef BSAES_ASM_EXTENDED_KEY
				1163	mov r4, $keysched @ pass the key
				1164	#else
				1165	add r4, $key, #248
				1166	#endif
				1167	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
				1168	mov r5, $rounds
				1169	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
				1170	sub $inp, $inp, #0x60
				1171	vstmia $fp, {@XMM[15]} @ put aside IV
				1172
				1173	bl _bsaes_decrypt8
				1174
				1175	vldmia $fp, {@XMM[14]} @ reload IV
				1176	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1177	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1178	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1179	veor @XMM[1], @XMM[1], @XMM[8]
				1180	veor @XMM[6], @XMM[6], @XMM[9]
				1181	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
				1182	veor @XMM[4], @XMM[4], @XMM[10]
				1183	veor @XMM[2], @XMM[2], @XMM[11]
				1184	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
				1185	veor @XMM[7], @XMM[7], @XMM[12]
				1186	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1187	veor @XMM[3], @XMM[3], @XMM[13]
				1188	vst1.8 {@XMM[6]}, [$out]!
				1189	veor @XMM[5], @XMM[5], @XMM[14]
				1190	vst1.8 {@XMM[4]}, [$out]!
				1191	vst1.8 {@XMM[2]}, [$out]!
				1192	vst1.8 {@XMM[7]}, [$out]!
				1193	vst1.8 {@XMM[3]}, [$out]!
				1194	vst1.8 {@XMM[5]}, [$out]!
				1195
				1196	b .Lcbc_dec_loop
				1197
				1198	.Lcbc_dec_loop_finish:
				1199	adds $len, $len, #8
				1200	beq .Lcbc_dec_done
				1201
				1202	vld1.8 {@XMM[0]}, [$inp]! @ load input
				1203	cmp $len, #2
				1204	blo .Lcbc_dec_one
				1205	vld1.8 {@XMM[1]}, [$inp]!
				1206	#ifndef BSAES_ASM_EXTENDED_KEY
				1207	mov r4, $keysched @ pass the key
				1208	#else
				1209	add r4, $key, #248
				1210	#endif
				1211	mov r5, $rounds
				1212	vstmia $fp, {@XMM[15]} @ put aside IV
				1213	beq .Lcbc_dec_two
				1214	vld1.8 {@XMM[2]}, [$inp]!
				1215	cmp $len, #4
				1216	blo .Lcbc_dec_three
				1217	vld1.8 {@XMM[3]}, [$inp]!
				1218	beq .Lcbc_dec_four
				1219	vld1.8 {@XMM[4]}, [$inp]!
				1220	cmp $len, #6
				1221	blo .Lcbc_dec_five
				1222	vld1.8 {@XMM[5]}, [$inp]!
				1223	beq .Lcbc_dec_six
				1224	vld1.8 {@XMM[6]}, [$inp]!
				1225	sub $inp, $inp, #0x70
				1226
				1227	bl _bsaes_decrypt8
				1228
				1229	vldmia $fp, {@XMM[14]} @ reload IV
				1230	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1231	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1232	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1233	veor @XMM[1], @XMM[1], @XMM[8]
				1234	veor @XMM[6], @XMM[6], @XMM[9]
				1235	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
				1236	veor @XMM[4], @XMM[4], @XMM[10]
				1237	veor @XMM[2], @XMM[2], @XMM[11]
				1238	vld1.8 {@XMM[15]}, [$inp]!
				1239	veor @XMM[7], @XMM[7], @XMM[12]
				1240	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1241	veor @XMM[3], @XMM[3], @XMM[13]
				1242	vst1.8 {@XMM[6]}, [$out]!
				1243	vst1.8 {@XMM[4]}, [$out]!
				1244	vst1.8 {@XMM[2]}, [$out]!
				1245	vst1.8 {@XMM[7]}, [$out]!
				1246	vst1.8 {@XMM[3]}, [$out]!
				1247	b .Lcbc_dec_done
				1248	.align 4
				1249	.Lcbc_dec_six:
				1250	sub $inp, $inp, #0x60
				1251	bl _bsaes_decrypt8
				1252	vldmia $fp,{@XMM[14]} @ reload IV
				1253	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1254	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1255	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1256	veor @XMM[1], @XMM[1], @XMM[8]
				1257	veor @XMM[6], @XMM[6], @XMM[9]
				1258	vld1.8 {@XMM[12]}, [$inp]!
				1259	veor @XMM[4], @XMM[4], @XMM[10]
				1260	veor @XMM[2], @XMM[2], @XMM[11]
				1261	vld1.8 {@XMM[15]}, [$inp]!
				1262	veor @XMM[7], @XMM[7], @XMM[12]
				1263	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1264	vst1.8 {@XMM[6]}, [$out]!
				1265	vst1.8 {@XMM[4]}, [$out]!
				1266	vst1.8 {@XMM[2]}, [$out]!
				1267	vst1.8 {@XMM[7]}, [$out]!
				1268	b .Lcbc_dec_done
				1269	.align 4
				1270	.Lcbc_dec_five:
				1271	sub $inp, $inp, #0x50
				1272	bl _bsaes_decrypt8
				1273	vldmia $fp, {@XMM[14]} @ reload IV
				1274	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1275	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1276	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1277	veor @XMM[1], @XMM[1], @XMM[8]
				1278	veor @XMM[6], @XMM[6], @XMM[9]
				1279	vld1.8 {@XMM[15]}, [$inp]!
				1280	veor @XMM[4], @XMM[4], @XMM[10]
				1281	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1282	veor @XMM[2], @XMM[2], @XMM[11]
				1283	vst1.8 {@XMM[6]}, [$out]!
				1284	vst1.8 {@XMM[4]}, [$out]!
				1285	vst1.8 {@XMM[2]}, [$out]!
				1286	b .Lcbc_dec_done
				1287	.align 4
				1288	.Lcbc_dec_four:
				1289	sub $inp, $inp, #0x40
				1290	bl _bsaes_decrypt8
				1291	vldmia $fp, {@XMM[14]} @ reload IV
				1292	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1293	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1294	vld1.8 {@XMM[10]}, [$inp]!
				1295	veor @XMM[1], @XMM[1], @XMM[8]
				1296	veor @XMM[6], @XMM[6], @XMM[9]
				1297	vld1.8 {@XMM[15]}, [$inp]!
				1298	veor @XMM[4], @XMM[4], @XMM[10]
				1299	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1300	vst1.8 {@XMM[6]}, [$out]!
				1301	vst1.8 {@XMM[4]}, [$out]!
				1302	b .Lcbc_dec_done
				1303	.align 4
				1304	.Lcbc_dec_three:
				1305	sub $inp, $inp, #0x30
				1306	bl _bsaes_decrypt8
				1307	vldmia $fp, {@XMM[14]} @ reload IV
				1308	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1309	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1310	vld1.8 {@XMM[15]}, [$inp]!
				1311	veor @XMM[1], @XMM[1], @XMM[8]
				1312	veor @XMM[6], @XMM[6], @XMM[9]
				1313	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1314	vst1.8 {@XMM[6]}, [$out]!
				1315	b .Lcbc_dec_done
				1316	.align 4
				1317	.Lcbc_dec_two:
				1318	sub $inp, $inp, #0x20
				1319	bl _bsaes_decrypt8
				1320	vldmia $fp, {@XMM[14]} @ reload IV
				1321	vld1.8 {@XMM[8]}, [$inp]! @ reload input
				1322	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1323	vld1.8 {@XMM[15]}, [$inp]! @ reload input
				1324	veor @XMM[1], @XMM[1], @XMM[8]
				1325	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1326	b .Lcbc_dec_done
				1327	.align 4
				1328	.Lcbc_dec_one:
				1329	sub $inp, $inp, #0x10
				1330	mov $rounds, $out @ save original out pointer
				1331	mov $out, $fp @ use the iv scratch space as out buffer
				1332	mov r2, $key
				1333	vmov @XMM[4],@XMM[15] @ just in case ensure that IV
				1334	vmov @XMM[5],@XMM[0] @ and input are preserved
				1335	bl AES_decrypt
				1336	vld1.8 {@XMM[0]}, [$fp,:64] @ load result
				1337	veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
				1338	vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
				1339	vst1.8 {@XMM[0]}, [$rounds] @ write output
				1340
				1341	.Lcbc_dec_done:
				1342	#ifndef BSAES_ASM_EXTENDED_KEY
				1343	vmov.i32 q0, #0
				1344	vmov.i32 q1, #0
				1345	.Lcbc_dec_bzero: @ wipe key schedule [if any]
				1346	vstmia $keysched!, {q0-q1}
				1347	cmp $keysched, $fp
				1348	bne .Lcbc_dec_bzero
				1349	#endif
				1350
				1351	mov sp, $fp
				1352	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
				1353	vst1.8 {@XMM[15]}, [$ivp] @ return IV
				1354	VFP_ABI_POP
				1355	ldmia sp!, {r4-r10, pc}
				1356	.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
				1357	___
				1358	}
				1359	{
				1360	my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
				1361	my $const = "r6"; # shared with _bsaes_encrypt8_alt
				1362	my $keysched = "sp";
					# bsaes_ctr32_encrypt_blocks: r0 = input, r1 = output, r2 = length,
					# r3 = key; the counter pointer is read from the first stack slot.
					# The length is a count of 16-byte blocks: the main loop consumes
					# eight blocks and subtracts 8 per pass.
					#
					# Fewer than eight blocks (.Lctr_enc_short): the counter block is
					# copied to the stack and each block is produced by one AES_encrypt
					# call. The last 32-bit word of the counter is kept in r8
					# (byte-reversed when __ARMEL__ is defined), incremented by one per
					# block and written back into the stack copy at sp+0x1c.
					#
					# Eight or more blocks: the key is converted with _bsaes_key_convert
					# (on the stack, or at key+248 under BSAES_ASM_EXTENDED_KEY), and the
					# counter and the round 0 key are both passed through vrev32.8. Each
					# pass of .Lctr_enc_loop derives counters +1..+7 with vadd.u32, parks
					# the following counter at $fp and enters _bsaes_encrypt8_alt with
					# the round 0 key in @XMM[9] and .LREVM0SR in @XMM[8]. Keystream
					# blocks come back in register order 0,1,4,6,3,7,2,5, which is the
					# order used for the XORs and stores, including the partial tail in
					# .Lctr_enc_loop_done.
					#
					# $ctr (r8) is reused as a pointer to .LREVM0SR once the counter has
					# been loaded; no store back through the caller's counter pointer is
					# visible in this block.
				1363
				1364	$code.=<<___;
				1365	.extern AES_encrypt
				1366	.global bsaes_ctr32_encrypt_blocks
				1367	.type bsaes_ctr32_encrypt_blocks,%function
				1368	.align 5
				1369	bsaes_ctr32_encrypt_blocks:
				1370	cmp $len, #8 @ use plain AES for
				1371	blo .Lctr_enc_short @ small sizes
				1372
				1373	mov ip, sp
				1374	stmdb sp!, {r4-r10, lr}
				1375	VFP_ABI_PUSH
				1376	ldr $ctr, [ip] @ ctr is 1st arg on the stack
				1377	sub sp, sp, #0x10 @ scratch space to carry over the ctr
				1378	mov $fp, sp @ save sp
				1379
				1380	ldr $rounds, [$key, #240] @ get # of rounds
				1381	#ifndef BSAES_ASM_EXTENDED_KEY
				1382	@ allocate the key schedule on the stack
				1383	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				1384	add r12, #`128-32` @ size of bit-sliced key schedule
				1385
				1386	@ populate the key schedule
				1387	mov r4, $key @ pass key
				1388	mov r5, $rounds @ pass # of rounds
				1389	mov sp, r12 @ sp is $keysched
				1390	bl _bsaes_key_convert
				1391	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
				1392	vstmia r12, {@XMM[7]} @ save last round key
				1393
				1394	vld1.8 {@XMM[0]}, [$ctr] @ load counter
				1395	add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
				1396	vldmia $keysched, {@XMM[4]} @ load round0 key
				1397	#else
				1398	ldr r12, [$key, #244]
				1399	eors r12, #1
				1400	beq 0f
				1401
				1402	@ populate the key schedule
				1403	str r12, [$key, #244]
				1404	mov r4, $key @ pass key
				1405	mov r5, $rounds @ pass # of rounds
				1406	add r12, $key, #248 @ pass key schedule
				1407	bl _bsaes_key_convert
				1408	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
				1409	vstmia r12, {@XMM[7]} @ save last round key
				1410
				1411	.align 2
				1412	0: add r12, $key, #248
				1413	vld1.8 {@XMM[0]}, [$ctr] @ load counter
				1414	adrl $ctr, .LREVM0SR @ borrow $ctr
				1415	vldmia r12, {@XMM[4]} @ load round0 key
				1416	sub sp, #0x10 @ place for adjusted round0 key
				1417	#endif
				1418
				1419	vmov.i32 @XMM[8],#1 @ compose 1<<96
				1420	veor @XMM[9],@XMM[9],@XMM[9]
				1421	vrev32.8 @XMM[0],@XMM[0]
				1422	vext.8 @XMM[8],@XMM[9],@XMM[8],#4
				1423	vrev32.8 @XMM[4],@XMM[4]
				1424	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
				1425	vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
				1426	b .Lctr_enc_loop
				1427
				1428	.align 4
				1429	.Lctr_enc_loop:
				1430	vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
				1431	vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
				1432	vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
				1433	vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
				1434	vadd.u32 @XMM[4], @XMM[1], @XMM[10]
				1435	vadd.u32 @XMM[5], @XMM[2], @XMM[10]
				1436	vadd.u32 @XMM[6], @XMM[3], @XMM[10]
				1437	vadd.u32 @XMM[7], @XMM[4], @XMM[10]
				1438	vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
				1439
				1440	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
				1441	@ to flip byte order in 32-bit counter
				1442
				1443	vldmia $keysched, {@XMM[9]} @ load round0 key
				1444	#ifndef BSAES_ASM_EXTENDED_KEY
				1445	add r4, $keysched, #0x10 @ pass next round key
				1446	#else
				1447	add r4, $key, #`248+16`
				1448	#endif
				1449	vldmia $ctr, {@XMM[8]} @ .LREVM0SR
				1450	mov r5, $rounds @ pass rounds
				1451	vstmia $fp, {@XMM[10]} @ save next counter
				1452	sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
				1453
				1454	bl _bsaes_encrypt8_alt
				1455
				1456	subs $len, $len, #8
				1457	blo .Lctr_enc_loop_done
				1458
				1459	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
				1460	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1461	veor @XMM[0], @XMM[8]
				1462	veor @XMM[1], @XMM[9]
				1463	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
				1464	veor @XMM[4], @XMM[10]
				1465	veor @XMM[6], @XMM[11]
				1466	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
				1467	veor @XMM[3], @XMM[12]
				1468	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1469	veor @XMM[7], @XMM[13]
				1470	veor @XMM[2], @XMM[14]
				1471	vst1.8 {@XMM[4]}, [$out]!
				1472	veor @XMM[5], @XMM[15]
				1473	vst1.8 {@XMM[6]}, [$out]!
				1474	vmov.i32 @XMM[8], #1 @ compose 1<<96
				1475	vst1.8 {@XMM[3]}, [$out]!
				1476	veor @XMM[9], @XMM[9], @XMM[9]
				1477	vst1.8 {@XMM[7]}, [$out]!
				1478	vext.8 @XMM[8], @XMM[9], @XMM[8], #4
				1479	vst1.8 {@XMM[2]}, [$out]!
				1480	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
				1481	vst1.8 {@XMM[5]}, [$out]!
				1482	vldmia $fp, {@XMM[0]} @ load counter
				1483
				1484	bne .Lctr_enc_loop
				1485	b .Lctr_enc_done
				1486
				1487	.align 4
				1488	.Lctr_enc_loop_done:
				1489	add $len, $len, #8
				1490	vld1.8 {@XMM[8]}, [$inp]! @ load input
				1491	veor @XMM[0], @XMM[8]
				1492	vst1.8 {@XMM[0]}, [$out]! @ write output
				1493	cmp $len, #2
				1494	blo .Lctr_enc_done
				1495	vld1.8 {@XMM[9]}, [$inp]!
				1496	veor @XMM[1], @XMM[9]
				1497	vst1.8 {@XMM[1]}, [$out]!
				1498	beq .Lctr_enc_done
				1499	vld1.8 {@XMM[10]}, [$inp]!
				1500	veor @XMM[4], @XMM[10]
				1501	vst1.8 {@XMM[4]}, [$out]!
				1502	cmp $len, #4
				1503	blo .Lctr_enc_done
				1504	vld1.8 {@XMM[11]}, [$inp]!
				1505	veor @XMM[6], @XMM[11]
				1506	vst1.8 {@XMM[6]}, [$out]!
				1507	beq .Lctr_enc_done
				1508	vld1.8 {@XMM[12]}, [$inp]!
				1509	veor @XMM[3], @XMM[12]
				1510	vst1.8 {@XMM[3]}, [$out]!
				1511	cmp $len, #6
				1512	blo .Lctr_enc_done
				1513	vld1.8 {@XMM[13]}, [$inp]!
				1514	veor @XMM[7], @XMM[13]
				1515	vst1.8 {@XMM[7]}, [$out]!
				1516	beq .Lctr_enc_done
				1517	vld1.8 {@XMM[14]}, [$inp]
				1518	veor @XMM[2], @XMM[14]
				1519	vst1.8 {@XMM[2]}, [$out]!
				1520
				1521	.Lctr_enc_done:
				1522	vmov.i32 q0, #0
				1523	vmov.i32 q1, #0
				1524	#ifndef BSAES_ASM_EXTENDED_KEY
				1525	.Lctr_enc_bzero: @ wipe key schedule [if any]
				1526	vstmia $keysched!, {q0-q1}
				1527	cmp $keysched, $fp
				1528	bne .Lctr_enc_bzero
				1529	#else
				1530	vstmia $keysched, {q0-q1}
				1531	#endif
				1532
				1533	mov sp, $fp
				1534	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
				1535	VFP_ABI_POP
				1536	ldmia sp!, {r4-r10, pc} @ return
				1537
				1538	.align 4
				1539	.Lctr_enc_short:
				1540	ldr ip, [sp] @ ctr pointer is passed on stack
				1541	stmdb sp!, {r4-r8, lr}
				1542
				1543	mov r4, $inp @ copy arguments
				1544	mov r5, $out
				1545	mov r6, $len
				1546	mov r7, $key
				1547	ldr r8, [ip, #12] @ load counter LSW
				1548	vld1.8 {@XMM[1]}, [ip] @ load whole counter value
				1549	#ifdef __ARMEL__
				1550	rev r8, r8
				1551	#endif
				1552	sub sp, sp, #0x10
				1553	vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
				1554	sub sp, sp, #0x10
				1555
				1556	.Lctr_enc_short_loop:
				1557	add r0, sp, #0x10 @ input counter value
				1558	mov r1, sp @ output on the stack
				1559	mov r2, r7 @ key
				1560
				1561	bl AES_encrypt
				1562
				1563	vld1.8 {@XMM[0]}, [r4]! @ load input
				1564	vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
				1565	add r8, r8, #1
				1566	#ifdef __ARMEL__
				1567	rev r0, r8
				1568	str r0, [sp, #0x1c] @ next counter value
				1569	#else
				1570	str r8, [sp, #0x1c] @ next counter value
				1571	#endif
				1572	veor @XMM[0],@XMM[0],@XMM[1]
				1573	vst1.8 {@XMM[0]}, [r5]! @ store output
				1574	subs r6, r6, #1
				1575	bne .Lctr_enc_short_loop
				1576
				1577	vmov.i32 q0, #0
				1578	vmov.i32 q1, #0
				1579	vstmia sp!, {q0-q1}
				1580
				1581	ldmia sp!, {r4-r8, pc}
				1582	.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
				1583	___
				1584	}
				1585	{
				1586	######################################################################
				1587	# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
				1588	# const AES_KEY *key1, const AES_KEY *key2,
				1589	# const unsigned char iv[16]);
				1590	#
				1591	my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
				1592	my $const="r6"; # returned by _bsaes_key_convert
				1593	my $twmask=@XMM[5];
				1594	my @T=@XMM[6..7];
				1595
				1596	$code.=<<___;
				1597	.globl bsaes_xts_encrypt
				1598	.type bsaes_xts_encrypt,%function
				1599	.align 4
				1600	bsaes_xts_encrypt:
				1601	mov ip, sp
				1602	stmdb sp!, {r4-r10, lr} @ 0x20
				1603	VFP_ABI_PUSH
				1604	mov r6, sp @ future $fp
				1605
				1606	mov $inp, r0
				1607	mov $out, r1
				1608	mov $len, r2
				1609	mov $key, r3
				1610
				1611	sub r0, sp, #0x10 @ 0x10
				1612	bic r0, #0xf @ align at 16 bytes
				1613	mov sp, r0
				1614
				1615	#ifdef XTS_CHAIN_TWEAK
				1616	ldr r0, [ip] @ pointer to input tweak
				1617	#else
				1618	@ generate initial tweak
				1619	ldr r0, [ip, #4] @ iv[]
				1620	mov r1, sp
				1621	ldr r2, [ip, #0] @ key2
				1622	bl AES_encrypt
				1623	mov r0,sp @ pointer to initial tweak
				1624	#endif
				1625
				1626	ldr $rounds, [$key, #240] @ get # of rounds
				1627	mov $fp, r6
				1628	#ifndef BSAES_ASM_EXTENDED_KEY
				1629	@ allocate the key schedule on the stack
				1630	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				1631	@ add r12, #`128-32` @ size of bit-sliced key schedule
				1632	sub r12, #`32+16` @ place for tweak[9]
				1633
				1634	@ populate the key schedule
				1635	mov r4, $key @ pass key
				1636	mov r5, $rounds @ pass # of rounds
				1637	mov sp, r12
				1638	add r12, #0x90 @ pass key schedule
				1639	bl _bsaes_key_convert
				1640	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
				1641	vstmia r12, {@XMM[7]} @ save last round key
				1642	#else
				1643	ldr r12, [$key, #244]
				1644	eors r12, #1
				1645	beq 0f
				1646
				1647	str r12, [$key, #244]
				1648	mov r4, $key @ pass key
				1649	mov r5, $rounds @ pass # of rounds
				1650	add r12, $key, #248 @ pass key schedule
				1651	bl _bsaes_key_convert
				1652	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
				1653	vstmia r12, {@XMM[7]}
				1654
				1655	.align 2
				1656	0: sub sp, #0x90 @ place for tweak[9]
				1657	#endif
				1658
				1659	vld1.8 {@XMM[8]}, [r0] @ initial tweak
				1660	adr $magic, .Lxts_magic
				1661
				1662	subs $len, #0x80
				1663	blo .Lxts_enc_short
				1664	b .Lxts_enc_loop
				1665
				1666	.align 4
				1667	.Lxts_enc_loop:
				1668	vldmia $magic, {$twmask} @ load XTS magic
				1669	vshr.s64 @T[0], @XMM[8], #63
				1670	mov r0, sp
				1671	vand @T[0], @T[0], $twmask
				1672	___
				1673	for($i=9;$i<16;$i++) {
				1674	$code.=<<___;
				1675	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
				1676	vst1.64 {@XMM[$i-1]}, [r0,:128]!
				1677	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				1678	vshr.s64 @T[1], @XMM[$i], #63
				1679	veor @XMM[$i], @XMM[$i], @T[0]
				1680	vand @T[1], @T[1], $twmask
				1681	___
				1682	@T=reverse(@T);
				1683
				1684	$code.=<<___ if ($i>=10);
				1685	vld1.8 {@XMM[$i-10]}, [$inp]!
				1686	___
				1687	$code.=<<___ if ($i>=11);
				1688	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
				1689	___
				1690	}
				1691	$code.=<<___;
				1692	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
				1693	vst1.64 {@XMM[15]}, [r0,:128]!
				1694	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				1695	veor @XMM[8], @XMM[8], @T[0]
				1696	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1697
				1698	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
				1699	veor @XMM[5], @XMM[5], @XMM[13]
				1700	#ifndef BSAES_ASM_EXTENDED_KEY
				1701	add r4, sp, #0x90 @ pass key schedule
				1702	#else
				1703	add r4, $key, #248 @ pass key schedule
				1704	#endif
				1705	veor @XMM[6], @XMM[6], @XMM[14]
				1706	mov r5, $rounds @ pass rounds
				1707	veor @XMM[7], @XMM[7], @XMM[15]
				1708	mov r0, sp
				1709
				1710	bl _bsaes_encrypt8
				1711
				1712	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1713	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1714	veor @XMM[0], @XMM[0], @XMM[ 8]
				1715	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				1716	veor @XMM[1], @XMM[1], @XMM[ 9]
				1717	veor @XMM[8], @XMM[4], @XMM[10]
				1718	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1719	veor @XMM[9], @XMM[6], @XMM[11]
				1720	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
				1721	veor @XMM[10], @XMM[3], @XMM[12]
				1722	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1723	veor @XMM[11], @XMM[7], @XMM[13]
				1724	veor @XMM[12], @XMM[2], @XMM[14]
				1725	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				1726	veor @XMM[13], @XMM[5], @XMM[15]
				1727	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
				1728
				1729	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1730
				1731	subs $len, #0x80
				1732	bpl .Lxts_enc_loop
				1733
				1734	.Lxts_enc_short:
				1735	adds $len, #0x70
				1736	bmi .Lxts_enc_done
				1737
				1738	vldmia $magic, {$twmask} @ load XTS magic
				1739	vshr.s64 @T[0], @XMM[8], #63
				1740	mov r0, sp
				1741	vand @T[0], @T[0], $twmask
				1742	___
				1743	for($i=9;$i<16;$i++) {
				1744	$code.=<<___;
				1745	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
				1746	vst1.64 {@XMM[$i-1]}, [r0,:128]!
				1747	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				1748	vshr.s64 @T[1], @XMM[$i], #63
				1749	veor @XMM[$i], @XMM[$i], @T[0]
				1750	vand @T[1], @T[1], $twmask
				1751	___
				1752	@T=reverse(@T);
				1753
				1754	$code.=<<___ if ($i>=10);
				1755	vld1.8 {@XMM[$i-10]}, [$inp]!
				1756	subs $len, #0x10
				1757	bmi .Lxts_enc_`$i-9`
				1758	___
				1759	$code.=<<___ if ($i>=11);
				1760	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
				1761	___
				1762	}
				1763	$code.=<<___;
				1764	sub $len, #0x10
				1765	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
				1766
				1767	vld1.8 {@XMM[6]}, [$inp]!
				1768	veor @XMM[5], @XMM[5], @XMM[13]
				1769	#ifndef BSAES_ASM_EXTENDED_KEY
				1770	add r4, sp, #0x90 @ pass key schedule
				1771	#else
				1772	add r4, $key, #248 @ pass key schedule
				1773	#endif
				1774	veor @XMM[6], @XMM[6], @XMM[14]
				1775	mov r5, $rounds @ pass rounds
				1776	mov r0, sp
				1777
				1778	bl _bsaes_encrypt8
				1779
				1780	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1781	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1782	veor @XMM[0], @XMM[0], @XMM[ 8]
				1783	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				1784	veor @XMM[1], @XMM[1], @XMM[ 9]
				1785	veor @XMM[8], @XMM[4], @XMM[10]
				1786	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1787	veor @XMM[9], @XMM[6], @XMM[11]
				1788	vld1.64 {@XMM[14]}, [r0,:128]!
				1789	veor @XMM[10], @XMM[3], @XMM[12]
				1790	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1791	veor @XMM[11], @XMM[7], @XMM[13]
				1792	veor @XMM[12], @XMM[2], @XMM[14]
				1793	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				1794	vst1.8 {@XMM[12]}, [$out]!
				1795
				1796	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1797	b .Lxts_enc_done
				1798	.align 4
				1799	.Lxts_enc_6:
				1800	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
				1801
				1802	veor @XMM[4], @XMM[4], @XMM[12]
				1803	#ifndef BSAES_ASM_EXTENDED_KEY
				1804	add r4, sp, #0x90 @ pass key schedule
				1805	#else
				1806	add r4, $key, #248 @ pass key schedule
				1807	#endif
				1808	veor @XMM[5], @XMM[5], @XMM[13]
				1809	mov r5, $rounds @ pass rounds
				1810	mov r0, sp
				1811
				1812	bl _bsaes_encrypt8
				1813
				1814	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1815	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1816	veor @XMM[0], @XMM[0], @XMM[ 8]
				1817	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				1818	veor @XMM[1], @XMM[1], @XMM[ 9]
				1819	veor @XMM[8], @XMM[4], @XMM[10]
				1820	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1821	veor @XMM[9], @XMM[6], @XMM[11]
				1822	veor @XMM[10], @XMM[3], @XMM[12]
				1823	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1824	veor @XMM[11], @XMM[7], @XMM[13]
				1825	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				1826
				1827	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1828	b .Lxts_enc_done
				1829
				1830	@ put this in range for both ARM and Thumb mode adr instructions
				1831	.align 5
				1832	.Lxts_magic:
				1833	.quad 1, 0x87
				1834
				1835	.align 5
				1836	.Lxts_enc_5:
				1837	vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
				1838
				1839	veor @XMM[3], @XMM[3], @XMM[11]
				1840	#ifndef BSAES_ASM_EXTENDED_KEY
				1841	add r4, sp, #0x90 @ pass key schedule
				1842	#else
				1843	add r4, $key, #248 @ pass key schedule
				1844	#endif
				1845	veor @XMM[4], @XMM[4], @XMM[12]
				1846	mov r5, $rounds @ pass rounds
				1847	mov r0, sp
				1848
				1849	bl _bsaes_encrypt8
				1850
				1851	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1852	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1853	veor @XMM[0], @XMM[0], @XMM[ 8]
				1854	vld1.64 {@XMM[12]}, [r0,:128]!
				1855	veor @XMM[1], @XMM[1], @XMM[ 9]
				1856	veor @XMM[8], @XMM[4], @XMM[10]
				1857	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1858	veor @XMM[9], @XMM[6], @XMM[11]
				1859	veor @XMM[10], @XMM[3], @XMM[12]
				1860	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1861	vst1.8 {@XMM[10]}, [$out]!
				1862
				1863	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1864	b .Lxts_enc_done
				1865	.align 4
				1866	.Lxts_enc_4:
				1867	vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
				1868
				1869	veor @XMM[2], @XMM[2], @XMM[10]
				1870	#ifndef BSAES_ASM_EXTENDED_KEY
				1871	add r4, sp, #0x90 @ pass key schedule
				1872	#else
				1873	add r4, $key, #248 @ pass key schedule
				1874	#endif
				1875	veor @XMM[3], @XMM[3], @XMM[11]
				1876	mov r5, $rounds @ pass rounds
				1877	mov r0, sp
				1878
				1879	bl _bsaes_encrypt8
				1880
				1881	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1882	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1883	veor @XMM[0], @XMM[0], @XMM[ 8]
				1884	veor @XMM[1], @XMM[1], @XMM[ 9]
				1885	veor @XMM[8], @XMM[4], @XMM[10]
				1886	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1887	veor @XMM[9], @XMM[6], @XMM[11]
				1888	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1889
				1890	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1891	b .Lxts_enc_done
				1892	.align 4
				1893	.Lxts_enc_3:
				1894	vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
				1895
				1896	veor @XMM[1], @XMM[1], @XMM[9]
				1897	#ifndef BSAES_ASM_EXTENDED_KEY
				1898	add r4, sp, #0x90 @ pass key schedule
				1899	#else
				1900	add r4, $key, #248 @ pass key schedule
				1901	#endif
				1902	veor @XMM[2], @XMM[2], @XMM[10]
				1903	mov r5, $rounds @ pass rounds
				1904	mov r0, sp
				1905
				1906	bl _bsaes_encrypt8
				1907
				1908	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				1909	vld1.64 {@XMM[10]}, [r0,:128]!
				1910	veor @XMM[0], @XMM[0], @XMM[ 8]
				1911	veor @XMM[1], @XMM[1], @XMM[ 9]
				1912	veor @XMM[8], @XMM[4], @XMM[10]
				1913	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1914	vst1.8 {@XMM[8]}, [$out]!
				1915
				1916	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1917	b .Lxts_enc_done
				1918	.align 4
				1919	.Lxts_enc_2:
				1920	vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
				1921
				1922	veor @XMM[0], @XMM[0], @XMM[8]
				1923	#ifndef BSAES_ASM_EXTENDED_KEY
				1924	add r4, sp, #0x90 @ pass key schedule
				1925	#else
				1926	add r4, $key, #248 @ pass key schedule
				1927	#endif
				1928	veor @XMM[1], @XMM[1], @XMM[9]
				1929	mov r5, $rounds @ pass rounds
				1930	mov r0, sp
				1931
				1932	bl _bsaes_encrypt8
				1933
				1934	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				1935	veor @XMM[0], @XMM[0], @XMM[ 8]
				1936	veor @XMM[1], @XMM[1], @XMM[ 9]
				1937	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1938
				1939	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1940	b .Lxts_enc_done
				1941	.align 4
				1942	.Lxts_enc_1:
				1943	mov r0, sp
				1944	veor @XMM[0], @XMM[8]
				1945	mov r1, sp
				1946	vst1.8 {@XMM[0]}, [sp,:128]
				1947	mov r2, $key
				1948	mov r4, $fp @ preserve fp
				1949
				1950	bl AES_encrypt
				1951
				1952	vld1.8 {@XMM[0]}, [sp,:128]
				1953	veor @XMM[0], @XMM[0], @XMM[8]
				1954	vst1.8 {@XMM[0]}, [$out]!
				1955	mov $fp, r4
				1956
				1957	vmov @XMM[8], @XMM[9] @ next round tweak
				1958
				1959	.Lxts_enc_done:
				1960	#ifndef XTS_CHAIN_TWEAK
				1961	adds $len, #0x10
				1962	beq .Lxts_enc_ret
				1963	sub r6, $out, #0x10
				1964
				1965	.Lxts_enc_steal:
				1966	ldrb r0, [$inp], #1
				1967	ldrb r1, [$out, #-0x10]
				1968	strb r0, [$out, #-0x10]
				1969	strb r1, [$out], #1
				1970
				1971	subs $len, #1
				1972	bhi .Lxts_enc_steal
				1973
				1974	vld1.8 {@XMM[0]}, [r6]
				1975	mov r0, sp
				1976	veor @XMM[0], @XMM[0], @XMM[8]
				1977	mov r1, sp
				1978	vst1.8 {@XMM[0]}, [sp,:128]
				1979	mov r2, $key
				1980	mov r4, $fp @ preserve fp
				1981
				1982	bl AES_encrypt
				1983
				1984	vld1.8 {@XMM[0]}, [sp,:128]
				1985	veor @XMM[0], @XMM[0], @XMM[8]
				1986	vst1.8 {@XMM[0]}, [r6]
				1987	mov $fp, r4
				1988	#endif
				1989
				1990	.Lxts_enc_ret:
				1991	bic r0, $fp, #0xf
				1992	vmov.i32 q0, #0
				1993	vmov.i32 q1, #0
				1994	#ifdef XTS_CHAIN_TWEAK
				1995	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
				1996	#endif
				1997	.Lxts_enc_bzero: @ wipe key schedule [if any]
				1998	vstmia sp!, {q0-q1}
				1999	cmp sp, r0
				2000	bne .Lxts_enc_bzero
				2001
				2002	mov sp, $fp
				2003	#ifdef XTS_CHAIN_TWEAK
				2004	vst1.8 {@XMM[8]}, [r1]
				2005	#endif
				2006	VFP_ABI_POP
				2007	ldmia sp!, {r4-r10, pc} @ return
				2008
				2009	.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
				2010
				2011	.globl bsaes_xts_decrypt
				2012	.type bsaes_xts_decrypt,%function
				2013	.align 4
					@ bsaes_xts_decrypt: XTS decryption. Full groups of eight blocks and
					@ shorter runs of two to seven blocks go through _bsaes_decrypt8. A
					@ single leftover block and the ciphertext stealing tail go through
					@ AES_decrypt.
					@ In: r0 = input, r1 = output, r2 = length in bytes, r3 = key. Further
					@ arguments are read from the stack through ip: key2 and iv[] in the
					@ default build, or a tweak pointer in the chained tweak build, where
					@ the final tweak is also written back through that pointer.
					@ The stack area below fp is zeroed before returning.
				2014	bsaes_xts_decrypt:
				2015	mov ip, sp @ ip = sp at entry, used below to load stack arguments
				2016	stmdb sp!, {r4-r10, lr} @ 0x20
				2017	VFP_ABI_PUSH
				2018	mov r6, sp @ future $fp
				2019
				2020	mov $inp, r0 @ move the four register arguments out of r0-r3
				2021	mov $out, r1
				2022	mov $len, r2
				2023	mov $key, r3
				2024
				2025	sub r0, sp, #0x10 @ 0x10
				2026	bic r0, #0xf @ align at 16 bytes
				2027	mov sp, r0
				2028
				2029	#ifdef XTS_CHAIN_TWEAK
				2030	ldr r0, [ip] @ pointer to input tweak
				2031	#else
				2032	@ generate initial tweak
				2033	ldr r0, [ip, #4] @ iv[]
				2034	mov r1, sp
				2035	ldr r2, [ip, #0] @ key2
				2036	bl AES_encrypt @ result lands in the 16 byte slot at sp
				2037	mov r0, sp @ pointer to initial tweak
				2038	#endif
				2039
				2040	ldr $rounds, [$key, #240] @ get # of rounds
				2041	mov $fp, r6
				2042	#ifndef BSAES_ASM_EXTENDED_KEY
				2043	@ allocate the key schedule on the stack
				2044	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				2045	@ add r12, #`128-32` @ size of bit-sliced key schedule
				2046	sub r12, #`32+16` @ place for tweak[9]
				2047
				2048	@ populate the key schedule
				2049	mov r4, $key @ pass key
				2050	mov r5, $rounds @ pass # of rounds
				2051	mov sp, r12
				2052	add r12, #0x90 @ pass key schedule
				2053	bl _bsaes_key_convert
				2054	add r4, sp, #0x90
				2055	vldmia r4, {@XMM[6]}
				2056	vstmia r12, {@XMM[15]} @ save last round key
				2057	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				2058	vstmia r4, {@XMM[7]}
				2059	#else
				2060	ldr r12, [$key, #244]
				2061	eors r12, #1 @ zero result means the word at key+244 was 1
				2062	beq 0f @ in that case skip the conversion below
				2063
				2064	str r12, [$key, #244] @ store the toggled word back
				2065	mov r4, $key @ pass key
				2066	mov r5, $rounds @ pass # of rounds
				2067	add r12, $key, #248 @ pass key schedule
				2068	bl _bsaes_key_convert
				2069	add r4, $key, #248
				2070	vldmia r4, {@XMM[6]}
				2071	vstmia r12, {@XMM[15]} @ save last round key
				2072	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				2073	vstmia r4, {@XMM[7]}
				2074
				2075	.align 2
				2076	0: sub sp, #0x90 @ place for tweak[9]
				2077	#endif
				2078	vld1.8 {@XMM[8]}, [r0] @ initial tweak
				2079	adr $magic, .Lxts_magic @ address of the 1, 0x87 constant pair
				2080
Ard Biesheuvel	001eabf	2015-02-26 07:22:05 +0000	[diff] [blame]	2081	#ifndef XTS_CHAIN_TWEAK
Ard Biesheuvel	e4e7f10	2013-09-16 18:31:38 +0200	[diff] [blame]	2082	tst $len, #0xf @ if not multiple of 16
				2083	it ne @ Thumb2 thing, sanity check in ARM
				2084	subne $len, #0x10 @ subtract another 16 bytes
Ard Biesheuvel	001eabf	2015-02-26 07:22:05 +0000	[diff] [blame]	2085	#endif
Ard Biesheuvel	e4e7f10	2013-09-16 18:31:38 +0200	[diff] [blame]	2086	subs $len, #0x80
				2087
				2088	blo .Lxts_dec_short @ fewer than 8 blocks (0x80 bytes) to process
				2089	b .Lxts_dec_loop
				2090
				2091	.align 4
				2092	.Lxts_dec_loop:
				2093	vldmia $magic, {$twmask} @ load XTS magic
				2094	vshr.s64 @T[0], @XMM[8], #63 @ all-ones in each 64-bit lane whose top bit is set
				2095	mov r0, sp
				2096	vand @T[0], @T[0], $twmask @ keep the magic bits for those lanes
				2097	___
				2098	for($i=9;$i<16;$i++) { # each pass derives tweak $i from tweak $i-1
				2099	$code.=<<___;
				2100	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] @ double each 64-bit lane
				2101	vst1.64 {@XMM[$i-1]}, [r0,:128]! @ save previous tweak for use after the call
				2102	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` @ swap mask halves so each lands on the other lane
				2103	vshr.s64 @T[1], @XMM[$i], #63
				2104	veor @XMM[$i], @XMM[$i], @T[0] @ fold the swapped mask in
				2105	vand @T[1], @T[1], $twmask
				2106	___
				2107	@T=reverse(@T); # swap roles: the mask just computed feeds the next pass
				2108
				2109	$code.=<<___ if ($i>=10);
				2110	vld1.8 {@XMM[$i-10]}, [$inp]! @ load next input block
				2111	___
				2112	$code.=<<___ if ($i>=11);
				2113	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] @ xor an earlier block with its tweak
				2114	___
				2115	}
				2116	$code.=<<___;
				2117	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
				2118	vst1.64 {@XMM[15]}, [r0,:128]!
				2119	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				2120	veor @XMM[8], @XMM[8], @T[0]
				2121	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2122
				2123	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! @ last two input blocks of this batch
				2124	veor @XMM[5], @XMM[5], @XMM[13]
				2125	#ifndef BSAES_ASM_EXTENDED_KEY
				2126	add r4, sp, #0x90 @ pass key schedule
				2127	#else
				2128	add r4, $key, #248 @ pass key schedule
				2129	#endif
				2130	veor @XMM[6], @XMM[6], @XMM[14]
				2131	mov r5, $rounds @ pass rounds
				2132	veor @XMM[7], @XMM[7], @XMM[15]
				2133	mov r0, sp
				2134
				2135	bl _bsaes_decrypt8
				2136
				2137	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! @ reload the tweaks saved at sp
				2138	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2139	veor @XMM[0], @XMM[0], @XMM[ 8] @ results are consumed in register order 0,1,6,4,2,7,3,5
				2140	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				2141	veor @XMM[1], @XMM[1], @XMM[ 9]
				2142	veor @XMM[8], @XMM[6], @XMM[10]
				2143	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2144	veor @XMM[9], @XMM[4], @XMM[11]
				2145	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
				2146	veor @XMM[10], @XMM[2], @XMM[12]
				2147	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2148	veor @XMM[11], @XMM[7], @XMM[13]
				2149	veor @XMM[12], @XMM[3], @XMM[14]
				2150	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				2151	veor @XMM[13], @XMM[5], @XMM[15]
				2152	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
				2153
				2154	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2155
				2156	subs $len, #0x80
				2157	bpl .Lxts_dec_loop @ loop while 8 more blocks remain
				2158
				2159	.Lxts_dec_short:
				2160	adds $len, #0x70 @ len is now 0x10 less than the bytes still to process here
				2161	bmi .Lxts_dec_done @ negative: no full block to process here
				2162
				2163	vldmia $magic, {$twmask} @ load XTS magic
				2164	vshr.s64 @T[0], @XMM[8], #63
				2165	mov r0, sp
				2166	vand @T[0], @T[0], $twmask
				2167	___
				2168	for($i=9;$i<16;$i++) { # same tweak chain as the main loop, plus an exit test per block
				2169	$code.=<<___;
				2170	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
				2171	vst1.64 {@XMM[$i-1]}, [r0,:128]!
				2172	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				2173	vshr.s64 @T[1], @XMM[$i], #63
				2174	veor @XMM[$i], @XMM[$i], @T[0]
				2175	vand @T[1], @T[1], $twmask
				2176	___
				2177	@T=reverse(@T);
				2178
				2179	$code.=<<___ if ($i>=10);
				2180	vld1.8 {@XMM[$i-10]}, [$inp]!
				2181	subs $len, #0x10
				2182	bmi .Lxts_dec_`$i-9` @ no further full block: handle the blocks loaded so far
				2183	___
				2184	$code.=<<___ if ($i>=11);
				2185	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
				2186	___
				2187	}
				2188	$code.=<<___;
				2189	sub $len, #0x10 @ fall through: seven blocks
				2190	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
				2191
				2192	vld1.8 {@XMM[6]}, [$inp]!
				2193	veor @XMM[5], @XMM[5], @XMM[13]
				2194	#ifndef BSAES_ASM_EXTENDED_KEY
				2195	add r4, sp, #0x90 @ pass key schedule
				2196	#else
				2197	add r4, $key, #248 @ pass key schedule
				2198	#endif
				2199	veor @XMM[6], @XMM[6], @XMM[14]
				2200	mov r5, $rounds @ pass rounds
				2201	mov r0, sp
				2202
				2203	bl _bsaes_decrypt8
				2204
				2205	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2206	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2207	veor @XMM[0], @XMM[0], @XMM[ 8]
				2208	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				2209	veor @XMM[1], @XMM[1], @XMM[ 9]
				2210	veor @XMM[8], @XMM[6], @XMM[10]
				2211	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2212	veor @XMM[9], @XMM[4], @XMM[11]
				2213	vld1.64 {@XMM[14]}, [r0,:128]!
				2214	veor @XMM[10], @XMM[2], @XMM[12]
				2215	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2216	veor @XMM[11], @XMM[7], @XMM[13]
				2217	veor @XMM[12], @XMM[3], @XMM[14]
				2218	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				2219	vst1.8 {@XMM[12]}, [$out]!
				2220
				2221	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2222	b .Lxts_dec_done
				2223	.align 4
				2224	.Lxts_dec_6: @ six blocks loaded
				2225	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
				2226
				2227	veor @XMM[4], @XMM[4], @XMM[12] @ the last two loaded blocks still need their tweaks
				2228	#ifndef BSAES_ASM_EXTENDED_KEY
				2229	add r4, sp, #0x90 @ pass key schedule
				2230	#else
				2231	add r4, $key, #248 @ pass key schedule
				2232	#endif
				2233	veor @XMM[5], @XMM[5], @XMM[13]
				2234	mov r5, $rounds @ pass rounds
				2235	mov r0, sp
				2236
				2237	bl _bsaes_decrypt8
				2238
				2239	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2240	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2241	veor @XMM[0], @XMM[0], @XMM[ 8]
				2242	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				2243	veor @XMM[1], @XMM[1], @XMM[ 9]
				2244	veor @XMM[8], @XMM[6], @XMM[10]
				2245	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2246	veor @XMM[9], @XMM[4], @XMM[11]
				2247	veor @XMM[10], @XMM[2], @XMM[12]
				2248	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2249	veor @XMM[11], @XMM[7], @XMM[13]
				2250	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				2251
				2252	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2253	b .Lxts_dec_done
				2254	.align 4
				2255	.Lxts_dec_5: @ five blocks loaded
				2256	vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
				2257
				2258	veor @XMM[3], @XMM[3], @XMM[11]
				2259	#ifndef BSAES_ASM_EXTENDED_KEY
				2260	add r4, sp, #0x90 @ pass key schedule
				2261	#else
				2262	add r4, $key, #248 @ pass key schedule
				2263	#endif
				2264	veor @XMM[4], @XMM[4], @XMM[12]
				2265	mov r5, $rounds @ pass rounds
				2266	mov r0, sp
				2267
				2268	bl _bsaes_decrypt8
				2269
				2270	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2271	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2272	veor @XMM[0], @XMM[0], @XMM[ 8]
				2273	vld1.64 {@XMM[12]}, [r0,:128]!
				2274	veor @XMM[1], @XMM[1], @XMM[ 9]
				2275	veor @XMM[8], @XMM[6], @XMM[10]
				2276	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2277	veor @XMM[9], @XMM[4], @XMM[11]
				2278	veor @XMM[10], @XMM[2], @XMM[12]
				2279	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2280	vst1.8 {@XMM[10]}, [$out]!
				2281
				2282	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2283	b .Lxts_dec_done
				2284	.align 4
				2285	.Lxts_dec_4: @ four blocks loaded
				2286	vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
				2287
				2288	veor @XMM[2], @XMM[2], @XMM[10]
				2289	#ifndef BSAES_ASM_EXTENDED_KEY
				2290	add r4, sp, #0x90 @ pass key schedule
				2291	#else
				2292	add r4, $key, #248 @ pass key schedule
				2293	#endif
				2294	veor @XMM[3], @XMM[3], @XMM[11]
				2295	mov r5, $rounds @ pass rounds
				2296	mov r0, sp
				2297
				2298	bl _bsaes_decrypt8
				2299
				2300	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2301	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2302	veor @XMM[0], @XMM[0], @XMM[ 8]
				2303	veor @XMM[1], @XMM[1], @XMM[ 9]
				2304	veor @XMM[8], @XMM[6], @XMM[10]
				2305	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2306	veor @XMM[9], @XMM[4], @XMM[11]
				2307	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2308
				2309	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2310	b .Lxts_dec_done
				2311	.align 4
				2312	.Lxts_dec_3: @ three blocks loaded
				2313	vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
				2314
				2315	veor @XMM[1], @XMM[1], @XMM[9]
				2316	#ifndef BSAES_ASM_EXTENDED_KEY
				2317	add r4, sp, #0x90 @ pass key schedule
				2318	#else
				2319	add r4, $key, #248 @ pass key schedule
				2320	#endif
				2321	veor @XMM[2], @XMM[2], @XMM[10]
				2322	mov r5, $rounds @ pass rounds
				2323	mov r0, sp
				2324
				2325	bl _bsaes_decrypt8
				2326
				2327	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				2328	vld1.64 {@XMM[10]}, [r0,:128]!
				2329	veor @XMM[0], @XMM[0], @XMM[ 8]
				2330	veor @XMM[1], @XMM[1], @XMM[ 9]
				2331	veor @XMM[8], @XMM[6], @XMM[10]
				2332	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2333	vst1.8 {@XMM[8]}, [$out]!
				2334
				2335	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2336	b .Lxts_dec_done
				2337	.align 4
				2338	.Lxts_dec_2: @ two blocks loaded
				2339	vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
				2340
				2341	veor @XMM[0], @XMM[0], @XMM[8]
				2342	#ifndef BSAES_ASM_EXTENDED_KEY
				2343	add r4, sp, #0x90 @ pass key schedule
				2344	#else
				2345	add r4, $key, #248 @ pass key schedule
				2346	#endif
				2347	veor @XMM[1], @XMM[1], @XMM[9]
				2348	mov r5, $rounds @ pass rounds
				2349	mov r0, sp
				2350
				2351	bl _bsaes_decrypt8
				2352
				2353	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				2354	veor @XMM[0], @XMM[0], @XMM[ 8]
				2355	veor @XMM[1], @XMM[1], @XMM[ 9]
				2356	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2357
				2358	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2359	b .Lxts_dec_done
				2360	.align 4
				2361	.Lxts_dec_1: @ one block loaded: use AES_decrypt with the slot at sp as in and out buffer
				2362	mov r0, sp
				2363	veor @XMM[0], @XMM[8]
				2364	mov r1, sp
				2365	vst1.8 {@XMM[0]}, [sp,:128]
				2366	mov r2, $key
				2367	mov r4, $fp @ preserve fp
				2368	mov r5, $magic @ preserve magic
				2369
				2370	bl AES_decrypt
				2371
				2372	vld1.8 {@XMM[0]}, [sp,:128]
				2373	veor @XMM[0], @XMM[0], @XMM[8]
				2374	vst1.8 {@XMM[0]}, [$out]!
				2375	mov $fp, r4
				2376	mov $magic, r5
				2377
				2378	vmov @XMM[8], @XMM[9] @ next round tweak
				2379
				2380	.Lxts_dec_done:
				2381	#ifndef XTS_CHAIN_TWEAK
				2382	adds $len, #0x10 @ len = count of trailing bytes beyond the last full block
				2383	beq .Lxts_dec_ret @ none: no ciphertext stealing needed
				2384
				2385	@ calculate one round of extra tweak for the stolen ciphertext
				2386	vldmia $magic, {$twmask}
				2387	vshr.s64 @XMM[6], @XMM[8], #63
				2388	vand @XMM[6], @XMM[6], $twmask
				2389	vadd.u64 @XMM[9], @XMM[8], @XMM[8]
				2390	vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
				2391	veor @XMM[9], @XMM[9], @XMM[6]
				2392
				2393	@ perform the final decryption with the last tweak value
				2394	vld1.8 {@XMM[0]}, [$inp]!
				2395	mov r0, sp
				2396	veor @XMM[0], @XMM[0], @XMM[9]
				2397	mov r1, sp
				2398	vst1.8 {@XMM[0]}, [sp,:128]
				2399	mov r2, $key
				2400	mov r4, $fp @ preserve fp
				2401
				2402	bl AES_decrypt
				2403
				2404	vld1.8 {@XMM[0]}, [sp,:128]
				2405	veor @XMM[0], @XMM[0], @XMM[9]
				2406	vst1.8 {@XMM[0]}, [$out] @ no writeback: this block is revisited below
				2407
				2408	mov r6, $out @ remember the start of this block
				2409	.Lxts_dec_steal:
				2410	ldrb r1, [$out] @ byte of the block just written
				2411	ldrb r0, [$inp], #1 @ next trailing input byte
				2412	strb r1, [$out, #0x10] @ move the output byte 16 bytes further on
				2413	strb r0, [$out], #1 @ and put the input byte in its place
				2414
				2415	subs $len, #1
				2416	bhi .Lxts_dec_steal @ repeat for every trailing byte
				2417
				2418	vld1.8 {@XMM[0]}, [r6] @ reload the patched block
				2419	mov r0, sp
				2420	veor @XMM[0], @XMM[8] @ this block uses the earlier tweak
				2421	mov r1, sp
				2422	vst1.8 {@XMM[0]}, [sp,:128]
				2423	mov r2, $key
				2424
				2425	bl AES_decrypt
				2426
				2427	vld1.8 {@XMM[0]}, [sp,:128]
				2428	veor @XMM[0], @XMM[0], @XMM[8]
				2429	vst1.8 {@XMM[0]}, [r6]
				2430	mov $fp, r4
				2431	#endif
				2432
				2433	.Lxts_dec_ret:
				2434	bic r0, $fp, #0xf @ end address of the area to wipe
				2435	vmov.i32 q0, #0
				2436	vmov.i32 q1, #0
				2437	#ifdef XTS_CHAIN_TWEAK
				2438	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
				2439	#endif
				2440	.Lxts_dec_bzero: @ wipe key schedule [if any]
				2441	vstmia sp!, {q0-q1}
				2442	cmp sp, r0
				2443	bne .Lxts_dec_bzero
				2444
				2445	mov sp, $fp
				2446	#ifdef XTS_CHAIN_TWEAK
				2447	vst1.8 {@XMM[8]}, [r1]
				2448	#endif
				2449	VFP_ABI_POP
				2450	ldmia sp!, {r4-r10, pc} @ return
				2451
				2452	.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
				2453	___
				2454	}
				2455	$code.=<<___;
				2456	#endif
				2457	___
				2458
				2459	# Replace each backtick-quoted span in the generated text with the result
				2460	# of evaluating it as a Perl expression.
				2461	$code =~ s/\`([^\`]*)\`/eval($1)/gem;
				2462
				2463	# Copy this script's leading comment block to the output, turning the
				2464	# leading '#' of each line into the assembler comment character '@'.  The
				2465	# shebang line is dropped and copying stops at the first line that is
				2466	# neither a comment nor empty.  Failing to reopen the script only loses
				2467	# that banner, so it is deliberately not fatal.
				2468	if (open(my $self, '<', $0)) {
				2469		while (my $line = <$self>) {
				2470			next if ($line =~ /^#!/);
				2471			my $was_comment = ($line =~ s/^#/@/);
				2472			last if (!$was_comment and $line !~ /^$/);
				2473			print $line;
				2474		}
				2475		close $self;
				2476	}
				2477
				2478	print $code;
				2479
				2480	# Buffered write errors (full disk, closed pipe) only surface at close.
				2481	# Dying here keeps a truncated assembly file from passing as a good one.
				2482	close STDOUT or die "error closing STDOUT: $!";