Blame - src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl - platform/external/boringssl

blob: 1ff890aa13efc4a955302a21026650d730414fb1 [file] [log] [blame]

Robert Sloan	6f79a50	2017-04-03 09:16:40 -0700	[diff] [blame]	1	#! /usr/bin/env perl
				2	# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
				3	#
				4	# Licensed under the OpenSSL license (the "License"). You may not use
				5	# this file except in compliance with the License. You can obtain a copy
				6	# in the file LICENSE in the source distribution or at
				7	# https://www.openssl.org/source/license.html
				8
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	9
				10	# ====================================================================
				11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
				12	# project. The module is, however, dual licensed under OpenSSL and
				13	# CRYPTOGAMS licenses depending on where you obtain it. For further
				14	# details see http://www.openssl.org/~appro/cryptogams/.
				15	#
				16	# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
				17	# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
				18	# granted.
				19	# ====================================================================
				20
				21	# Bit-sliced AES for ARM NEON
				22	#
				23	# February 2012.
				24	#
				25	# This implementation is direct adaptation of bsaes-x86_64 module for
				26	# ARM NEON. Except that this module is endian-neutral [in sense that
				27	# it can be compiled for either endianness] by courtesy of vld1.8's
				28	# neutrality. Initial version doesn't implement interface to OpenSSL,
				29	# only low-level primitives and unsupported entry points, just enough
				30	# to collect performance results, which for Cortex-A8 core are:
				31	#
				32	# encrypt 19.5 cycles per byte processed with 128-bit key
				33	# decrypt 22.1 cycles per byte processed with 128-bit key
				34	# key conv. 440 cycles per 128-bit key/0.18 of 8x block
				35	#
				36	# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
				37	# which is [much] worse than anticipated (for further details see
				38	# http://www.openssl.org/~appro/Snapdragon-S4.html).
				39	#
				40	# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
				41	# manages in 20.0 cycles].
				42	#
				43	# When comparing to x86_64 results keep in mind that NEON unit is
				44	# [mostly] single-issue and thus can't [fully] benefit from
				45	# instruction-level parallelism. And when comparing to aes-armv4
				46	# results keep in mind key schedule conversion overhead (see
				47	# bsaes-x86_64.pl for further details)...
				48	#
				49	# <appro@openssl.org>
				50
				51	# April-August 2013
				52	#
				53	# Add CBC, CTR and XTS subroutines, adapt for kernel use.
				54	#
				55	# <ard.biesheuvel@linaro.org>
				56
# Command line: [flavour] ... output-file.
# The first argument is the perlasm "flavour" unless it already looks like a
# file name (name.ext), in which case it is the output file and there is no
# flavour.  Otherwise arguments are consumed until one looks like a file name.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script, then in the shared perlasm
    # directory.  $dir is the directory part of $0 (empty if there is none).
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed to STDOUT is piped through the translator.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation requested: write straight to the named file.
    open STDOUT,">",$output
        or die "can't open $output: $!";
}
# With neither a flavour nor an output file, STDOUT is left untouched.
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	71
				72	my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); # symbolic names for ARM core registers r0-r3; interpolated into the emitted assembly
				73	my @XMM=map("q$_",(0..15)); # names of the sixteen NEON quad registers, "q0".."q15"
				74
				75	{
				76	my ($key,$rounds,$const)=("r4","r5","r6"); # register aliases local to this block; $key here shadows the outer $key ("r3")
				77
				78	sub Dlo() { shift=~m\|q([1]?[0-9])\|?"d".($1*2):""; }
				79	sub Dhi() { shift=~m\|q([1]?[0-9])\|?"d".($1*2+1):""; }
				80
				81	sub Sbox {
				82	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
				83	# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
				84	my @b=@_[0..7];
				85	my @t=@_[8..11];
				86	my @s=@_[12..15];
				87	&InBasisChange (@b);
				88	&Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
				89	&OutBasisChange (@b[7,1,4,2,6,5,0,3]);
				90	}
				91
				92	sub InBasisChange {
					# First stage of Sbox: appends thirteen veor instructions to $code
					# that combine the eight registers named in @_[0..7] in place.
					# No temporaries are used; the instruction order is significant
					# because later XORs consume earlier results.
				93	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
Robert Sloan	a94fe05	2017-02-21 08:49:28 -0800	[diff] [blame]	94	# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	95	my @b=@_[0..7];
				96	$code.=<<___;
				97	veor @b[2], @b[2], @b[1]
				98	veor @b[5], @b[5], @b[6]
				99	veor @b[3], @b[3], @b[0]
				100	veor @b[6], @b[6], @b[2]
				101	veor @b[5], @b[5], @b[0]
				102
				103	veor @b[6], @b[6], @b[3]
				104	veor @b[3], @b[3], @b[7]
				105	veor @b[7], @b[7], @b[5]
				106	veor @b[3], @b[3], @b[4]
				107	veor @b[4], @b[4], @b[5]
				108
				109	veor @b[2], @b[2], @b[7]
				110	veor @b[3], @b[3], @b[1]
				111	veor @b[1], @b[1], @b[5]
				112	___
				113	}
				114
				115	sub OutBasisChange {
					# Last stage of Sbox: appends eleven veor instructions to $code that
					# combine the eight registers named in @_[0..7] in place, without
					# temporaries.  Sbox passes the registers already permuted.
				116	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
				117	# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
				118	my @b=@_[0..7];
				119	$code.=<<___;
				120	veor @b[0], @b[0], @b[6]
				121	veor @b[1], @b[1], @b[4]
				122	veor @b[4], @b[4], @b[6]
				123	veor @b[2], @b[2], @b[0]
				124	veor @b[6], @b[6], @b[1]
				125
				126	veor @b[1], @b[1], @b[5]
				127	veor @b[5], @b[5], @b[3]
				128	veor @b[3], @b[3], @b[7]
				129	veor @b[7], @b[7], @b[5]
				130	veor @b[2], @b[2], @b[5]
				131
				132	veor @b[4], @b[4], @b[7]
				133	___
				134	}
				135
				136	sub InvSbox {
				137	# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
				138	# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
				139	my @b=@_[0..7];
				140	my @t=@_[8..11];
				141	my @s=@_[12..15];
				142	&InvInBasisChange (@b);
				143	&Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
				144	&InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
				145	}
				146
				147	sub InvInBasisChange { # OutBasisChange in reverse (with twist)
				148	my @b=@_[5,1,2,6,3,7,0,4];
				149	$code.=<<___
				150	veor @b[1], @b[1], @b[7]
				151	veor @b[4], @b[4], @b[7]
				152
				153	veor @b[7], @b[7], @b[5]
				154	veor @b[1], @b[1], @b[3]
				155	veor @b[2], @b[2], @b[5]
				156	veor @b[3], @b[3], @b[7]
				157
				158	veor @b[6], @b[6], @b[1]
				159	veor @b[2], @b[2], @b[0]
				160	veor @b[5], @b[5], @b[3]
				161	veor @b[4], @b[4], @b[6]
				162	veor @b[0], @b[0], @b[6]
				163	veor @b[1], @b[1], @b[4]
				164	___
				165	}
				166
				167	sub InvOutBasisChange { # InBasisChange in reverse
					# Last stage of InvSbox: re-indexes the eight registers it is given
					# as @_[2,5,7,3,6,1,0,4] and appends thirteen veor instructions to
					# $code that combine them in place, without temporaries.
				168	my @b=@_[2,5,7,3,6,1,0,4];
				169	$code.=<<___;
				170	veor @b[1], @b[1], @b[5]
				171	veor @b[2], @b[2], @b[7]
				172
				173	veor @b[3], @b[3], @b[1]
				174	veor @b[4], @b[4], @b[5]
				175	veor @b[7], @b[7], @b[5]
				176	veor @b[3], @b[3], @b[4]
				177	veor @b[5], @b[5], @b[0]
				178	veor @b[3], @b[3], @b[7]
				179	veor @b[6], @b[6], @b[2]
				180	veor @b[2], @b[2], @b[1]
				181	veor @b[6], @b[6], @b[3]
				182
				183	veor @b[3], @b[3], @b[0]
				184	veor @b[5], @b[5], @b[6]
				185	___
				186	}
				187
				188	sub Mul_GF4 {
				189	#;*************************************************************
				190	#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
				191	#;*************************************************************
				192	my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
				193	$code.=<<___;
				194	veor $t0, $y0, $y1
				195	vand $t0, $t0, $x0
				196	veor $x0, $x0, $x1
				197	vand $t1, $x1, $y0
				198	vand $x0, $x0, $y1
				199	veor $x1, $t1, $t0
				200	veor $x0, $x0, $t1
				201	___
				202	}
				203
				204	sub Mul_GF4_N { # not used, see next subroutine
				205	# multiply and scale by N
				206	my ($x0,$x1,$y0,$y1,$t0)=@_;
				207	$code.=<<___;
				208	veor $t0, $y0, $y1
				209	vand $t0, $t0, $x0
				210	veor $x0, $x0, $x1
				211	vand $x1, $x1, $y0
				212	vand $x0, $x0, $y1
				213	veor $x1, $x1, $x0
				214	veor $x0, $x0, $t0
				215	___
				216	}
				217
				218	sub Mul_GF4_N_GF4 {
				219	# interleaved Mul_GF4_N and Mul_GF4
				220	my ($x0,$x1,$y0,$y1,$t0,
				221	$x2,$x3,$y2,$y3,$t1)=@_;
				222	$code.=<<___;
				223	veor $t0, $y0, $y1
				224	veor $t1, $y2, $y3
				225	vand $t0, $t0, $x0
				226	vand $t1, $t1, $x2
				227	veor $x0, $x0, $x1
				228	veor $x2, $x2, $x3
				229	vand $x1, $x1, $y0
				230	vand $x3, $x3, $y2
				231	vand $x0, $x0, $y1
				232	vand $x2, $x2, $y3
				233	veor $x1, $x1, $x0
				234	veor $x2, $x2, $x3
				235	veor $x0, $x0, $t0
				236	veor $x3, $x3, $t1
				237	___
				238	}
				239	sub Mul_GF16_2 {
					# Emits two products that share one multiplier: x[0..3] and x[4..7]
					# are each combined with y[0..3], results replacing x[0..7].
					# Arguments: eight x registers, four y registers, four temporaries.
					# y[0]/y[1] are XORed with y[2]/y[3] before the first interleaved
					# multiply and XORed again after the second, which restores them
					# (the helper multiplies only read their y operands).
				240	my @x=@_[0..7];
				241	my @y=@_[8..11];
				242	my @t=@_[12..15];
				243	$code.=<<___;
				244	veor @t[0], @x[0], @x[2]
				245	veor @t[1], @x[1], @x[3]
				246	___
					# Mul_GF4 gets t[2] and t[3] as its two temporaries.
				247	&Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
				248	$code.=<<___;
				249	veor @y[0], @y[0], @y[2]
				250	veor @y[1], @y[1], @y[3]
				251	___
				252	Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
				253	@x[2], @x[3], @y[2], @y[3], @t[2]);
				254	$code.=<<___;
				255	veor @x[0], @x[0], @t[0]
				256	veor @x[2], @x[2], @t[0]
				257	veor @x[1], @x[1], @t[1]
				258	veor @x[3], @x[3], @t[1]
				259
				260	veor @t[0], @x[4], @x[6]
				261	veor @t[1], @x[5], @x[7]
				262	___
					# Second half: same sequence on x[4..7], with the two multiplies
					# issued in the opposite order.
				263	&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
				264	@x[6], @x[7], @y[2], @y[3], @t[2]);
				265	$code.=<<___;
				266	veor @y[0], @y[0], @y[2]
				267	veor @y[1], @y[1], @y[3]
				268	___
				269	&Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
				270	$code.=<<___;
				271	veor @x[4], @x[4], @t[0]
				272	veor @x[6], @x[6], @t[0]
				273	veor @x[5], @x[5], @t[1]
				274	veor @x[7], @x[7], @t[1]
				275	___
				276	}
				277	sub Inv_GF256 {
					# Inversion core shared by Sbox and InvSbox.  Arguments: eight data
					# registers x0-x7, four temporaries t0-t3, four scratch registers
					# s0-s3.  Emits one straight-line instruction block and then hands
					# x plus the intermediate values held in s3, s2, s1, t1 to
					# Mul_GF16_2, with s0, t0, t2, t3 as that routine's temporaries.
				278	#;********************************************************************
				279	#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
				280	#;********************************************************************
				281	my @x=@_[0..7];
				282	my @t=@_[8..11];
				283	my @s=@_[12..15];
				284	# direct optimizations from hardware
				285	$code.=<<___;
				286	veor @t[3], @x[4], @x[6]
				287	veor @t[2], @x[5], @x[7]
				288	veor @t[1], @x[1], @x[3]
				289	veor @s[1], @x[7], @x[6]
				290	vmov @t[0], @t[2]
				291	veor @s[0], @x[0], @x[2]
				292
				293	vorr @t[2], @t[2], @t[1]
				294	veor @s[3], @t[3], @t[0]
				295	vand @s[2], @t[3], @s[0]
				296	vorr @t[3], @t[3], @s[0]
				297	veor @s[0], @s[0], @t[1]
				298	vand @t[0], @t[0], @t[1]
				299	veor @t[1], @x[3], @x[2]
				300	vand @s[3], @s[3], @s[0]
				301	vand @s[1], @s[1], @t[1]
				302	veor @t[1], @x[4], @x[5]
				303	veor @s[0], @x[1], @x[0]
				304	veor @t[3], @t[3], @s[1]
				305	veor @t[2], @t[2], @s[1]
				306	vand @s[1], @t[1], @s[0]
				307	vorr @t[1], @t[1], @s[0]
				308	veor @t[3], @t[3], @s[3]
				309	veor @t[0], @t[0], @s[1]
				310	veor @t[2], @t[2], @s[2]
				311	veor @t[1], @t[1], @s[3]
				312	veor @t[0], @t[0], @s[2]
				313	vand @s[0], @x[7], @x[3]
				314	veor @t[1], @t[1], @s[2]
				315	vand @s[1], @x[6], @x[2]
				316	vand @s[2], @x[5], @x[1]
				317	vorr @s[3], @x[4], @x[0]
				318	veor @t[3], @t[3], @s[0]
				319	veor @t[1], @t[1], @s[2]
				320	veor @t[0], @t[0], @s[3]
				321	veor @t[2], @t[2], @s[1]
				322
				323	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
				324
				325	@ new smaller inversion
				326
				327	vand @s[2], @t[3], @t[1]
				328	vmov @s[0], @t[0]
				329
				330	veor @s[1], @t[2], @s[2]
				331	veor @s[3], @t[0], @s[2]
				332	veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
				333
				334	vbsl @s[1], @t[1], @t[0]
				335	vbsl @s[3], @t[3], @t[2]
				336	veor @t[3], @t[3], @t[2]
				337
				338	vbsl @s[0], @s[1], @s[2]
				339	vbsl @t[0], @s[2], @s[1]
				340
				341	vand @s[2], @s[0], @s[3]
				342	veor @t[1], @t[1], @t[0]
				343
				344	veor @s[2], @s[2], @t[3]
				345	___
				346	# output in s3, s2, s1, t1
				347
				348	# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
				349
				350	# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
				351	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
				352
				353	### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
				354	}
				355
				356	# AES linear components
				357
				358	sub ShiftRows {
					# Combined round-key addition and byte permutation for one round.
					# Arguments: eight data registers, four temporaries, and as the LAST
					# argument the register holding the vtbl.8 permutation mask.
					# The emitted code loads eight quad registers' worth of key material
					# from [$key] with write-back (four at first, then one at a time into
					# each temporary as it becomes free), XORs each with the matching
					# data register, and writes the vtbl.8-permuted result back into that
					# data register, low half then high half.
					# The `&Dlo(...)`/`&Dhi(...)` backtick expressions are left in the
					# text here; presumably they are evaluated when $code is
					# post-processed outside this view - TODO confirm.
				359	my @x=@_[0..7];
				360	my @t=@_[8..11];
				361	my $mask=pop;
				362	$code.=<<___;
				363	vldmia $key!, {@t[0]-@t[3]}
				364	veor @t[0], @t[0], @x[0]
				365	veor @t[1], @t[1], @x[1]
				366	vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
				367	vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
				368	vldmia $key!, {@t[0]}
				369	veor @t[2], @t[2], @x[2]
				370	vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
				371	vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
				372	vldmia $key!, {@t[1]}
				373	veor @t[3], @t[3], @x[3]
				374	vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
				375	vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
				376	vldmia $key!, {@t[2]}
				377	vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
				378	vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
				379	vldmia $key!, {@t[3]}
				380	veor @t[0], @t[0], @x[4]
				381	veor @t[1], @t[1], @x[5]
				382	vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
				383	vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
				384	veor @t[2], @t[2], @x[6]
				385	vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
				386	vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
				387	veor @t[3], @t[3], @x[7]
				388	vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
				389	vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
				390	vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
				391	vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
				392	___
				393	}
				394
				395	sub MixColumns {
					# Arguments: eight data registers, eight temporaries, and an optional
					# 17th flag.  A common body is emitted first; then exactly one of two
					# tails is appended: the first when the flag is false, the second
					# when it is true.  InvMixColumns calls this routine with the flag
					# set to 1.  The two tails differ in which x registers receive the
					# final values.
				396	# modified to emit output in order suitable for feeding back to aesenc[last]
				397	my @x=@_[0..7];
				398	my @t=@_[8..15];
				399	my $inv=@_[16]; # optional
				400	$code.=<<___;
				401	vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
				402	vext.8 @t[1], @x[1], @x[1], #12
				403	veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
				404	vext.8 @t[2], @x[2], @x[2], #12
				405	veor @x[1], @x[1], @t[1]
				406	vext.8 @t[3], @x[3], @x[3], #12
				407	veor @x[2], @x[2], @t[2]
				408	vext.8 @t[4], @x[4], @x[4], #12
				409	veor @x[3], @x[3], @t[3]
				410	vext.8 @t[5], @x[5], @x[5], #12
				411	veor @x[4], @x[4], @t[4]
				412	vext.8 @t[6], @x[6], @x[6], #12
				413	veor @x[5], @x[5], @t[5]
				414	vext.8 @t[7], @x[7], @x[7], #12
				415	veor @x[6], @x[6], @t[6]
				416
				417	veor @t[1], @t[1], @x[0]
				418	veor @x[7], @x[7], @t[7]
				419	vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
				420	veor @t[2], @t[2], @x[1]
				421	veor @t[0], @t[0], @x[7]
				422	veor @t[1], @t[1], @x[7]
				423	vext.8 @x[1], @x[1], @x[1], #8
				424	veor @t[5], @t[5], @x[4]
				425	veor @x[0], @x[0], @t[0]
				426	veor @t[6], @t[6], @x[5]
				427	veor @x[1], @x[1], @t[1]
				428	vext.8 @t[0], @x[4], @x[4], #8
				429	veor @t[4], @t[4], @x[3]
				430	vext.8 @t[1], @x[5], @x[5], #8
				431	veor @t[7], @t[7], @x[6]
				432	vext.8 @x[4], @x[3], @x[3], #8
				433	veor @t[3], @t[3], @x[2]
				434	vext.8 @x[5], @x[7], @x[7], #8
				435	veor @t[4], @t[4], @x[7]
				436	vext.8 @x[3], @x[6], @x[6], #8
				437	veor @t[3], @t[3], @x[7]
				438	vext.8 @x[6], @x[2], @x[2], #8
				439	veor @x[7], @t[1], @t[5]
				440	___
					# Tail for the forward transform (flag absent or false).
				441	$code.=<<___ if (!$inv);
				442	veor @x[2], @t[0], @t[4]
				443	veor @x[4], @x[4], @t[3]
				444	veor @x[5], @x[5], @t[7]
				445	veor @x[3], @x[3], @t[6]
				446	@ vmov @x[2], @t[0]
				447	veor @x[6], @x[6], @t[2]
				448	@ vmov @x[7], @t[1]
				449	___
					# Tail used when the flag is true (the InvMixColumns path).
				450	$code.=<<___ if ($inv);
				451	veor @t[3], @t[3], @x[4]
				452	veor @x[5], @x[5], @t[7]
				453	veor @x[2], @x[3], @t[6]
				454	veor @x[3], @t[0], @t[4]
				455	veor @x[4], @x[6], @t[2]
				456	vmov @x[6], @t[3]
				457	@ vmov @x[7], @t[1]
				458	___
				459	}
				460
				461	sub InvMixColumns_orig {
					# Earlier, fully unrolled inverse MixColumns emitter.  Nothing in the
					# visible part of this file calls it; the decrypt loop uses
					# InvMixColumns instead.
					# Arguments: eight data registers and eight temporaries.
					# The emitted code is split into four commented sections
					# (multiplication by 0x0e, 0x0b, 0x0d, 0x09).  From the second
					# section on, the data registers are addressed through @y, a
					# re-indexed view of @x.
					# NOTE: the final results are written to @XMM[0..7] by name rather
					# than through @x/@y, so the output location does not depend on the
					# registers passed in.
				462	my @x=@_[0..7];
				463	my @t=@_[8..15];
				464
				465	$code.=<<___;
				466	@ multiplication by 0x0e
				467	vext.8 @t[7], @x[7], @x[7], #12
				468	vmov @t[2], @x[2]
				469	veor @x[2], @x[2], @x[5] @ 2 5
				470	veor @x[7], @x[7], @x[5] @ 7 5
				471	vext.8 @t[0], @x[0], @x[0], #12
				472	vmov @t[5], @x[5]
				473	veor @x[5], @x[5], @x[0] @ 5 0 [1]
				474	veor @x[0], @x[0], @x[1] @ 0 1
				475	vext.8 @t[1], @x[1], @x[1], #12
				476	veor @x[1], @x[1], @x[2] @ 1 25
				477	veor @x[0], @x[0], @x[6] @ 01 6 [2]
				478	vext.8 @t[3], @x[3], @x[3], #12
				479	veor @x[1], @x[1], @x[3] @ 125 3 [4]
				480	veor @x[2], @x[2], @x[0] @ 25 016 [3]
				481	veor @x[3], @x[3], @x[7] @ 3 75
				482	veor @x[7], @x[7], @x[6] @ 75 6 [0]
				483	vext.8 @t[6], @x[6], @x[6], #12
				484	vmov @t[4], @x[4]
				485	veor @x[6], @x[6], @x[4] @ 6 4
				486	veor @x[4], @x[4], @x[3] @ 4 375 [6]
				487	veor @x[3], @x[3], @x[7] @ 375 756=36
				488	veor @x[6], @x[6], @t[5] @ 64 5 [7]
				489	veor @x[3], @x[3], @t[2] @ 36 2
				490	vext.8 @t[5], @t[5], @t[5], #12
				491	veor @x[3], @x[3], @t[4] @ 362 4 [5]
				492	___
					# @y re-indexes the data registers for the remaining sections.
				493	my @y = @x[7,5,0,2,1,3,4,6];
				494	$code.=<<___;
				495	@ multiplication by 0x0b
				496	veor @y[1], @y[1], @y[0]
				497	veor @y[0], @y[0], @t[0]
				498	vext.8 @t[2], @t[2], @t[2], #12
				499	veor @y[1], @y[1], @t[1]
				500	veor @y[0], @y[0], @t[5]
				501	vext.8 @t[4], @t[4], @t[4], #12
				502	veor @y[1], @y[1], @t[6]
				503	veor @y[0], @y[0], @t[7]
				504	veor @t[7], @t[7], @t[6] @ clobber t[7]
				505
				506	veor @y[3], @y[3], @t[0]
				507	veor @y[1], @y[1], @y[0]
				508	vext.8 @t[0], @t[0], @t[0], #12
				509	veor @y[2], @y[2], @t[1]
				510	veor @y[4], @y[4], @t[1]
				511	vext.8 @t[1], @t[1], @t[1], #12
				512	veor @y[2], @y[2], @t[2]
				513	veor @y[3], @y[3], @t[2]
				514	veor @y[5], @y[5], @t[2]
				515	veor @y[2], @y[2], @t[7]
				516	vext.8 @t[2], @t[2], @t[2], #12
				517	veor @y[3], @y[3], @t[3]
				518	veor @y[6], @y[6], @t[3]
				519	veor @y[4], @y[4], @t[3]
				520	veor @y[7], @y[7], @t[4]
				521	vext.8 @t[3], @t[3], @t[3], #12
				522	veor @y[5], @y[5], @t[4]
				523	veor @y[7], @y[7], @t[7]
				524	veor @t[7], @t[7], @t[5] @ clobber t[7] even more
				525	veor @y[3], @y[3], @t[5]
				526	veor @y[4], @y[4], @t[4]
				527
				528	veor @y[5], @y[5], @t[7]
				529	vext.8 @t[4], @t[4], @t[4], #12
				530	veor @y[6], @y[6], @t[7]
				531	veor @y[4], @y[4], @t[7]
				532
				533	veor @t[7], @t[7], @t[5]
				534	vext.8 @t[5], @t[5], @t[5], #12
				535
				536	@ multiplication by 0x0d
				537	veor @y[4], @y[4], @y[7]
				538	veor @t[7], @t[7], @t[6] @ restore t[7]
				539	veor @y[7], @y[7], @t[4]
				540	vext.8 @t[6], @t[6], @t[6], #12
				541	veor @y[2], @y[2], @t[0]
				542	veor @y[7], @y[7], @t[5]
				543	vext.8 @t[7], @t[7], @t[7], #12
				544	veor @y[2], @y[2], @t[2]
				545
				546	veor @y[3], @y[3], @y[1]
				547	veor @y[1], @y[1], @t[1]
				548	veor @y[0], @y[0], @t[0]
				549	veor @y[3], @y[3], @t[0]
				550	veor @y[1], @y[1], @t[5]
				551	veor @y[0], @y[0], @t[5]
				552	vext.8 @t[0], @t[0], @t[0], #12
				553	veor @y[1], @y[1], @t[7]
				554	veor @y[0], @y[0], @t[6]
				555	veor @y[3], @y[3], @y[1]
				556	veor @y[4], @y[4], @t[1]
				557	vext.8 @t[1], @t[1], @t[1], #12
				558
				559	veor @y[7], @y[7], @t[7]
				560	veor @y[4], @y[4], @t[2]
				561	veor @y[5], @y[5], @t[2]
				562	veor @y[2], @y[2], @t[6]
				563	veor @t[6], @t[6], @t[3] @ clobber t[6]
				564	vext.8 @t[2], @t[2], @t[2], #12
				565	veor @y[4], @y[4], @y[7]
				566	veor @y[3], @y[3], @t[6]
				567
				568	veor @y[6], @y[6], @t[6]
				569	veor @y[5], @y[5], @t[5]
				570	vext.8 @t[5], @t[5], @t[5], #12
				571	veor @y[6], @y[6], @t[4]
				572	vext.8 @t[4], @t[4], @t[4], #12
				573	veor @y[5], @y[5], @t[6]
				574	veor @y[6], @y[6], @t[7]
				575	vext.8 @t[7], @t[7], @t[7], #12
				576	veor @t[6], @t[6], @t[3] @ restore t[6]
				577	vext.8 @t[3], @t[3], @t[3], #12
				578
				579	@ multiplication by 0x09
				580	veor @y[4], @y[4], @y[1]
				581	veor @t[1], @t[1], @y[1] @ t[1]=y[1]
				582	veor @t[0], @t[0], @t[5] @ clobber t[0]
				583	vext.8 @t[6], @t[6], @t[6], #12
				584	veor @t[1], @t[1], @t[5]
				585	veor @y[3], @y[3], @t[0]
				586	veor @t[0], @t[0], @y[0] @ t[0]=y[0]
				587	veor @t[1], @t[1], @t[6]
				588	veor @t[6], @t[6], @t[7] @ clobber t[6]
				589	veor @y[4], @y[4], @t[1]
				590	veor @y[7], @y[7], @t[4]
				591	veor @y[6], @y[6], @t[3]
				592	veor @y[5], @y[5], @t[2]
				593	veor @t[4], @t[4], @y[4] @ t[4]=y[4]
				594	veor @t[3], @t[3], @y[3] @ t[3]=y[3]
				595	veor @t[5], @t[5], @y[5] @ t[5]=y[5]
				596	veor @t[2], @t[2], @y[2] @ t[2]=y[2]
				597	veor @t[3], @t[3], @t[7]
				598	veor @XMM[5], @t[5], @t[6]
				599	veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
				600	veor @XMM[2], @t[2], @t[6]
				601	veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
				602
				603	vmov @XMM[0], @t[0]
				604	vmov @XMM[1], @t[1]
				605	@ vmov @XMM[2], @t[2]
				606	vmov @XMM[3], @t[3]
				607	vmov @XMM[4], @t[4]
				608	@ vmov @XMM[5], @t[5]
				609	@ vmov @XMM[6], @t[6]
				610	@ vmov @XMM[7], @t[7]
				611	___
				612	}
				613
				614	sub InvMixColumns {
					# Inverse MixColumns built from two steps, following the matrix
					# factorisation shown below: first emit the multiplication by the
					# 05-00-04-00 matrix, then reuse MixColumns with its optional flag
					# set to 1.
					# Arguments: eight data registers and eight temporaries.
				615	my @x=@_[0..7];
				616	my @t=@_[8..15];
				617
				618	# Thanks to Jussi Kivilinna for providing pointer to
				619	#
				620	# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
				621	# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
				622	# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
				623	# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
				624
				625	$code.=<<___;
				626	@ multiplication by 0x05-0x00-0x04-0x00
				627	vext.8 @t[0], @x[0], @x[0], #8
				628	vext.8 @t[6], @x[6], @x[6], #8
				629	vext.8 @t[7], @x[7], @x[7], #8
				630	veor @t[0], @t[0], @x[0]
				631	vext.8 @t[1], @x[1], @x[1], #8
				632	veor @t[6], @t[6], @x[6]
				633	vext.8 @t[2], @x[2], @x[2], #8
				634	veor @t[7], @t[7], @x[7]
				635	vext.8 @t[3], @x[3], @x[3], #8
				636	veor @t[1], @t[1], @x[1]
				637	vext.8 @t[4], @x[4], @x[4], #8
				638	veor @t[2], @t[2], @x[2]
				639	vext.8 @t[5], @x[5], @x[5], #8
				640	veor @t[3], @t[3], @x[3]
				641	veor @t[4], @t[4], @x[4]
				642	veor @t[5], @t[5], @x[5]
				643
				644	veor @x[0], @x[0], @t[6]
				645	veor @x[1], @x[1], @t[6]
				646	veor @x[2], @x[2], @t[0]
				647	veor @x[4], @x[4], @t[2]
				648	veor @x[3], @x[3], @t[1]
				649	veor @x[1], @x[1], @t[7]
				650	veor @x[2], @x[2], @t[7]
				651	veor @x[4], @x[4], @t[6]
				652	veor @x[5], @x[5], @t[3]
				653	veor @x[3], @x[3], @t[6]
				654	veor @x[6], @x[6], @t[4]
				655	veor @x[4], @x[4], @t[7]
				656	veor @x[5], @x[5], @t[7]
				657	veor @x[7], @x[7], @t[5]
				658	___
				659	&MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
				660	}
				661
				662	sub swapmove {
				663	my ($a,$b,$n,$mask,$t)=@_;
				664	$code.=<<___;
				665	vshr.u64 $t, $b, #$n
				666	veor $t, $t, $a
				667	vand $t, $t, $mask
				668	veor $a, $a, $t
				669	vshl.u64 $t, $t, #$n
				670	veor $b, $b, $t
				671	___
				672	}
				673	sub swapmove2x {
				674	my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
				675	$code.=<<___;
				676	vshr.u64 $t0, $b0, #$n
				677	vshr.u64 $t1, $b1, #$n
				678	veor $t0, $t0, $a0
				679	veor $t1, $t1, $a1
				680	vand $t0, $t0, $mask
				681	vand $t1, $t1, $mask
				682	veor $a0, $a0, $t0
				683	vshl.u64 $t0, $t0, #$n
				684	veor $a1, $a1, $t1
				685	vshl.u64 $t1, $t1, #$n
				686	veor $b0, $b0, $t0
				687	veor $b1, $b1, $t1
				688	___
				689	}
				690
				691	sub bitslice {
				692	my @x=reverse(@_[0..7]);
				693	my ($t0,$t1,$t2,$t3)=@_[8..11];
				694	$code.=<<___;
				695	vmov.i8 $t0,#0x55 @ compose .LBS0
				696	vmov.i8 $t1,#0x33 @ compose .LBS1
				697	___
				698	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
				699	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
				700	$code.=<<___;
				701	vmov.i8 $t0,#0x0f @ compose .LBS2
				702	___
				703	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
				704	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
				705
				706	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
				707	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
				708	}
				709
				710	$code.=<<___; # assembler preamble (ABI macros, arch/fpu/mode directives) and the prologue of _bsaes_decrypt8
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	711	#ifndef __KERNEL__
Kenny Root	b849459	2015-09-25 02:29:14 +0000	[diff] [blame]	712	# include <openssl/arm_arch.h>
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	713
				714	# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
				715	# define VFP_ABI_POP vldmia sp!,{d8-d15}
				716	# define VFP_ABI_FRAME 0x40
				717	#else
				718	# define VFP_ABI_PUSH
				719	# define VFP_ABI_POP
				720	# define VFP_ABI_FRAME 0
				721	# define BSAES_ASM_EXTENDED_KEY
				722	# define XTS_CHAIN_TWEAK
				723	# define __ARM_ARCH__ __LINUX_ARM_ARCH__
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	724	# define __ARM_MAX_ARCH__ 7
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	725	#endif
				726
				727	#ifdef __thumb__
				728	# define adrl adr
				729	#endif
				730
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	731	#if __ARM_MAX_ARCH__>=7
				732	.arch armv7-a
				733	.fpu neon
				734
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	735	.text
				736	.syntax unified @ ARMv7-capable assembler is expected to handle this
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	737	#if defined(__thumb2__) && !defined(__APPLE__)
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	738	.thumb
				739	#else
				740	.code 32
Robert Sloan	6f79a50	2017-04-03 09:16:40 -0700	[diff] [blame]	741	# undef __thumb2__
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	742	#endif
				743
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	744	.type _bsaes_decrypt8,%function
				745	.align 4
				746	_bsaes_decrypt8:
Robert Sloan	d5c2215	2017-11-13 09:22:12 -0800	[diff] [blame]	747	adr $const,.
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	748	vldmia $key!, {@XMM[9]} @ round 0 key
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	749	#ifdef __APPLE__
				750	adr $const,.LM0ISR
				751	#else
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	752	add $const,$const,#.LM0ISR-_bsaes_decrypt8
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	753	#endif
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	754
				755	vldmia $const!, {@XMM[8]} @ .LM0ISR
				756	veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
				757	veor @XMM[11], @XMM[1], @XMM[9]
				758	vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				759	vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				760	veor @XMM[12], @XMM[2], @XMM[9]
				761	vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				762	vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				763	veor @XMM[13], @XMM[3], @XMM[9]
				764	vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
				765	vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
				766	veor @XMM[14], @XMM[4], @XMM[9]
				767	vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
				768	vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
				769	veor @XMM[15], @XMM[5], @XMM[9]
				770	vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
				771	vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
				772	veor @XMM[10], @XMM[6], @XMM[9]
				773	vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
				774	vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
				775	veor @XMM[11], @XMM[7], @XMM[9]
				776	vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				777	vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				778	vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				779	vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				780	___
					# Bit-slice the eight blocks in q0-q7; q8-q11 serve as scratch.
				781	&bitslice (@XMM[0..7, 8..11]);
					# The prologue has already applied the round-0 key and a permutation,
					# so the first pass branches straight to .Ldec_sbox and skips the
					# ShiftRows call at the top of the loop.
				782	$code.=<<___;
				783	sub $rounds,$rounds,#1
				784	b .Ldec_sbox
				785	.align 4
				786	.Ldec_loop:
				787	___
					# ShiftRows pops its last argument as the permutation mask: q12 here.
				788	&ShiftRows (@XMM[0..7, 8..12]);
				789	$code.=".Ldec_sbox:\n";
				790	&InvSbox (@XMM[0..7, 8..15]);
					# Leave the loop once the round counter borrows.
				791	$code.=<<___;
				792	subs $rounds,$rounds,#1
				793	bcc .Ldec_done
				794	___
					# Register order matches the output order documented on InvSbox.
				795	&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
					# Load the .LISR mask for the next pass; when the counter has just
					# reached zero, step $const forward by 0x10 and load .LISRM0 instead.
				796	$code.=<<___;
				797	vldmia $const, {@XMM[12]} @ .LISR
				798	ite eq @ Thumb2 thing, sanity check in ARM
				799	addeq $const,$const,#0x10
				800	bne .Ldec_loop
				801	vldmia $const, {@XMM[12]} @ .LISRM0
				802	b .Ldec_loop
				803	.align 4
				804	.Ldec_done:
				805	___
					# Undo the bit-slicing, using the same permuted register order.
				806	&bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
				807	$code.=<<___; # tail of _bsaes_decrypt8 (last round key), the _bsaes_const permutation tables, and the prologue of _bsaes_encrypt8
				808	vldmia $key, {@XMM[8]} @ last round key
				809	veor @XMM[6], @XMM[6], @XMM[8]
				810	veor @XMM[4], @XMM[4], @XMM[8]
				811	veor @XMM[2], @XMM[2], @XMM[8]
				812	veor @XMM[7], @XMM[7], @XMM[8]
				813	veor @XMM[3], @XMM[3], @XMM[8]
				814	veor @XMM[5], @XMM[5], @XMM[8]
				815	veor @XMM[0], @XMM[0], @XMM[8]
				816	veor @XMM[1], @XMM[1], @XMM[8]
				817	bx lr
				818	.size _bsaes_decrypt8,.-_bsaes_decrypt8
				819
				820	.type _bsaes_const,%object
				821	.align 6
				822	_bsaes_const:
				823	.LM0ISR: @ InvShiftRows constants
				824	.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
				825	.LISR:
				826	.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
				827	.LISRM0:
				828	.quad 0x01040b0e0205080f, 0x0306090c00070a0d
				829	.LM0SR: @ ShiftRows constants
				830	.quad 0x0a0e02060f03070b, 0x0004080c05090d01
				831	.LSR:
				832	.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
				833	.LSRM0:
				834	.quad 0x0304090e00050a0f, 0x01060b0c0207080d
				835	.LM0:
				836	.quad 0x02060a0e03070b0f, 0x0004080c0105090d
				837	.LREVM0SR:
				838	.quad 0x090d01050c000408, 0x03070b0f060a0e02
				839	.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
				840	.align 6
				841	.size _bsaes_const,.-_bsaes_const
				842
				843	.type _bsaes_encrypt8,%function
				844	.align 4
				845	_bsaes_encrypt8:
Robert Sloan	d5c2215	2017-11-13 09:22:12 -0800	[diff] [blame]	846	adr $const,.
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	847	vldmia $key!, {@XMM[9]} @ round 0 key
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	848	#ifdef __APPLE__
				849	adr $const,.LM0SR
				850	#else
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	851	sub $const,$const,#_bsaes_encrypt8-.LM0SR
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	852	#endif
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	853
				854	vldmia $const!, {@XMM[8]} @ .LM0SR
				855	_bsaes_encrypt8_alt:
				856	veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
				857	veor @XMM[11], @XMM[1], @XMM[9]
				858	vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				859	vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				860	veor @XMM[12], @XMM[2], @XMM[9]
				861	vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				862	vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				863	veor @XMM[13], @XMM[3], @XMM[9]
				864	vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
				865	vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
				866	veor @XMM[14], @XMM[4], @XMM[9]
				867	vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
				868	vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
				869	veor @XMM[15], @XMM[5], @XMM[9]
				870	vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
				871	vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
				872	veor @XMM[10], @XMM[6], @XMM[9]
				873	vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
				874	vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
				875	veor @XMM[11], @XMM[7], @XMM[9]
				876	vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
				877	vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
				878	vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
				879	vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
				880	_bsaes_encrypt8_bitslice:
				881	___
					# Bit-slice the eight blocks in q0-q7; q8-q11 serve as scratch.
				882	&bitslice (@XMM[0..7, 8..11]);
					# As in the decrypt path, the first pass enters at .Lenc_sbox and
					# skips the ShiftRows call at the top of the loop.
				883	$code.=<<___;
				884	sub $rounds,$rounds,#1
				885	b .Lenc_sbox
				886	.align 4
				887	.Lenc_loop:
				888	___
					# ShiftRows pops its last argument as the permutation mask: q12 here.
				889	&ShiftRows (@XMM[0..7, 8..12]);
				890	$code.=".Lenc_sbox:\n";
				891	&Sbox (@XMM[0..7, 8..15]);
					# Leave the loop once the round counter borrows.
				892	$code.=<<___;
				893	subs $rounds,$rounds,#1
				894	bcc .Lenc_done
				895	___
					# Register order matches the output order documented on Sbox.
				896	&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
				897	$code.=<<___;
				898	vldmia $const, {@XMM[12]} @ .LSR
				899	ite eq @ Thumb2 thing, samity check in ARM
				900	addeq $const,$const,#0x10
				901	bne .Lenc_loop
				902	vldmia $const, {@XMM[12]} @ .LSRM0
				903	b .Lenc_loop
				904	.align 4
				905	.Lenc_done:
				906	___
				907	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
				908	&bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
				909	$code.=<<___;
				910	vldmia $key, {@XMM[8]} @ last round key
				911	veor @XMM[4], @XMM[4], @XMM[8]
				912	veor @XMM[6], @XMM[6], @XMM[8]
				913	veor @XMM[3], @XMM[3], @XMM[8]
				914	veor @XMM[7], @XMM[7], @XMM[8]
				915	veor @XMM[2], @XMM[2], @XMM[8]
				916	veor @XMM[5], @XMM[5], @XMM[8]
				917	veor @XMM[0], @XMM[0], @XMM[8]
				918	veor @XMM[1], @XMM[1], @XMM[8]
				919	bx lr
				920	.size _bsaes_encrypt8,.-_bsaes_encrypt8
				921	___
				922	}
				923	{
					# Generator scope for the key-schedule conversion code. The names below
					# are the private register convention of _bsaes_key_convert:
					# $out = r12 (where the converted schedule is written), $inp = r4
					# (conventional round keys), $rounds = r5, $const = r6 (constant table).
				924	my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
				925
					# bitslice_key(x0..x7, bs0, bs1, bs2, t2, t3)
					# Appends to $code a swapmove network over eight registers. The first
					# eight arguments are taken in reverse order as @x; the next five are a
					# mask register for each of the three stages (shift 1, 2, 4) and two
					# temporaries. Where a full bitslice would run a second swapmove per
					# stage, this variant emits plain vmov copies instead (the replaced
					# calls are kept as comments). No call to this sub is visible in this
					# part of the file.
				926	sub bitslice_key {
				927	my @x=reverse(@_[0..7]);
				928	my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
				929
					# Stage 1 (shift 1): swapmove on x[0]/x[1], then copy them to x[2]/x[3].
				930	&swapmove (@x[0,1],1,$bs0,$t2,$t3);
				931	$code.=<<___;
				932	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
				933	vmov @x[2], @x[0]
				934	vmov @x[3], @x[1]
				935	___
				936	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
				937
					# Stage 2 (shift 2): swapmove2x on x[0..3], then copy x[0..3] to x[4..7].
				938	&swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
				939	$code.=<<___;
				940	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
				941	vmov @x[4], @x[0]
				942	vmov @x[6], @x[2]
				943	vmov @x[5], @x[1]
				944	vmov @x[7], @x[3]
				945	___
					# Stage 3 (shift 4): both halves are processed, sharing the mask $bs2.
				946	&swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
				947	&swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
				948	}
				949
					# _bsaes_key_convert (internal routine, emitted as ARM assembly).
					# In:  $inp (r4) -> round keys, $rounds (r5) = number of rounds,
					#      $out (r12) -> destination buffer.
					# Work done by the template:
					#  - $const (r6) is set to the address of .LM0 and left there (the table
					#    is read without write-back).
					#  - The round 0 key is stored to $out as loaded, except for a per-word
					#    byte reversal (vrev32.8) under __ARMEL__.
					#  - The loop runs $rounds-1 times. Each pass permutes the current round
					#    key through .LM0, tests it against the masks 0x01..0x80 to produce
					#    eight registers (one per bit position), inverts the registers for
					#    bits 0, 1, 5 and 6 (the set bits of 0x63) and stores all eight, i.e.
					#    128 bytes per round key.
					#  - The last round key is not stored. It is left in @XMM[15], and
					#    @XMM[7] is filled with 0x63 bytes, so each caller applies its own
					#    fix-up after the call. $out points just past the last stored key.
				950	$code.=<<___;
				951	.type _bsaes_key_convert,%function
				952	.align 4
				953	_bsaes_key_convert:
Robert Sloan	d5c2215	2017-11-13 09:22:12 -0800	[diff] [blame]	954	adr $const,.
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	955	vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	956	#ifdef __APPLE__
				957	adr $const,.LM0
				958	#else
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	959	sub $const,$const,#_bsaes_key_convert-.LM0
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	960	#endif
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	961	vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
				962
				963	vmov.i8 @XMM[8], #0x01 @ bit masks
				964	vmov.i8 @XMM[9], #0x02
				965	vmov.i8 @XMM[10], #0x04
				966	vmov.i8 @XMM[11], #0x08
				967	vmov.i8 @XMM[12], #0x10
				968	vmov.i8 @XMM[13], #0x20
				969	vldmia $const, {@XMM[14]} @ .LM0
				970
				971	#ifdef __ARMEL__
				972	vrev32.8 @XMM[7], @XMM[7]
				973	vrev32.8 @XMM[15], @XMM[15]
				974	#endif
				975	sub $rounds,$rounds,#1
				976	vstmia $out!, {@XMM[7]} @ save round 0 key
				977	b .Lkey_loop
				978
				979	.align 4
				980	.Lkey_loop:
				981	vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
				982	vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
				983	vmov.i8 @XMM[6], #0x40
				984	vmov.i8 @XMM[15], #0x80
				985
				986	vtst.8 @XMM[0], @XMM[7], @XMM[8]
				987	vtst.8 @XMM[1], @XMM[7], @XMM[9]
				988	vtst.8 @XMM[2], @XMM[7], @XMM[10]
				989	vtst.8 @XMM[3], @XMM[7], @XMM[11]
				990	vtst.8 @XMM[4], @XMM[7], @XMM[12]
				991	vtst.8 @XMM[5], @XMM[7], @XMM[13]
				992	vtst.8 @XMM[6], @XMM[7], @XMM[6]
				993	vtst.8 @XMM[7], @XMM[7], @XMM[15]
				994	vld1.8 {@XMM[15]}, [$inp]! @ load next round key
				995	vmvn @XMM[0], @XMM[0] @ "pnot"
				996	vmvn @XMM[1], @XMM[1]
				997	vmvn @XMM[5], @XMM[5]
				998	vmvn @XMM[6], @XMM[6]
				999	#ifdef __ARMEL__
				1000	vrev32.8 @XMM[15], @XMM[15]
				1001	#endif
				1002	subs $rounds,$rounds,#1
				1003	vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
				1004	bne .Lkey_loop
				1005
				1006	vmov.i8 @XMM[7],#0x63 @ compose .L63
				1007	@ don't save last round key
				1008	bx lr
				1009	.size _bsaes_key_convert,.-_bsaes_key_convert
				1010	___
				1011	}
				1012
				1013	if (0) { # following four functions are unsupported interface
				1014	# used for benchmarking...
					# The condition is the constant 0, so the template below is never
					# appended to $code. It would define bsaes_enc_key_convert,
					# bsaes_encrypt_128, bsaes_dec_key_convert and bsaes_decrypt_128, thin
					# wrappers that save r4-r6/lr and d8-d15, call _bsaes_key_convert,
					# _bsaes_encrypt8 or _bsaes_decrypt8, and restore. The two *_128
					# wrappers hard-code 10 rounds and handle 0x80 bytes per iteration.
					# NOTE(review): the template interpolates $inp, $out, $key and $len,
					# none of which is declared in a scope visible from this block --
					# presumably file-level definitions; confirm before re-enabling.
				1015	$code.=<<___;
				1016	.globl bsaes_enc_key_convert
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1017	.type bsaes_enc_key_convert,%function
				1018	.align 4
				1019	bsaes_enc_key_convert:
				1020	stmdb sp!,{r4-r6,lr}
				1021	vstmdb sp!,{d8-d15} @ ABI specification says so
				1022
				1023	ldr r5,[$inp,#240] @ pass rounds
				1024	mov r4,$inp @ pass key
				1025	mov r12,$out @ pass key schedule
				1026	bl _bsaes_key_convert
				1027	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
				1028	vstmia r12, {@XMM[7]} @ save last round key
				1029
				1030	vldmia sp!,{d8-d15}
				1031	ldmia sp!,{r4-r6,pc}
				1032	.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
				1033
				1034	.globl bsaes_encrypt_128
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1035	.type bsaes_encrypt_128,%function
				1036	.align 4
				1037	bsaes_encrypt_128:
				1038	stmdb sp!,{r4-r6,lr}
				1039	vstmdb sp!,{d8-d15} @ ABI specification says so
				1040	.Lenc128_loop:
				1041	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
				1042	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
				1043	mov r4,$key @ pass the key
				1044	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
				1045	mov r5,#10 @ pass rounds
				1046	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
				1047
				1048	bl _bsaes_encrypt8
				1049
				1050	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1051	vst1.8 {@XMM[4]}, [$out]!
				1052	vst1.8 {@XMM[6]}, [$out]!
				1053	vst1.8 {@XMM[3]}, [$out]!
				1054	vst1.8 {@XMM[7]}, [$out]!
				1055	vst1.8 {@XMM[2]}, [$out]!
				1056	subs $len,$len,#0x80
				1057	vst1.8 {@XMM[5]}, [$out]!
				1058	bhi .Lenc128_loop
				1059
				1060	vldmia sp!,{d8-d15}
				1061	ldmia sp!,{r4-r6,pc}
				1062	.size bsaes_encrypt_128,.-bsaes_encrypt_128
				1063
				1064	.globl bsaes_dec_key_convert
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1065	.type bsaes_dec_key_convert,%function
				1066	.align 4
				1067	bsaes_dec_key_convert:
				1068	stmdb sp!,{r4-r6,lr}
				1069	vstmdb sp!,{d8-d15} @ ABI specification says so
				1070
				1071	ldr r5,[$inp,#240] @ pass rounds
				1072	mov r4,$inp @ pass key
				1073	mov r12,$out @ pass key schedule
				1074	bl _bsaes_key_convert
				1075	vldmia $out, {@XMM[6]}
				1076	vstmia r12, {@XMM[15]} @ save last round key
				1077	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				1078	vstmia $out, {@XMM[7]}
				1079
				1080	vldmia sp!,{d8-d15}
				1081	ldmia sp!,{r4-r6,pc}
				1082	.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
				1083
				1084	.globl bsaes_decrypt_128
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1085	.type bsaes_decrypt_128,%function
				1086	.align 4
				1087	bsaes_decrypt_128:
				1088	stmdb sp!,{r4-r6,lr}
				1089	vstmdb sp!,{d8-d15} @ ABI specification says so
				1090	.Ldec128_loop:
				1091	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
				1092	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
				1093	mov r4,$key @ pass the key
				1094	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
				1095	mov r5,#10 @ pass rounds
				1096	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
				1097
				1098	bl _bsaes_decrypt8
				1099
				1100	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1101	vst1.8 {@XMM[6]}, [$out]!
				1102	vst1.8 {@XMM[4]}, [$out]!
				1103	vst1.8 {@XMM[2]}, [$out]!
				1104	vst1.8 {@XMM[7]}, [$out]!
				1105	vst1.8 {@XMM[3]}, [$out]!
				1106	subs $len,$len,#0x80
				1107	vst1.8 {@XMM[5]}, [$out]!
				1108	bhi .Ldec128_loop
				1109
				1110	vldmia sp!,{d8-d15}
				1111	ldmia sp!,{r4-r6,pc}
				1112	.size bsaes_decrypt_128,.-bsaes_decrypt_128
				1113	___
				1114	}
				1115	{
					# Generator scope for bsaes_cbc_encrypt.
					# Register map: $inp = r0, $out = r1, $len = r2, $key = r3,
					# $ivp = r8 (IV pointer, loaded from the first stack argument),
					# $fp = r9 (saved sp; also the 16-byte IV scratch slot),
					# $rounds = r10. $keysched is sp: in the stack-allocated variant the
					# bit-sliced key schedule starts at the stack pointer.
					#
					# Despite its name the emitted routine only implements the decrypt
					# direction (the template says the caller must pass enc == 0):
					#  - Unless __KERNEL__ is defined, inputs shorter than 128 bytes are
					#    handed to AES_cbc_encrypt by a tail branch.
					#  - $len is converted from bytes to 16-byte blocks.
					#  - Without BSAES_ASM_EXTENDED_KEY the schedule is built on the stack by
					#    _bsaes_key_convert and zeroed again at .Lcbc_dec_bzero. With it, the
					#    schedule lives at key+248 and the word at key+244 is tested so that
					#    a value of 1 skips the conversion.
					#  - .Lcbc_dec_loop decrypts eight blocks per pass with _bsaes_decrypt8,
					#    re-reading the ciphertext from $inp for the CBC XOR; the last
					#    ciphertext block is kept in @XMM[15] as the next IV.
					#  - .Lcbc_dec_loop_finish handles 1..7 remaining blocks; a single
					#    remaining block goes through AES_decrypt instead.
					#  - On exit the current IV is stored back through $ivp.
				1116	my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
				1117	my ($keysched)=("sp");
				1118
				1119	$code.=<<___;
				1120	.extern AES_cbc_encrypt
				1121	.extern AES_decrypt
				1122
				1123	.global bsaes_cbc_encrypt
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1124	.type bsaes_cbc_encrypt,%function
				1125	.align 5
				1126	bsaes_cbc_encrypt:
				1127	#ifndef __KERNEL__
				1128	cmp $len, #128
				1129	#ifndef __thumb__
				1130	blo AES_cbc_encrypt
				1131	#else
				1132	bhs 1f
				1133	b AES_cbc_encrypt
				1134	1:
				1135	#endif
				1136	#endif
				1137
				1138	@ it is up to the caller to make sure we are called with enc == 0
				1139
				1140	mov ip, sp
				1141	stmdb sp!, {r4-r10, lr}
				1142	VFP_ABI_PUSH
				1143	ldr $ivp, [ip] @ IV is 1st arg on the stack
				1144	mov $len, $len, lsr#4 @ len in 16 byte blocks
				1145	sub sp, #0x10 @ scratch space to carry over the IV
				1146	mov $fp, sp @ save sp
				1147
				1148	ldr $rounds, [$key, #240] @ get # of rounds
				1149	#ifndef BSAES_ASM_EXTENDED_KEY
				1150	@ allocate the key schedule on the stack
				1151	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				1152	add r12, #`128-32` @ sifze of bit-slices key schedule
				1153
				1154	@ populate the key schedule
				1155	mov r4, $key @ pass key
				1156	mov r5, $rounds @ pass # of rounds
				1157	mov sp, r12 @ sp is $keysched
				1158	bl _bsaes_key_convert
				1159	vldmia $keysched, {@XMM[6]}
				1160	vstmia r12, {@XMM[15]} @ save last round key
				1161	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				1162	vstmia $keysched, {@XMM[7]}
				1163	#else
				1164	ldr r12, [$key, #244]
				1165	eors r12, #1
				1166	beq 0f
				1167
				1168	@ populate the key schedule
				1169	str r12, [$key, #244]
				1170	mov r4, $key @ pass key
				1171	mov r5, $rounds @ pass # of rounds
				1172	add r12, $key, #248 @ pass key schedule
				1173	bl _bsaes_key_convert
				1174	add r4, $key, #248
				1175	vldmia r4, {@XMM[6]}
				1176	vstmia r12, {@XMM[15]} @ save last round key
				1177	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				1178	vstmia r4, {@XMM[7]}
				1179
				1180	.align 2
				1181	0:
				1182	#endif
				1183
				1184	vld1.8 {@XMM[15]}, [$ivp] @ load IV
				1185	b .Lcbc_dec_loop
				1186
				1187	.align 4
				1188	.Lcbc_dec_loop:
				1189	subs $len, $len, #0x8
				1190	bmi .Lcbc_dec_loop_finish
				1191
				1192	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
				1193	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
				1194	#ifndef BSAES_ASM_EXTENDED_KEY
				1195	mov r4, $keysched @ pass the key
				1196	#else
				1197	add r4, $key, #248
				1198	#endif
				1199	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
				1200	mov r5, $rounds
				1201	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
				1202	sub $inp, $inp, #0x60
				1203	vstmia $fp, {@XMM[15]} @ put aside IV
				1204
				1205	bl _bsaes_decrypt8
				1206
				1207	vldmia $fp, {@XMM[14]} @ reload IV
				1208	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1209	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1210	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1211	veor @XMM[1], @XMM[1], @XMM[8]
				1212	veor @XMM[6], @XMM[6], @XMM[9]
				1213	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
				1214	veor @XMM[4], @XMM[4], @XMM[10]
				1215	veor @XMM[2], @XMM[2], @XMM[11]
				1216	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
				1217	veor @XMM[7], @XMM[7], @XMM[12]
				1218	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1219	veor @XMM[3], @XMM[3], @XMM[13]
				1220	vst1.8 {@XMM[6]}, [$out]!
				1221	veor @XMM[5], @XMM[5], @XMM[14]
				1222	vst1.8 {@XMM[4]}, [$out]!
				1223	vst1.8 {@XMM[2]}, [$out]!
				1224	vst1.8 {@XMM[7]}, [$out]!
				1225	vst1.8 {@XMM[3]}, [$out]!
				1226	vst1.8 {@XMM[5]}, [$out]!
				1227
				1228	b .Lcbc_dec_loop
				1229
				1230	.Lcbc_dec_loop_finish:
				1231	adds $len, $len, #8
				1232	beq .Lcbc_dec_done
				1233
				1234	vld1.8 {@XMM[0]}, [$inp]! @ load input
				1235	cmp $len, #2
				1236	blo .Lcbc_dec_one
				1237	vld1.8 {@XMM[1]}, [$inp]!
				1238	#ifndef BSAES_ASM_EXTENDED_KEY
				1239	mov r4, $keysched @ pass the key
				1240	#else
				1241	add r4, $key, #248
				1242	#endif
				1243	mov r5, $rounds
				1244	vstmia $fp, {@XMM[15]} @ put aside IV
				1245	beq .Lcbc_dec_two
				1246	vld1.8 {@XMM[2]}, [$inp]!
				1247	cmp $len, #4
				1248	blo .Lcbc_dec_three
				1249	vld1.8 {@XMM[3]}, [$inp]!
				1250	beq .Lcbc_dec_four
				1251	vld1.8 {@XMM[4]}, [$inp]!
				1252	cmp $len, #6
				1253	blo .Lcbc_dec_five
				1254	vld1.8 {@XMM[5]}, [$inp]!
				1255	beq .Lcbc_dec_six
				1256	vld1.8 {@XMM[6]}, [$inp]!
				1257	sub $inp, $inp, #0x70
				1258
				1259	bl _bsaes_decrypt8
				1260
				1261	vldmia $fp, {@XMM[14]} @ reload IV
				1262	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1263	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1264	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1265	veor @XMM[1], @XMM[1], @XMM[8]
				1266	veor @XMM[6], @XMM[6], @XMM[9]
				1267	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
				1268	veor @XMM[4], @XMM[4], @XMM[10]
				1269	veor @XMM[2], @XMM[2], @XMM[11]
				1270	vld1.8 {@XMM[15]}, [$inp]!
				1271	veor @XMM[7], @XMM[7], @XMM[12]
				1272	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1273	veor @XMM[3], @XMM[3], @XMM[13]
				1274	vst1.8 {@XMM[6]}, [$out]!
				1275	vst1.8 {@XMM[4]}, [$out]!
				1276	vst1.8 {@XMM[2]}, [$out]!
				1277	vst1.8 {@XMM[7]}, [$out]!
				1278	vst1.8 {@XMM[3]}, [$out]!
				1279	b .Lcbc_dec_done
				1280	.align 4
				1281	.Lcbc_dec_six:
				1282	sub $inp, $inp, #0x60
				1283	bl _bsaes_decrypt8
				1284	vldmia $fp,{@XMM[14]} @ reload IV
				1285	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1286	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1287	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1288	veor @XMM[1], @XMM[1], @XMM[8]
				1289	veor @XMM[6], @XMM[6], @XMM[9]
				1290	vld1.8 {@XMM[12]}, [$inp]!
				1291	veor @XMM[4], @XMM[4], @XMM[10]
				1292	veor @XMM[2], @XMM[2], @XMM[11]
				1293	vld1.8 {@XMM[15]}, [$inp]!
				1294	veor @XMM[7], @XMM[7], @XMM[12]
				1295	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1296	vst1.8 {@XMM[6]}, [$out]!
				1297	vst1.8 {@XMM[4]}, [$out]!
				1298	vst1.8 {@XMM[2]}, [$out]!
				1299	vst1.8 {@XMM[7]}, [$out]!
				1300	b .Lcbc_dec_done
				1301	.align 4
				1302	.Lcbc_dec_five:
				1303	sub $inp, $inp, #0x50
				1304	bl _bsaes_decrypt8
				1305	vldmia $fp, {@XMM[14]} @ reload IV
				1306	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1307	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1308	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1309	veor @XMM[1], @XMM[1], @XMM[8]
				1310	veor @XMM[6], @XMM[6], @XMM[9]
				1311	vld1.8 {@XMM[15]}, [$inp]!
				1312	veor @XMM[4], @XMM[4], @XMM[10]
				1313	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1314	veor @XMM[2], @XMM[2], @XMM[11]
				1315	vst1.8 {@XMM[6]}, [$out]!
				1316	vst1.8 {@XMM[4]}, [$out]!
				1317	vst1.8 {@XMM[2]}, [$out]!
				1318	b .Lcbc_dec_done
				1319	.align 4
				1320	.Lcbc_dec_four:
				1321	sub $inp, $inp, #0x40
				1322	bl _bsaes_decrypt8
				1323	vldmia $fp, {@XMM[14]} @ reload IV
				1324	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1325	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1326	vld1.8 {@XMM[10]}, [$inp]!
				1327	veor @XMM[1], @XMM[1], @XMM[8]
				1328	veor @XMM[6], @XMM[6], @XMM[9]
				1329	vld1.8 {@XMM[15]}, [$inp]!
				1330	veor @XMM[4], @XMM[4], @XMM[10]
				1331	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1332	vst1.8 {@XMM[6]}, [$out]!
				1333	vst1.8 {@XMM[4]}, [$out]!
				1334	b .Lcbc_dec_done
				1335	.align 4
				1336	.Lcbc_dec_three:
				1337	sub $inp, $inp, #0x30
				1338	bl _bsaes_decrypt8
				1339	vldmia $fp, {@XMM[14]} @ reload IV
				1340	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
				1341	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1342	vld1.8 {@XMM[15]}, [$inp]!
				1343	veor @XMM[1], @XMM[1], @XMM[8]
				1344	veor @XMM[6], @XMM[6], @XMM[9]
				1345	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1346	vst1.8 {@XMM[6]}, [$out]!
				1347	b .Lcbc_dec_done
				1348	.align 4
				1349	.Lcbc_dec_two:
				1350	sub $inp, $inp, #0x20
				1351	bl _bsaes_decrypt8
				1352	vldmia $fp, {@XMM[14]} @ reload IV
				1353	vld1.8 {@XMM[8]}, [$inp]! @ reload input
				1354	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
				1355	vld1.8 {@XMM[15]}, [$inp]! @ reload input
				1356	veor @XMM[1], @XMM[1], @XMM[8]
				1357	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1358	b .Lcbc_dec_done
				1359	.align 4
				1360	.Lcbc_dec_one:
				1361	sub $inp, $inp, #0x10
				1362	mov $rounds, $out @ save original out pointer
				1363	mov $out, $fp @ use the iv scratch space as out buffer
				1364	mov r2, $key
				1365	vmov @XMM[4],@XMM[15] @ just in case ensure that IV
				1366	vmov @XMM[5],@XMM[0] @ and input are preserved
				1367	bl AES_decrypt
Robert Sloan	6f79a50	2017-04-03 09:16:40 -0700	[diff] [blame]	1368	vld1.8 {@XMM[0]}, [$fp] @ load result
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1369	veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
				1370	vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
				1371	vst1.8 {@XMM[0]}, [$rounds] @ write output
				1372
				1373	.Lcbc_dec_done:
				1374	#ifndef BSAES_ASM_EXTENDED_KEY
				1375	vmov.i32 q0, #0
				1376	vmov.i32 q1, #0
				1377	.Lcbc_dec_bzero: @ wipe key schedule [if any]
				1378	vstmia $keysched!, {q0-q1}
				1379	cmp $keysched, $fp
				1380	bne .Lcbc_dec_bzero
				1381	#endif
				1382
				1383	mov sp, $fp
				1384	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
				1385	vst1.8 {@XMM[15]}, [$ivp] @ return IV
				1386	VFP_ABI_POP
				1387	ldmia sp!, {r4-r10, pc}
				1388	.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
				1389	___
				1390	}
				1391	{
					# Generator scope for bsaes_ctr32_encrypt_blocks.
					# Register map: $inp = r0, $out = r1, $len = r2 (counted in blocks: it
					# is compared with and decremented by 8 per eight-block pass),
					# $key = r3, $ctr = r8 (counter pointer from the first stack argument,
					# later reused to hold the address of .LREVM0SR), $fp = r9 (saved sp and
					# 16-byte counter scratch slot), $rounds = r10.
					#
					# Outline of the emitted routine:
					#  - Fewer than 8 blocks: .Lctr_enc_short copies the counter to the
					#    stack and calls AES_encrypt once per block, incrementing the last
					#    32-bit counter word between calls (byte-reversed under __ARMEL__).
					#  - Otherwise the key schedule is produced by _bsaes_key_convert (on the
					#    stack, or at key+248 under BSAES_ASM_EXTENDED_KEY). The round 0 key
					#    and the counter are byte-reversed per word with vrev32.8 so the
					#    counter can be advanced with vadd.u32.
					#  - .Lctr_enc_loop builds eight consecutive counter values, enters
					#    _bsaes_encrypt8_alt with .LREVM0SR as the initial permutation, and
					#    XORs the result with the input. .Lctr_enc_loop_done handles a final
					#    partial group of 1..7 blocks.
					#  - The stack key schedule (or the adjusted round 0 key slot) is zeroed
					#    before returning.
					# No store through the caller's counter pointer appears in this template.
				1392	my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
				1393	my $const = "r6"; # shared with _bsaes_encrypt8_alt
				1394	my $keysched = "sp";
				1395
				1396	$code.=<<___;
				1397	.extern AES_encrypt
				1398	.global bsaes_ctr32_encrypt_blocks
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1399	.type bsaes_ctr32_encrypt_blocks,%function
				1400	.align 5
				1401	bsaes_ctr32_encrypt_blocks:
				1402	cmp $len, #8 @ use plain AES for
				1403	blo .Lctr_enc_short @ small sizes
				1404
				1405	mov ip, sp
				1406	stmdb sp!, {r4-r10, lr}
				1407	VFP_ABI_PUSH
				1408	ldr $ctr, [ip] @ ctr is 1st arg on the stack
				1409	sub sp, sp, #0x10 @ scratch space to carry over the ctr
				1410	mov $fp, sp @ save sp
				1411
				1412	ldr $rounds, [$key, #240] @ get # of rounds
				1413	#ifndef BSAES_ASM_EXTENDED_KEY
				1414	@ allocate the key schedule on the stack
				1415	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				1416	add r12, #`128-32` @ size of bit-sliced key schedule
				1417
				1418	@ populate the key schedule
				1419	mov r4, $key @ pass key
				1420	mov r5, $rounds @ pass # of rounds
				1421	mov sp, r12 @ sp is $keysched
				1422	bl _bsaes_key_convert
				1423	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
				1424	vstmia r12, {@XMM[7]} @ save last round key
				1425
				1426	vld1.8 {@XMM[0]}, [$ctr] @ load counter
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1427	#ifdef __APPLE__
Adam Langley	f4e4272	2015-06-04 17:45:09 -0700	[diff] [blame]	1428	mov $ctr, #:lower16:(.LREVM0SR-.LM0)
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1429	add $ctr, $const, $ctr
				1430	#else
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1431	add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1432	#endif
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1433	vldmia $keysched, {@XMM[4]} @ load round0 key
				1434	#else
				1435	ldr r12, [$key, #244]
				1436	eors r12, #1
				1437	beq 0f
				1438
				1439	@ populate the key schedule
				1440	str r12, [$key, #244]
				1441	mov r4, $key @ pass key
				1442	mov r5, $rounds @ pass # of rounds
				1443	add r12, $key, #248 @ pass key schedule
				1444	bl _bsaes_key_convert
				1445	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
				1446	vstmia r12, {@XMM[7]} @ save last round key
				1447
				1448	.align 2
				1449	0: add r12, $key, #248
				1450	vld1.8 {@XMM[0]}, [$ctr] @ load counter
				1451	adrl $ctr, .LREVM0SR @ borrow $ctr
				1452	vldmia r12, {@XMM[4]} @ load round0 key
				1453	sub sp, #0x10 @ place for adjusted round0 key
				1454	#endif
				1455
				1456	vmov.i32 @XMM[8],#1 @ compose 1<<96
				1457	veor @XMM[9],@XMM[9],@XMM[9]
				1458	vrev32.8 @XMM[0],@XMM[0]
				1459	vext.8 @XMM[8],@XMM[9],@XMM[8],#4
				1460	vrev32.8 @XMM[4],@XMM[4]
				1461	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
				1462	vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
				1463	b .Lctr_enc_loop
				1464
				1465	.align 4
				1466	.Lctr_enc_loop:
				1467	vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
				1468	vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
				1469	vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
				1470	vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
				1471	vadd.u32 @XMM[4], @XMM[1], @XMM[10]
				1472	vadd.u32 @XMM[5], @XMM[2], @XMM[10]
				1473	vadd.u32 @XMM[6], @XMM[3], @XMM[10]
				1474	vadd.u32 @XMM[7], @XMM[4], @XMM[10]
				1475	vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
				1476
				1477	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
				1478	@ to flip byte order in 32-bit counter
				1479
				1480	vldmia $keysched, {@XMM[9]} @ load round0 key
				1481	#ifndef BSAES_ASM_EXTENDED_KEY
				1482	add r4, $keysched, #0x10 @ pass next round key
				1483	#else
				1484	add r4, $key, #`248+16`
				1485	#endif
				1486	vldmia $ctr, {@XMM[8]} @ .LREVM0SR
				1487	mov r5, $rounds @ pass rounds
				1488	vstmia $fp, {@XMM[10]} @ save next counter
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1489	#ifdef __APPLE__
Adam Langley	f4e4272	2015-06-04 17:45:09 -0700	[diff] [blame]	1490	mov $const, #:lower16:(.LREVM0SR-.LSR)
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1491	sub $const, $ctr, $const
				1492	#else
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1493	sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1494	#endif
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1495
				1496	bl _bsaes_encrypt8_alt
				1497
				1498	subs $len, $len, #8
				1499	blo .Lctr_enc_loop_done
				1500
				1501	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
				1502	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
				1503	veor @XMM[0], @XMM[8]
				1504	veor @XMM[1], @XMM[9]
				1505	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
				1506	veor @XMM[4], @XMM[10]
				1507	veor @XMM[6], @XMM[11]
				1508	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
				1509	veor @XMM[3], @XMM[12]
				1510	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
				1511	veor @XMM[7], @XMM[13]
				1512	veor @XMM[2], @XMM[14]
				1513	vst1.8 {@XMM[4]}, [$out]!
				1514	veor @XMM[5], @XMM[15]
				1515	vst1.8 {@XMM[6]}, [$out]!
				1516	vmov.i32 @XMM[8], #1 @ compose 1<<96
				1517	vst1.8 {@XMM[3]}, [$out]!
				1518	veor @XMM[9], @XMM[9], @XMM[9]
				1519	vst1.8 {@XMM[7]}, [$out]!
				1520	vext.8 @XMM[8], @XMM[9], @XMM[8], #4
				1521	vst1.8 {@XMM[2]}, [$out]!
				1522	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
				1523	vst1.8 {@XMM[5]}, [$out]!
				1524	vldmia $fp, {@XMM[0]} @ load counter
				1525
				1526	bne .Lctr_enc_loop
				1527	b .Lctr_enc_done
				1528
				1529	.align 4
				1530	.Lctr_enc_loop_done:
				1531	add $len, $len, #8
				1532	vld1.8 {@XMM[8]}, [$inp]! @ load input
				1533	veor @XMM[0], @XMM[8]
				1534	vst1.8 {@XMM[0]}, [$out]! @ write output
				1535	cmp $len, #2
				1536	blo .Lctr_enc_done
				1537	vld1.8 {@XMM[9]}, [$inp]!
				1538	veor @XMM[1], @XMM[9]
				1539	vst1.8 {@XMM[1]}, [$out]!
				1540	beq .Lctr_enc_done
				1541	vld1.8 {@XMM[10]}, [$inp]!
				1542	veor @XMM[4], @XMM[10]
				1543	vst1.8 {@XMM[4]}, [$out]!
				1544	cmp $len, #4
				1545	blo .Lctr_enc_done
				1546	vld1.8 {@XMM[11]}, [$inp]!
				1547	veor @XMM[6], @XMM[11]
				1548	vst1.8 {@XMM[6]}, [$out]!
				1549	beq .Lctr_enc_done
				1550	vld1.8 {@XMM[12]}, [$inp]!
				1551	veor @XMM[3], @XMM[12]
				1552	vst1.8 {@XMM[3]}, [$out]!
				1553	cmp $len, #6
				1554	blo .Lctr_enc_done
				1555	vld1.8 {@XMM[13]}, [$inp]!
				1556	veor @XMM[7], @XMM[13]
				1557	vst1.8 {@XMM[7]}, [$out]!
				1558	beq .Lctr_enc_done
				1559	vld1.8 {@XMM[14]}, [$inp]
				1560	veor @XMM[2], @XMM[14]
				1561	vst1.8 {@XMM[2]}, [$out]!
				1562
				1563	.Lctr_enc_done:
				1564	vmov.i32 q0, #0
				1565	vmov.i32 q1, #0
				1566	#ifndef BSAES_ASM_EXTENDED_KEY
				1567	.Lctr_enc_bzero: @ wipe key schedule [if any]
				1568	vstmia $keysched!, {q0-q1}
				1569	cmp $keysched, $fp
				1570	bne .Lctr_enc_bzero
				1571	#else
				1572	vstmia $keysched, {q0-q1}
				1573	#endif
				1574
				1575	mov sp, $fp
				1576	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
				1577	VFP_ABI_POP
				1578	ldmia sp!, {r4-r10, pc} @ return
				1579
				1580	.align 4
				1581	.Lctr_enc_short:
				1582	ldr ip, [sp] @ ctr pointer is passed on stack
				1583	stmdb sp!, {r4-r8, lr}
				1584
				1585	mov r4, $inp @ copy arguments
				1586	mov r5, $out
				1587	mov r6, $len
				1588	mov r7, $key
				1589	ldr r8, [ip, #12] @ load counter LSW
				1590	vld1.8 {@XMM[1]}, [ip] @ load whole counter value
				1591	#ifdef __ARMEL__
				1592	rev r8, r8
				1593	#endif
				1594	sub sp, sp, #0x10
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1595	vst1.8 {@XMM[1]}, [sp] @ copy counter value
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1596	sub sp, sp, #0x10
				1597
				1598	.Lctr_enc_short_loop:
				1599	add r0, sp, #0x10 @ input counter value
				1600	mov r1, sp @ output on the stack
				1601	mov r2, r7 @ key
				1602
				1603	bl AES_encrypt
				1604
				1605	vld1.8 {@XMM[0]}, [r4]! @ load input
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	1606	vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1607	add r8, r8, #1
				1608	#ifdef __ARMEL__
				1609	rev r0, r8
				1610	str r0, [sp, #0x1c] @ next counter value
				1611	#else
				1612	str r8, [sp, #0x1c] @ next counter value
				1613	#endif
				1614	veor @XMM[0],@XMM[0],@XMM[1]
				1615	vst1.8 {@XMM[0]}, [r5]! @ store output
				1616	subs r6, r6, #1
				1617	bne .Lctr_enc_short_loop
				1618
				1619	vmov.i32 q0, #0
				1620	vmov.i32 q1, #0
				1621	vstmia sp!, {q0-q1}
				1622
				1623	ldmia sp!, {r4-r8, pc}
				1624	.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
				1625	___
				1626	}
				1627	{
				1628	######################################################################
				1629	# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
				1630	# const AES_KEY *key1, const AES_KEY *key2,
				1631	# const unsigned char iv[16]);
				1632	#
				1633	my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
				1634	my $const="r6"; # returned by _bsaes_key_convert
				1635	my $twmask=@XMM[5];
				1636	my @T=@XMM[6..7];
				1637
				1638	$code.=<<___;
				1639	.globl bsaes_xts_encrypt
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1640	.type bsaes_xts_encrypt,%function
				1641	.align 4
				1642	bsaes_xts_encrypt:
				1643	mov ip, sp
				1644	stmdb sp!, {r4-r10, lr} @ 0x20
				1645	VFP_ABI_PUSH
				1646	mov r6, sp @ future $fp
				1647
				1648	mov $inp, r0
				1649	mov $out, r1
				1650	mov $len, r2
				1651	mov $key, r3
				1652
				1653	sub r0, sp, #0x10 @ 0x10
				1654	bic r0, #0xf @ align at 16 bytes
				1655	mov sp, r0
				1656
				1657	#ifdef XTS_CHAIN_TWEAK
				1658	ldr r0, [ip] @ pointer to input tweak
				1659	#else
				1660	@ generate initial tweak
				1661	ldr r0, [ip, #4] @ iv[]
				1662	mov r1, sp
				1663	ldr r2, [ip, #0] @ key2
				1664	bl AES_encrypt
				1665	mov r0,sp @ pointer to initial tweak
				1666	#endif
				1667
				1668	ldr $rounds, [$key, #240] @ get # of rounds
				1669	mov $fp, r6
				1670	#ifndef BSAES_ASM_EXTENDED_KEY
				1671	@ allocate the key schedule on the stack
				1672	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				1673	@ add r12, #`128-32` @ size of bit-sliced key schedule
				1674	sub r12, #`32+16` @ place for tweak[9]
				1675
				1676	@ populate the key schedule
				1677	mov r4, $key @ pass key
				1678	mov r5, $rounds @ pass # of rounds
				1679	mov sp, r12
				1680	add r12, #0x90 @ pass key schedule
				1681	bl _bsaes_key_convert
				1682	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
				1683	vstmia r12, {@XMM[7]} @ save last round key
				1684	#else
				1685	ldr r12, [$key, #244]
				1686	eors r12, #1
				1687	beq 0f
				1688
				1689	str r12, [$key, #244]
				1690	mov r4, $key @ pass key
				1691	mov r5, $rounds @ pass # of rounds
				1692	add r12, $key, #248 @ pass key schedule
				1693	bl _bsaes_key_convert
				1694	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
				1695	vstmia r12, {@XMM[7]}
				1696
				1697	.align 2
				1698	0: sub sp, #0x90 @ place for tweak[9]
				1699	#endif
				1700
				1701	vld1.8 {@XMM[8]}, [r0] @ initial tweak
				1702	adr $magic, .Lxts_magic
				1703
				1704	subs $len, #0x80
				1705	blo .Lxts_enc_short
				1706	b .Lxts_enc_loop
				1707
				1708	.align 4
				1709	.Lxts_enc_loop:
				1710	vldmia $magic, {$twmask} @ load XTS magic
				1711	vshr.s64 @T[0], @XMM[8], #63
				1712	mov r0, sp
				1713	vand @T[0], @T[0], $twmask
				1714	___
				1715	for($i=9;$i<16;$i++) {
				1716	$code.=<<___;
				1717	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
				1718	vst1.64 {@XMM[$i-1]}, [r0,:128]!
				1719	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				1720	vshr.s64 @T[1], @XMM[$i], #63
				1721	veor @XMM[$i], @XMM[$i], @T[0]
				1722	vand @T[1], @T[1], $twmask
				1723	___
				1724	@T=reverse(@T);
				1725
				1726	$code.=<<___ if ($i>=10);
				1727	vld1.8 {@XMM[$i-10]}, [$inp]!
				1728	___
				1729	$code.=<<___ if ($i>=11);
				1730	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
				1731	___
				1732	}
				1733	$code.=<<___;
				1734	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
				1735	vst1.64 {@XMM[15]}, [r0,:128]!
				1736	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				1737	veor @XMM[8], @XMM[8], @T[0]
				1738	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1739
				1740	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
				1741	veor @XMM[5], @XMM[5], @XMM[13]
				1742	#ifndef BSAES_ASM_EXTENDED_KEY
				1743	add r4, sp, #0x90 @ pass key schedule
				1744	#else
				1745	add r4, $key, #248 @ pass key schedule
				1746	#endif
				1747	veor @XMM[6], @XMM[6], @XMM[14]
				1748	mov r5, $rounds @ pass rounds
				1749	veor @XMM[7], @XMM[7], @XMM[15]
				1750	mov r0, sp
				1751
				1752	bl _bsaes_encrypt8
				1753
				1754	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1755	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1756	veor @XMM[0], @XMM[0], @XMM[ 8]
				1757	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				1758	veor @XMM[1], @XMM[1], @XMM[ 9]
				1759	veor @XMM[8], @XMM[4], @XMM[10]
				1760	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1761	veor @XMM[9], @XMM[6], @XMM[11]
				1762	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
				1763	veor @XMM[10], @XMM[3], @XMM[12]
				1764	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1765	veor @XMM[11], @XMM[7], @XMM[13]
				1766	veor @XMM[12], @XMM[2], @XMM[14]
				1767	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				1768	veor @XMM[13], @XMM[5], @XMM[15]
				1769	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
				1770
				1771	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1772
				1773	subs $len, #0x80
				1774	bpl .Lxts_enc_loop
				1775
				1776	.Lxts_enc_short:
				1777	adds $len, #0x70
				1778	bmi .Lxts_enc_done
				1779
				1780	vldmia $magic, {$twmask} @ load XTS magic
				1781	vshr.s64 @T[0], @XMM[8], #63
				1782	mov r0, sp
				1783	vand @T[0], @T[0], $twmask
				1784	___
				1785	for($i=9;$i<16;$i++) { # unrolled at generation time: one tweak step is emitted for each $i in 9..15
				1786	$code.=<<___; # emit: @XMM[$i] = @XMM[$i-1] added to itself per 64-bit lane, store @XMM[$i-1] through r0, XOR in the half-swapped mask @T[0], prepare the next mask in @T[1]
				1787	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
				1788	vst1.64 {@XMM[$i-1]}, [r0,:128]!
				1789	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				1790	vshr.s64 @T[1], @XMM[$i], #63
				1791	veor @XMM[$i], @XMM[$i], @T[0]
				1792	vand @T[1], @T[1], $twmask
				1793	___
				1794	@T=reverse(@T); # reverse @T so the next step reads as @T[0] what this step wrote as @T[1] (assumes @T holds two registers; confirm at its declaration)
				1795
				1796	$code.=<<___ if ($i>=10); # from $i=10 on: load one more input block, take 0x10 off $len and branch to .Lxts_enc_N (N=$i-9) when the result is negative
				1797	vld1.8 {@XMM[$i-10]}, [$inp]!
				1798	subs $len, #0x10
				1799	bmi .Lxts_enc_`$i-9`
				1800	___
				1801	$code.=<<___ if ($i>=11); # from $i=11 on: XOR an already loaded block @XMM[$i-11] with @XMM[$i-3]; emitted after the branch, so it runs only when the branch is not taken
				1802	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
				1803	___
				1804	}
				1805	$code.=<<___;
				1806	sub $len, #0x10
				1807	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
				1808
				1809	vld1.8 {@XMM[6]}, [$inp]!
				1810	veor @XMM[5], @XMM[5], @XMM[13]
				1811	#ifndef BSAES_ASM_EXTENDED_KEY
				1812	add r4, sp, #0x90 @ pass key schedule
				1813	#else
				1814	add r4, $key, #248 @ pass key schedule
				1815	#endif
				1816	veor @XMM[6], @XMM[6], @XMM[14]
				1817	mov r5, $rounds @ pass rounds
				1818	mov r0, sp
				1819
				1820	bl _bsaes_encrypt8
				1821
				1822	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1823	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1824	veor @XMM[0], @XMM[0], @XMM[ 8]
				1825	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				1826	veor @XMM[1], @XMM[1], @XMM[ 9]
				1827	veor @XMM[8], @XMM[4], @XMM[10]
				1828	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1829	veor @XMM[9], @XMM[6], @XMM[11]
				1830	vld1.64 {@XMM[14]}, [r0,:128]!
				1831	veor @XMM[10], @XMM[3], @XMM[12]
				1832	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1833	veor @XMM[11], @XMM[7], @XMM[13]
				1834	veor @XMM[12], @XMM[2], @XMM[14]
				1835	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				1836	vst1.8 {@XMM[12]}, [$out]!
				1837
				1838	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1839	b .Lxts_enc_done
				1840	.align 4
				1841	.Lxts_enc_6:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1842	veor @XMM[4], @XMM[4], @XMM[12]
				1843	#ifndef BSAES_ASM_EXTENDED_KEY
				1844	add r4, sp, #0x90 @ pass key schedule
				1845	#else
				1846	add r4, $key, #248 @ pass key schedule
				1847	#endif
				1848	veor @XMM[5], @XMM[5], @XMM[13]
				1849	mov r5, $rounds @ pass rounds
				1850	mov r0, sp
				1851
				1852	bl _bsaes_encrypt8
				1853
				1854	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1855	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1856	veor @XMM[0], @XMM[0], @XMM[ 8]
				1857	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				1858	veor @XMM[1], @XMM[1], @XMM[ 9]
				1859	veor @XMM[8], @XMM[4], @XMM[10]
				1860	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1861	veor @XMM[9], @XMM[6], @XMM[11]
				1862	veor @XMM[10], @XMM[3], @XMM[12]
				1863	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1864	veor @XMM[11], @XMM[7], @XMM[13]
				1865	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				1866
				1867	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1868	b .Lxts_enc_done
				1869
				1870	@ put this in range for both ARM and Thumb mode adr instructions
				1871	.align 5
				1872	.Lxts_magic:
				1873	.quad 1, 0x87
				1874
				1875	.align 5
				1876	.Lxts_enc_5:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1877	veor @XMM[3], @XMM[3], @XMM[11]
				1878	#ifndef BSAES_ASM_EXTENDED_KEY
				1879	add r4, sp, #0x90 @ pass key schedule
				1880	#else
				1881	add r4, $key, #248 @ pass key schedule
				1882	#endif
				1883	veor @XMM[4], @XMM[4], @XMM[12]
				1884	mov r5, $rounds @ pass rounds
				1885	mov r0, sp
				1886
				1887	bl _bsaes_encrypt8
				1888
				1889	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1890	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1891	veor @XMM[0], @XMM[0], @XMM[ 8]
				1892	vld1.64 {@XMM[12]}, [r0,:128]!
				1893	veor @XMM[1], @XMM[1], @XMM[ 9]
				1894	veor @XMM[8], @XMM[4], @XMM[10]
				1895	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1896	veor @XMM[9], @XMM[6], @XMM[11]
				1897	veor @XMM[10], @XMM[3], @XMM[12]
				1898	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1899	vst1.8 {@XMM[10]}, [$out]!
				1900
				1901	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1902	b .Lxts_enc_done
				1903	.align 4
				1904	.Lxts_enc_4:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1905	veor @XMM[2], @XMM[2], @XMM[10]
				1906	#ifndef BSAES_ASM_EXTENDED_KEY
				1907	add r4, sp, #0x90 @ pass key schedule
				1908	#else
				1909	add r4, $key, #248 @ pass key schedule
				1910	#endif
				1911	veor @XMM[3], @XMM[3], @XMM[11]
				1912	mov r5, $rounds @ pass rounds
				1913	mov r0, sp
				1914
				1915	bl _bsaes_encrypt8
				1916
				1917	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				1918	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				1919	veor @XMM[0], @XMM[0], @XMM[ 8]
				1920	veor @XMM[1], @XMM[1], @XMM[ 9]
				1921	veor @XMM[8], @XMM[4], @XMM[10]
				1922	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1923	veor @XMM[9], @XMM[6], @XMM[11]
				1924	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				1925
				1926	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1927	b .Lxts_enc_done
				1928	.align 4
				1929	.Lxts_enc_3:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1930	veor @XMM[1], @XMM[1], @XMM[9]
				1931	#ifndef BSAES_ASM_EXTENDED_KEY
				1932	add r4, sp, #0x90 @ pass key schedule
				1933	#else
				1934	add r4, $key, #248 @ pass key schedule
				1935	#endif
				1936	veor @XMM[2], @XMM[2], @XMM[10]
				1937	mov r5, $rounds @ pass rounds
				1938	mov r0, sp
				1939
				1940	bl _bsaes_encrypt8
				1941
				1942	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				1943	vld1.64 {@XMM[10]}, [r0,:128]!
				1944	veor @XMM[0], @XMM[0], @XMM[ 8]
				1945	veor @XMM[1], @XMM[1], @XMM[ 9]
				1946	veor @XMM[8], @XMM[4], @XMM[10]
				1947	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1948	vst1.8 {@XMM[8]}, [$out]!
				1949
				1950	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1951	b .Lxts_enc_done
				1952	.align 4
				1953	.Lxts_enc_2:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1954	veor @XMM[0], @XMM[0], @XMM[8]
				1955	#ifndef BSAES_ASM_EXTENDED_KEY
				1956	add r4, sp, #0x90 @ pass key schedule
				1957	#else
				1958	add r4, $key, #248 @ pass key schedule
				1959	#endif
				1960	veor @XMM[1], @XMM[1], @XMM[9]
				1961	mov r5, $rounds @ pass rounds
				1962	mov r0, sp
				1963
				1964	bl _bsaes_encrypt8
				1965
				1966	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				1967	veor @XMM[0], @XMM[0], @XMM[ 8]
				1968	veor @XMM[1], @XMM[1], @XMM[ 9]
				1969	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				1970
				1971	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				1972	b .Lxts_enc_done
				1973	.align 4
				1974	.Lxts_enc_1:
				1975	mov r0, sp
David Benjamin	c895d6b	2016-08-11 13:26:41 -0400	[diff] [blame]	1976	veor @XMM[0], @XMM[0], @XMM[8]
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1977	mov r1, sp
				1978	vst1.8 {@XMM[0]}, [sp,:128]
				1979	mov r2, $key
				1980	mov r4, $fp @ preserve fp
				1981
				1982	bl AES_encrypt
				1983
				1984	vld1.8 {@XMM[0]}, [sp,:128]
				1985	veor @XMM[0], @XMM[0], @XMM[8]
				1986	vst1.8 {@XMM[0]}, [$out]!
				1987	mov $fp, r4
				1988
				1989	vmov @XMM[8], @XMM[9] @ next round tweak
				1990
				1991	.Lxts_enc_done:
				1992	#ifndef XTS_CHAIN_TWEAK
				1993	adds $len, #0x10
				1994	beq .Lxts_enc_ret
				1995	sub r6, $out, #0x10
				1996
				1997	.Lxts_enc_steal:
				1998	ldrb r0, [$inp], #1
				1999	ldrb r1, [$out, #-0x10]
				2000	strb r0, [$out, #-0x10]
				2001	strb r1, [$out], #1
				2002
				2003	subs $len, #1
				2004	bhi .Lxts_enc_steal
				2005
				2006	vld1.8 {@XMM[0]}, [r6]
				2007	mov r0, sp
				2008	veor @XMM[0], @XMM[0], @XMM[8]
				2009	mov r1, sp
				2010	vst1.8 {@XMM[0]}, [sp,:128]
				2011	mov r2, $key
				2012	mov r4, $fp @ preserve fp
				2013
				2014	bl AES_encrypt
				2015
				2016	vld1.8 {@XMM[0]}, [sp,:128]
				2017	veor @XMM[0], @XMM[0], @XMM[8]
				2018	vst1.8 {@XMM[0]}, [r6]
				2019	mov $fp, r4
				2020	#endif
				2021
				2022	.Lxts_enc_ret:
				2023	bic r0, $fp, #0xf
				2024	vmov.i32 q0, #0
				2025	vmov.i32 q1, #0
				2026	#ifdef XTS_CHAIN_TWEAK
				2027	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
				2028	#endif
				2029	.Lxts_enc_bzero: @ wipe key schedule [if any]
				2030	vstmia sp!, {q0-q1}
				2031	cmp sp, r0
				2032	bne .Lxts_enc_bzero
				2033
				2034	mov sp, $fp
				2035	#ifdef XTS_CHAIN_TWEAK
				2036	vst1.8 {@XMM[8]}, [r1]
				2037	#endif
				2038	VFP_ABI_POP
				2039	ldmia sp!, {r4-r10, pc} @ return
				2040
				2041	.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
				2042
				2043	.globl bsaes_xts_decrypt
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2044	.type bsaes_xts_decrypt,%function
				2045	.align 4
				2046	bsaes_xts_decrypt:
				2047	mov ip, sp
				2048	stmdb sp!, {r4-r10, lr} @ 0x20
				2049	VFP_ABI_PUSH
				2050	mov r6, sp @ future $fp
				2051
				2052	mov $inp, r0
				2053	mov $out, r1
				2054	mov $len, r2
				2055	mov $key, r3
				2056
				2057	sub r0, sp, #0x10 @ 0x10
				2058	bic r0, #0xf @ align at 16 bytes
				2059	mov sp, r0
				2060
				2061	#ifdef XTS_CHAIN_TWEAK
				2062	ldr r0, [ip] @ pointer to input tweak
				2063	#else
				2064	@ generate initial tweak
				2065	ldr r0, [ip, #4] @ iv[]
				2066	mov r1, sp
				2067	ldr r2, [ip, #0] @ key2
				2068	bl AES_encrypt
				2069	mov r0, sp @ pointer to initial tweak
				2070	#endif
				2071
				2072	ldr $rounds, [$key, #240] @ get # of rounds
				2073	mov $fp, r6
				2074	#ifndef BSAES_ASM_EXTENDED_KEY
				2075	@ allocate the key schedule on the stack
				2076	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
				2077	@ add r12, #`128-32` @ size of bit-sliced key schedule
				2078	sub r12, #`32+16` @ place for tweak[9]
				2079
				2080	@ populate the key schedule
				2081	mov r4, $key @ pass key
				2082	mov r5, $rounds @ pass # of rounds
				2083	mov sp, r12
				2084	add r12, #0x90 @ pass key schedule
				2085	bl _bsaes_key_convert
				2086	add r4, sp, #0x90
				2087	vldmia r4, {@XMM[6]}
				2088	vstmia r12, {@XMM[15]} @ save last round key
				2089	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				2090	vstmia r4, {@XMM[7]}
				2091	#else
				2092	ldr r12, [$key, #244]
				2093	eors r12, #1
				2094	beq 0f
				2095
				2096	str r12, [$key, #244]
				2097	mov r4, $key @ pass key
				2098	mov r5, $rounds @ pass # of rounds
				2099	add r12, $key, #248 @ pass key schedule
				2100	bl _bsaes_key_convert
				2101	add r4, $key, #248
				2102	vldmia r4, {@XMM[6]}
				2103	vstmia r12, {@XMM[15]} @ save last round key
				2104	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
				2105	vstmia r4, {@XMM[7]}
				2106
				2107	.align 2
				2108	0: sub sp, #0x90 @ place for tweak[9]
				2109	#endif
				2110	vld1.8 {@XMM[8]}, [r0] @ initial tweak
				2111	adr $magic, .Lxts_magic
				2112
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	2113	#ifndef XTS_CHAIN_TWEAK
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2114	tst $len, #0xf @ if not multiple of 16
				2115	it ne @ Thumb2 thing, sanity check in ARM
				2116	subne $len, #0x10 @ subtract another 16 bytes
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	2117	#endif
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2118	subs $len, #0x80
				2119
				2120	blo .Lxts_dec_short
				2121	b .Lxts_dec_loop
				2122
				2123	.align 4
				2124	.Lxts_dec_loop:
				2125	vldmia $magic, {$twmask} @ load XTS magic
				2126	vshr.s64 @T[0], @XMM[8], #63
				2127	mov r0, sp
				2128	vand @T[0], @T[0], $twmask
				2129	___
				2130	for($i=9;$i<16;$i++) { # unrolled at generation time: one tweak step is emitted for each $i in 9..15
				2131	$code.=<<___; # emit: @XMM[$i] = @XMM[$i-1] added to itself per 64-bit lane, store @XMM[$i-1] through r0, XOR in the half-swapped mask @T[0], prepare the next mask in @T[1]
				2132	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
				2133	vst1.64 {@XMM[$i-1]}, [r0,:128]!
				2134	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				2135	vshr.s64 @T[1], @XMM[$i], #63
				2136	veor @XMM[$i], @XMM[$i], @T[0]
				2137	vand @T[1], @T[1], $twmask
				2138	___
				2139	@T=reverse(@T); # reverse @T so the next step reads as @T[0] what this step wrote as @T[1] (assumes @T holds two registers; confirm at its declaration)
				2140
				2141	$code.=<<___ if ($i>=10); # from $i=10 on: load one more input block into @XMM[$i-10]; no length check is emitted in this loop
				2142	vld1.8 {@XMM[$i-10]}, [$inp]!
				2143	___
				2144	$code.=<<___ if ($i>=11); # from $i=11 on: XOR an already loaded block @XMM[$i-11] with @XMM[$i-3]
				2145	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
				2146	___
				2147	}
				2148	$code.=<<___;
				2149	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
				2150	vst1.64 {@XMM[15]}, [r0,:128]!
				2151	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				2152	veor @XMM[8], @XMM[8], @T[0]
				2153	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2154
				2155	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
				2156	veor @XMM[5], @XMM[5], @XMM[13]
				2157	#ifndef BSAES_ASM_EXTENDED_KEY
				2158	add r4, sp, #0x90 @ pass key schedule
				2159	#else
				2160	add r4, $key, #248 @ pass key schedule
				2161	#endif
				2162	veor @XMM[6], @XMM[6], @XMM[14]
				2163	mov r5, $rounds @ pass rounds
				2164	veor @XMM[7], @XMM[7], @XMM[15]
				2165	mov r0, sp
				2166
				2167	bl _bsaes_decrypt8
				2168
				2169	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2170	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2171	veor @XMM[0], @XMM[0], @XMM[ 8]
				2172	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				2173	veor @XMM[1], @XMM[1], @XMM[ 9]
				2174	veor @XMM[8], @XMM[6], @XMM[10]
				2175	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2176	veor @XMM[9], @XMM[4], @XMM[11]
				2177	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
				2178	veor @XMM[10], @XMM[2], @XMM[12]
				2179	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2180	veor @XMM[11], @XMM[7], @XMM[13]
				2181	veor @XMM[12], @XMM[3], @XMM[14]
				2182	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				2183	veor @XMM[13], @XMM[5], @XMM[15]
				2184	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
				2185
				2186	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2187
				2188	subs $len, #0x80
				2189	bpl .Lxts_dec_loop
				2190
				2191	.Lxts_dec_short:
				2192	adds $len, #0x70
				2193	bmi .Lxts_dec_done
				2194
				2195	vldmia $magic, {$twmask} @ load XTS magic
				2196	vshr.s64 @T[0], @XMM[8], #63
				2197	mov r0, sp
				2198	vand @T[0], @T[0], $twmask
				2199	___
				2200	for($i=9;$i<16;$i++) { # unrolled at generation time: one tweak step is emitted for each $i in 9..15
				2201	$code.=<<___; # emit: @XMM[$i] = @XMM[$i-1] added to itself per 64-bit lane, store @XMM[$i-1] through r0, XOR in the half-swapped mask @T[0], prepare the next mask in @T[1]
				2202	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
				2203	vst1.64 {@XMM[$i-1]}, [r0,:128]!
				2204	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
				2205	vshr.s64 @T[1], @XMM[$i], #63
				2206	veor @XMM[$i], @XMM[$i], @T[0]
				2207	vand @T[1], @T[1], $twmask
				2208	___
				2209	@T=reverse(@T); # reverse @T so the next step reads as @T[0] what this step wrote as @T[1] (assumes @T holds two registers; confirm at its declaration)
				2210
				2211	$code.=<<___ if ($i>=10); # from $i=10 on: load one more input block, take 0x10 off $len and branch to .Lxts_dec_N (N=$i-9) when the result is negative
				2212	vld1.8 {@XMM[$i-10]}, [$inp]!
				2213	subs $len, #0x10
				2214	bmi .Lxts_dec_`$i-9`
				2215	___
				2216	$code.=<<___ if ($i>=11); # from $i=11 on: XOR an already loaded block @XMM[$i-11] with @XMM[$i-3]; emitted after the branch, so it runs only when the branch is not taken
				2217	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
				2218	___
				2219	}
				2220	$code.=<<___;
				2221	sub $len, #0x10
				2222	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
				2223
				2224	vld1.8 {@XMM[6]}, [$inp]!
				2225	veor @XMM[5], @XMM[5], @XMM[13]
				2226	#ifndef BSAES_ASM_EXTENDED_KEY
				2227	add r4, sp, #0x90 @ pass key schedule
				2228	#else
				2229	add r4, $key, #248 @ pass key schedule
				2230	#endif
				2231	veor @XMM[6], @XMM[6], @XMM[14]
				2232	mov r5, $rounds @ pass rounds
				2233	mov r0, sp
				2234
				2235	bl _bsaes_decrypt8
				2236
				2237	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2238	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2239	veor @XMM[0], @XMM[0], @XMM[ 8]
				2240	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				2241	veor @XMM[1], @XMM[1], @XMM[ 9]
				2242	veor @XMM[8], @XMM[6], @XMM[10]
				2243	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2244	veor @XMM[9], @XMM[4], @XMM[11]
				2245	vld1.64 {@XMM[14]}, [r0,:128]!
				2246	veor @XMM[10], @XMM[2], @XMM[12]
				2247	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2248	veor @XMM[11], @XMM[7], @XMM[13]
				2249	veor @XMM[12], @XMM[3], @XMM[14]
				2250	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				2251	vst1.8 {@XMM[12]}, [$out]!
				2252
				2253	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2254	b .Lxts_dec_done
				2255	.align 4
				2256	.Lxts_dec_6:
				2257	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
				2258
				2259	veor @XMM[4], @XMM[4], @XMM[12]
				2260	#ifndef BSAES_ASM_EXTENDED_KEY
				2261	add r4, sp, #0x90 @ pass key schedule
				2262	#else
				2263	add r4, $key, #248 @ pass key schedule
				2264	#endif
				2265	veor @XMM[5], @XMM[5], @XMM[13]
				2266	mov r5, $rounds @ pass rounds
				2267	mov r0, sp
				2268
				2269	bl _bsaes_decrypt8
				2270
				2271	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2272	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2273	veor @XMM[0], @XMM[0], @XMM[ 8]
				2274	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
				2275	veor @XMM[1], @XMM[1], @XMM[ 9]
				2276	veor @XMM[8], @XMM[6], @XMM[10]
				2277	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2278	veor @XMM[9], @XMM[4], @XMM[11]
				2279	veor @XMM[10], @XMM[2], @XMM[12]
				2280	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2281	veor @XMM[11], @XMM[7], @XMM[13]
				2282	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
				2283
				2284	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2285	b .Lxts_dec_done
				2286	.align 4
				2287	.Lxts_dec_5:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2288	veor @XMM[3], @XMM[3], @XMM[11]
				2289	#ifndef BSAES_ASM_EXTENDED_KEY
				2290	add r4, sp, #0x90 @ pass key schedule
				2291	#else
				2292	add r4, $key, #248 @ pass key schedule
				2293	#endif
				2294	veor @XMM[4], @XMM[4], @XMM[12]
				2295	mov r5, $rounds @ pass rounds
				2296	mov r0, sp
				2297
				2298	bl _bsaes_decrypt8
				2299
				2300	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2301	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2302	veor @XMM[0], @XMM[0], @XMM[ 8]
				2303	vld1.64 {@XMM[12]}, [r0,:128]!
				2304	veor @XMM[1], @XMM[1], @XMM[ 9]
				2305	veor @XMM[8], @XMM[6], @XMM[10]
				2306	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2307	veor @XMM[9], @XMM[4], @XMM[11]
				2308	veor @XMM[10], @XMM[2], @XMM[12]
				2309	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2310	vst1.8 {@XMM[10]}, [$out]!
				2311
				2312	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2313	b .Lxts_dec_done
				2314	.align 4
				2315	.Lxts_dec_4:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2316	veor @XMM[2], @XMM[2], @XMM[10]
				2317	#ifndef BSAES_ASM_EXTENDED_KEY
				2318	add r4, sp, #0x90 @ pass key schedule
				2319	#else
				2320	add r4, $key, #248 @ pass key schedule
				2321	#endif
				2322	veor @XMM[3], @XMM[3], @XMM[11]
				2323	mov r5, $rounds @ pass rounds
				2324	mov r0, sp
				2325
				2326	bl _bsaes_decrypt8
				2327
				2328	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
				2329	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
				2330	veor @XMM[0], @XMM[0], @XMM[ 8]
				2331	veor @XMM[1], @XMM[1], @XMM[ 9]
				2332	veor @XMM[8], @XMM[6], @XMM[10]
				2333	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2334	veor @XMM[9], @XMM[4], @XMM[11]
				2335	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
				2336
				2337	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2338	b .Lxts_dec_done
				2339	.align 4
				2340	.Lxts_dec_3:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2341	veor @XMM[1], @XMM[1], @XMM[9]
				2342	#ifndef BSAES_ASM_EXTENDED_KEY
				2343	add r4, sp, #0x90 @ pass key schedule
				2344	#else
				2345	add r4, $key, #248 @ pass key schedule
				2346	#endif
				2347	veor @XMM[2], @XMM[2], @XMM[10]
				2348	mov r5, $rounds @ pass rounds
				2349	mov r0, sp
				2350
				2351	bl _bsaes_decrypt8
				2352
				2353	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				2354	vld1.64 {@XMM[10]}, [r0,:128]!
				2355	veor @XMM[0], @XMM[0], @XMM[ 8]
				2356	veor @XMM[1], @XMM[1], @XMM[ 9]
				2357	veor @XMM[8], @XMM[6], @XMM[10]
				2358	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2359	vst1.8 {@XMM[8]}, [$out]!
				2360
				2361	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2362	b .Lxts_dec_done
				2363	.align 4
				2364	.Lxts_dec_2:
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2365	veor @XMM[0], @XMM[0], @XMM[8]
				2366	#ifndef BSAES_ASM_EXTENDED_KEY
				2367	add r4, sp, #0x90 @ pass key schedule
				2368	#else
				2369	add r4, $key, #248 @ pass key schedule
				2370	#endif
				2371	veor @XMM[1], @XMM[1], @XMM[9]
				2372	mov r5, $rounds @ pass rounds
				2373	mov r0, sp
				2374
				2375	bl _bsaes_decrypt8
				2376
				2377	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
				2378	veor @XMM[0], @XMM[0], @XMM[ 8]
				2379	veor @XMM[1], @XMM[1], @XMM[ 9]
				2380	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
				2381
				2382	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
				2383	b .Lxts_dec_done
				2384	.align 4
				2385	.Lxts_dec_1:
				2386	mov r0, sp
David Benjamin	c895d6b	2016-08-11 13:26:41 -0400	[diff] [blame]	2387	veor @XMM[0], @XMM[0], @XMM[8]
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2388	mov r1, sp
				2389	vst1.8 {@XMM[0]}, [sp,:128]
David Benjamin	c895d6b	2016-08-11 13:26:41 -0400	[diff] [blame]	2390	mov r5, $magic @ preserve magic
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2391	mov r2, $key
				2392	mov r4, $fp @ preserve fp
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2393
				2394	bl AES_decrypt
				2395
				2396	vld1.8 {@XMM[0]}, [sp,:128]
				2397	veor @XMM[0], @XMM[0], @XMM[8]
				2398	vst1.8 {@XMM[0]}, [$out]!
				2399	mov $fp, r4
				2400	mov $magic, r5
				2401
				2402	vmov @XMM[8], @XMM[9] @ next round tweak
				2403
				2404	.Lxts_dec_done:
				2405	#ifndef XTS_CHAIN_TWEAK
				2406	adds $len, #0x10
				2407	beq .Lxts_dec_ret
				2408
				2409	@ calculate one round of extra tweak for the stolen ciphertext
				2410	vldmia $magic, {$twmask}
				2411	vshr.s64 @XMM[6], @XMM[8], #63
				2412	vand @XMM[6], @XMM[6], $twmask
				2413	vadd.u64 @XMM[9], @XMM[8], @XMM[8]
				2414	vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
				2415	veor @XMM[9], @XMM[9], @XMM[6]
				2416
				2417	@ perform the final decryption with the last tweak value
				2418	vld1.8 {@XMM[0]}, [$inp]!
				2419	mov r0, sp
				2420	veor @XMM[0], @XMM[0], @XMM[9]
				2421	mov r1, sp
				2422	vst1.8 {@XMM[0]}, [sp,:128]
				2423	mov r2, $key
				2424	mov r4, $fp @ preserve fp
				2425
				2426	bl AES_decrypt
				2427
				2428	vld1.8 {@XMM[0]}, [sp,:128]
				2429	veor @XMM[0], @XMM[0], @XMM[9]
				2430	vst1.8 {@XMM[0]}, [$out]
				2431
				2432	mov r6, $out
				2433	.Lxts_dec_steal:
				2434	ldrb r1, [$out]
				2435	ldrb r0, [$inp], #1
				2436	strb r1, [$out, #0x10]
				2437	strb r0, [$out], #1
				2438
				2439	subs $len, #1
				2440	bhi .Lxts_dec_steal
				2441
				2442	vld1.8 {@XMM[0]}, [r6]
				2443	mov r0, sp
				2444	veor @XMM[0], @XMM[8]
				2445	mov r1, sp
				2446	vst1.8 {@XMM[0]}, [sp,:128]
				2447	mov r2, $key
				2448
				2449	bl AES_decrypt
				2450
				2451	vld1.8 {@XMM[0]}, [sp,:128]
				2452	veor @XMM[0], @XMM[0], @XMM[8]
				2453	vst1.8 {@XMM[0]}, [r6]
				2454	mov $fp, r4
				2455	#endif
				2456
				2457	.Lxts_dec_ret:
				2458	bic r0, $fp, #0xf
				2459	vmov.i32 q0, #0
				2460	vmov.i32 q1, #0
				2461	#ifdef XTS_CHAIN_TWEAK
				2462	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
				2463	#endif
				2464	.Lxts_dec_bzero: @ wipe key schedule [if any]
				2465	vstmia sp!, {q0-q1}
				2466	cmp sp, r0
				2467	bne .Lxts_dec_bzero
				2468
				2469	mov sp, $fp
				2470	#ifdef XTS_CHAIN_TWEAK
				2471	vst1.8 {@XMM[8]}, [r1]
				2472	#endif
				2473	VFP_ABI_POP
				2474	ldmia sp!, {r4-r10, pc} @ return
				2475
				2476	.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
				2477	___
				2478	}
				2479	$code.=<<___;
				2480	#endif
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	2481	___
				2482
				2483	# Evaluate every backtick-quoted span of the generated assembly as a Perl
				2484	# expression and splice the result in its place.
				2485	$code =~ s/\`([^\`]*)\`/eval($1)/gem;
				2486
				2487	# Copy the leading comment block of this script to the output, rewriting
				2488	# the "#" comment marker as "@".  The "#!" line is skipped and copying
				2489	# stops at the first line that is neither a comment nor empty.
				2490	open(my $self, '<', $0) or die "can't open $0: $!";
				2491	while(<$self>) {
				2492	next if (/^#!/);
				2493	last if (!s/^#/@/ and !/^$/);
				2494	print;
				2495	}
				2496	close $self;
				2497
				2498	print $code;
				2499
				2500	# Output is buffered, so a failed write may only surface at close time.
				2501	# Report it rather than exiting successfully with truncated assembly.
				2502	close STDOUT or die "error closing STDOUT: $!";