Blame - src/crypto/fipsmodule/modes/asm/ghash-armv4.pl - platform/external/boringssl

blob: 183fe60bb1791052976c406ce9dcd0f4543a15a1 [file] [log] [blame]

Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	1	#!/usr/bin/env perl
				2	#
				3	# ====================================================================
				4	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
				5	# project. The module is, however, dual licensed under OpenSSL and
				6	# CRYPTOGAMS licenses depending on where you obtain it. For further
				7	# details see http://www.openssl.org/~appro/cryptogams/.
				8	# ====================================================================
				9	#
				10	# April 2010
				11	#
				12	# The module implements "4-bit" GCM GHASH function and underlying
				13	# single multiplication operation in GF(2^128). "4-bit" means that it
				14	# uses 256 bytes per-key table [+32 bytes shared table]. There is no
				15	# experimental performance data available yet. The only approximation
				16	# that can be made at this point is based on code size. Inner loop is
				17	# 32 instructions long and on single-issue core should execute in <40
				18	# cycles. Having verified that gcc 3.4 didn't unroll corresponding
				19	# loop, this assembler loop body was found to be ~3x smaller than
				20	# compiler-generated one...
				21	#
				22	# July 2010
				23	#
				24	# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
				25	# Cortex A8 core and ~25 cycles per processed byte (which was observed
				26	# to be ~3 times faster than gcc-generated code:-)
				27	#
				28	# February 2011
				29	#
				30	# Profiler-assisted and platform-specific optimization resulted in 7%
				31	# improvement on Cortex A8 core and ~23.5 cycles per byte.
				32	#
				33	# March 2011
				34	#
				35	# Add NEON implementation featuring polynomial multiplication, i.e. no
				36	# lookup tables involved. On Cortex A8 it was measured to process one
				37	# byte in 15 cycles or 55% faster than integer-only code.
				38	#
				39	# April 2014
				40	#
				41	# Switch to multiplication algorithm suggested in paper referred
				42	# below and combine it with reduction algorithm from x86 module.
				43	# Performance improvement over previous version varies from 65% on
				44	# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	45	# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
				46	# Snapdragon S4 - in 9.33.
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	47	#
Kenny Root	b849459	2015-09-25 02:29:14 +0000	[diff] [blame]	48	# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	49	# Polynomial Multiplication on ARM Processors using the NEON Engine.
Robert Sloan	a94fe05	2017-02-21 08:49:28 -0800	[diff] [blame]	50	#
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	51	# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
				52
				53	# ====================================================================
				54	# Note about "528B" variant. In ARM case it makes lesser sense to
				55	# implement it for following reasons:
				56	#
				57	# - performance improvement won't be anywhere near 50%, because 128-
				58	# bit shift operation is neatly fused with 128-bit xor here, and
				59	# "528B" variant would eliminate only 4-5 instructions out of 32
				60	# in the inner loop (meaning that estimated improvement is ~15%);
				61	# - ARM-based systems are often embedded ones and extra memory
				62	# consumption might be unappreciated (for so little improvement);
				63	#
				64	# Byte order [in]dependence. =========================================
				65	#
				66	# Caller is expected to maintain specific dword order in Htable,
				67	# namely with least significant dword of 128-bit value at lower
				68	# address. This differs completely from C code and has everything to
				69	# do with ldm instruction and order in which dwords are "consumed" by
				70	# algorithm. Byte order within these dwords in turn is whatever
				71	# native byte order on current platform. See gcm128.c for working
				72	# example...
				73
# Command line: [flavour] output-file.
#
# The first argument is taken as the output file when it looks like a file
# name with an extension ("name.ext").  Otherwise it is the perlasm flavour,
# and the output file is the first later argument that looks like a file name.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for the arm-xlate.pl translator next to this script first, then
    # in the shared perlasm directory.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed to STDOUT is piped through the translator.  A pipe
    # that cannot be started used to go unnoticed; report it instead.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined $output && length $output) {
    # No translation requested: write straight to the output file and fail
    # loudly if it cannot be created.
    open STDOUT,">$output" or die "can't open $output: $!";
}
# With neither a flavour nor an output file, STDOUT is left untouched.
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	88
				89	$Xi="r0"; # argument block
				90	$Htbl="r1";
				91	$inp="r2";
				92	$len="r3";
				93
				94	$Zll="r4"; # variables
				95	$Zlh="r5";
				96	$Zhl="r6";
				97	$Zhh="r7";
				98	$Tll="r8";
				99	$Tlh="r9";
				100	$Thl="r10";
				101	$Thh="r11";
				102	$nlo="r12";
				103	################# r13 is stack pointer
				104	$nhi="r14";
				105	################# r15 is program counter
				106
				107	$rem_4bit=$inp; # used in gcm_gmult_4bit
				108	$cnt=$len;
				109
				110	sub Zsmash() {
				111	my $i=12;
				112	my @args=@_;
				113	for ($Zll,$Zlh,$Zhl,$Zhh) {
				114	$code.=<<___;
				115	#if __ARM_ARCH__>=7 && defined(__ARMEL__)
				116	rev $_,$_
				117	str $_,[$Xi,#$i]
				118	#elif defined(__ARMEB__)
				119	str $_,[$Xi,#$i]
				120	#else
				121	mov $Tlh,$_,lsr#8
				122	strb $_,[$Xi,#$i+3]
				123	mov $Thl,$_,lsr#16
				124	strb $Tlh,[$Xi,#$i+2]
				125	mov $Thh,$_,lsr#24
				126	strb $Thl,[$Xi,#$i+1]
				127	strb $Thh,[$Xi,#$i]
				128	#endif
				129	___
				130	$code.="\t".shift(@args)."\n";
				131	$i-=4;
				132	}
				133	}
				134
				# Begin the generated assembly source.  This first chunk holds the
				# preamble directives, the rem_4bit table with its PC-relative
				# accessor rem_4bit_get, and gcm_ghash_4bit up to the end of the
				# .Linner loop.  The register-name variables ($Xi, $Zll, ...) are
				# interpolated into the text.  Note the plain "=": $code is
				# (re)initialised here, later chunks append with ".=".
				135	$code=<<___;
Kenny Root	b849459	2015-09-25 02:29:14 +0000	[diff] [blame]	136	#include <openssl/arm_arch.h>
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	137
				138	.syntax unified
				139
				140	.text
				141	.code 32
				142
Adam Langley	4139edb	2016-01-13 15:00:54 -0800	[diff] [blame]	143	#ifdef __clang__
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	144	#define ldrplb ldrbpl
				145	#define ldrneb ldrbne
				146	#endif
				147
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	148	.type rem_4bit,%object
				149	.align 5
				150	rem_4bit:
				151	.short 0x0000,0x1C20,0x3840,0x2460
				152	.short 0x7080,0x6CA0,0x48C0,0x54E0
				153	.short 0xE100,0xFD20,0xD940,0xC560
				154	.short 0x9180,0x8DA0,0xA9C0,0xB5E0
				155	.size rem_4bit,.-rem_4bit
				156
				157	.type rem_4bit_get,%function
				158	rem_4bit_get:
				159	sub $rem_4bit,pc,#8
				160	sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
				161	b .Lrem_4bit_got
				162	nop
				163	.size rem_4bit_get,.-rem_4bit_get
				164
				165	.global gcm_ghash_4bit
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	166	.type gcm_ghash_4bit,%function
				167	gcm_ghash_4bit:
				168	sub r12,pc,#8
				169	add $len,$inp,$len @ $len to point at the end
				170	stmdb sp!,{r3-r11,lr} @ save $len/end too
				171	sub r12,r12,#48 @ &rem_4bit
				172
				173	ldmia r12,{r4-r11} @ copy rem_4bit ...
				174	stmdb sp!,{r4-r11} @ ... to stack
				175
				176	ldrb $nlo,[$inp,#15]
				177	ldrb $nhi,[$Xi,#15]
				178	.Louter:
				179	eor $nlo,$nlo,$nhi
				180	and $nhi,$nlo,#0xf0
				181	and $nlo,$nlo,#0x0f
				182	mov $cnt,#14
				183
				184	add $Zhh,$Htbl,$nlo,lsl#4
				185	ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
				186	add $Thh,$Htbl,$nhi
				187	ldrb $nlo,[$inp,#14]
				188
				189	and $nhi,$Zll,#0xf @ rem
				190	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				191	add $nhi,$nhi,$nhi
				192	eor $Zll,$Tll,$Zll,lsr#4
				193	ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
				194	eor $Zll,$Zll,$Zlh,lsl#28
				195	ldrb $nhi,[$Xi,#14]
				196	eor $Zlh,$Tlh,$Zlh,lsr#4
				197	eor $Zlh,$Zlh,$Zhl,lsl#28
				198	eor $Zhl,$Thl,$Zhl,lsr#4
				199	eor $Zhl,$Zhl,$Zhh,lsl#28
				200	eor $Zhh,$Thh,$Zhh,lsr#4
				201	eor $nlo,$nlo,$nhi
				202	and $nhi,$nlo,#0xf0
				203	and $nlo,$nlo,#0x0f
				204	eor $Zhh,$Zhh,$Tll,lsl#16
				205
				206	.Linner:
				207	add $Thh,$Htbl,$nlo,lsl#4
				208	and $nlo,$Zll,#0xf @ rem
				209	subs $cnt,$cnt,#1
				210	add $nlo,$nlo,$nlo
				211	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
				212	eor $Zll,$Tll,$Zll,lsr#4
				213	eor $Zll,$Zll,$Zlh,lsl#28
				214	eor $Zlh,$Tlh,$Zlh,lsr#4
				215	eor $Zlh,$Zlh,$Zhl,lsl#28
				216	ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
				217	eor $Zhl,$Thl,$Zhl,lsr#4
				218	ldrbpl $nlo,[$inp,$cnt]
				219	eor $Zhl,$Zhl,$Zhh,lsl#28
				220	eor $Zhh,$Thh,$Zhh,lsr#4
				221
				222	add $Thh,$Htbl,$nhi
				223	and $nhi,$Zll,#0xf @ rem
				224	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
				225	add $nhi,$nhi,$nhi
				226	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				227	eor $Zll,$Tll,$Zll,lsr#4
				228	ldrbpl $Tll,[$Xi,$cnt]
				229	eor $Zll,$Zll,$Zlh,lsl#28
				230	eor $Zlh,$Tlh,$Zlh,lsr#4
				231	ldrh $Tlh,[sp,$nhi]
				232	eor $Zlh,$Zlh,$Zhl,lsl#28
				233	eor $Zhl,$Thl,$Zhl,lsr#4
				234	eor $Zhl,$Zhl,$Zhh,lsl#28
				235	eorpl $nlo,$nlo,$Tll
				236	eor $Zhh,$Thh,$Zhh,lsr#4
				237	andpl $nhi,$nlo,#0xf0
				238	andpl $nlo,$nlo,#0x0f
				239	eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
				240	bpl .Linner
				241
				242	ldr $len,[sp,#32] @ re-load $len/end
				243	add $inp,$inp,#16
				244	mov $nhi,$Zll
				245	___
				# Emit the stores of the Z registers to Xi for this 16-byte block.
				# The two extra instructions (compare $inp with the end pointer held
				# in $len, then conditionally preload the next input byte) are
				# interleaved with the first two store sequences by Zsmash.
				246	&Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]");
				# Rest of gcm_ghash_4bit (loop-back branch and epilogue), followed by
				# gcm_gmult_4bit up to the end of its .Loop.
				247	$code.=<<___;
				248	bne .Louter
				249
				250	add sp,sp,#36
				251	#if __ARM_ARCH__>=5
				252	ldmia sp!,{r4-r11,pc}
				253	#else
				254	ldmia sp!,{r4-r11,lr}
				255	tst lr,#1
				256	moveq pc,lr @ be binary compatible with V4, yet
				257	bx lr @ interoperable with Thumb ISA:-)
				258	#endif
				259	.size gcm_ghash_4bit,.-gcm_ghash_4bit
				260
				261	.global gcm_gmult_4bit
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	262	.type gcm_gmult_4bit,%function
				263	gcm_gmult_4bit:
				264	stmdb sp!,{r4-r11,lr}
				265	ldrb $nlo,[$Xi,#15]
				266	b rem_4bit_get
				267	.Lrem_4bit_got:
				268	and $nhi,$nlo,#0xf0
				269	and $nlo,$nlo,#0x0f
				270	mov $cnt,#14
				271
				272	add $Zhh,$Htbl,$nlo,lsl#4
				273	ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
				274	ldrb $nlo,[$Xi,#14]
				275
				276	add $Thh,$Htbl,$nhi
				277	and $nhi,$Zll,#0xf @ rem
				278	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				279	add $nhi,$nhi,$nhi
				280	eor $Zll,$Tll,$Zll,lsr#4
				281	ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
				282	eor $Zll,$Zll,$Zlh,lsl#28
				283	eor $Zlh,$Tlh,$Zlh,lsr#4
				284	eor $Zlh,$Zlh,$Zhl,lsl#28
				285	eor $Zhl,$Thl,$Zhl,lsr#4
				286	eor $Zhl,$Zhl,$Zhh,lsl#28
				287	eor $Zhh,$Thh,$Zhh,lsr#4
				288	and $nhi,$nlo,#0xf0
				289	eor $Zhh,$Zhh,$Tll,lsl#16
				290	and $nlo,$nlo,#0x0f
				291
				292	.Loop:
				293	add $Thh,$Htbl,$nlo,lsl#4
				294	and $nlo,$Zll,#0xf @ rem
				295	subs $cnt,$cnt,#1
				296	add $nlo,$nlo,$nlo
				297	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
				298	eor $Zll,$Tll,$Zll,lsr#4
				299	eor $Zll,$Zll,$Zlh,lsl#28
				300	eor $Zlh,$Tlh,$Zlh,lsr#4
				301	eor $Zlh,$Zlh,$Zhl,lsl#28
				302	ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
				303	eor $Zhl,$Thl,$Zhl,lsr#4
				304	ldrbpl $nlo,[$Xi,$cnt]
				305	eor $Zhl,$Zhl,$Zhh,lsl#28
				306	eor $Zhh,$Thh,$Zhh,lsr#4
				307
				308	add $Thh,$Htbl,$nhi
				309	and $nhi,$Zll,#0xf @ rem
				310	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
				311	add $nhi,$nhi,$nhi
				312	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				313	eor $Zll,$Tll,$Zll,lsr#4
				314	eor $Zll,$Zll,$Zlh,lsl#28
				315	eor $Zlh,$Tlh,$Zlh,lsr#4
				316	ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
				317	eor $Zlh,$Zlh,$Zhl,lsl#28
				318	eor $Zhl,$Thl,$Zhl,lsr#4
				319	eor $Zhl,$Zhl,$Zhh,lsl#28
				320	eor $Zhh,$Thh,$Zhh,lsr#4
				321	andpl $nhi,$nlo,#0xf0
				322	andpl $nlo,$nlo,#0x0f
				323	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
				324	bpl .Loop
				325	___
				# Emit the stores of the Z registers to Xi.  No instructions are
				# passed for interleaving, so Zsmash leaves a tab-only line after
				# each store sequence.
				326	&Zsmash();
				# Epilogue of gcm_gmult_4bit: restore r4-r11 and return, with a
				# separate return sequence when __ARM_ARCH__ is below 5.
				327	$code.=<<___;
				328	#if __ARM_ARCH__>=5
				329	ldmia sp!,{r4-r11,pc}
				330	#else
				331	ldmia sp!,{r4-r11,lr}
				332	tst lr,#1
				333	moveq pc,lr @ be binary compatible with V4, yet
				334	bx lr @ interoperable with Thumb ISA:-)
				335	#endif
				336	.size gcm_gmult_4bit,.-gcm_gmult_4bit
				337	___
				338	{
				339	my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
				340	my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
				341	my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
				342
				343	sub clmul64x64 {
				344	my ($r,$a,$b)=@_;
				345	$code.=<<___;
				346	vext.8 $t0#lo, $a, $a, #1 @ A1
				347	vmull.p8 $t0, $t0#lo, $b @ F = A1*B
				348	vext.8 $r#lo, $b, $b, #1 @ B1
				349	vmull.p8 $r, $a, $r#lo @ E = A*B1
				350	vext.8 $t1#lo, $a, $a, #2 @ A2
				351	vmull.p8 $t1, $t1#lo, $b @ H = A2*B
				352	vext.8 $t3#lo, $b, $b, #2 @ B2
				353	vmull.p8 $t3, $a, $t3#lo @ G = A*B2
				354	vext.8 $t2#lo, $a, $a, #3 @ A3
				355	veor $t0, $t0, $r @ L = E + F
				356	vmull.p8 $t2, $t2#lo, $b @ J = A3*B
				357	vext.8 $r#lo, $b, $b, #3 @ B3
				358	veor $t1, $t1, $t3 @ M = G + H
				359	vmull.p8 $r, $a, $r#lo @ I = A*B3
				360	veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
				361	vand $t0#hi, $t0#hi, $k48
				362	vext.8 $t3#lo, $b, $b, #4 @ B4
				363	veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
				364	vand $t1#hi, $t1#hi, $k32
				365	vmull.p8 $t3, $a, $t3#lo @ K = A*B4
				366	veor $t2, $t2, $r @ N = I + J
				367	veor $t0#lo, $t0#lo, $t0#hi
				368	veor $t1#lo, $t1#lo, $t1#hi
				369	veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
				370	vand $t2#hi, $t2#hi, $k16
				371	vext.8 $t0, $t0, $t0, #15
				372	veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
				373	vmov.i64 $t3#hi, #0
				374	vext.8 $t1, $t1, $t1, #14
				375	veor $t2#lo, $t2#lo, $t2#hi
				376	vmull.p8 $r, $a, $b @ D = A*B
				377	vext.8 $t3, $t3, $t3, #12
				378	vext.8 $t2, $t2, $t2, #13
				379	veor $t0, $t0, $t1
				380	veor $t2, $t2, $t3
				381	veor $r, $r, $t0
				382	veor $r, $r, $t2
				383	___
				384	}
				385
				# NEON code path, wrapped in "#if __ARM_MAX_ARCH__>=7": gcm_init_neon,
				# gcm_gmult_neon and gcm_ghash_neon.  gcm_gmult_neon loads Xi, sets
				# $len to 16 and branches to .Lgmult_neon inside gcm_ghash_neon, so
				# the shared loop body runs once ("subs $len,#16" reaches zero).
				386	$code.=<<___;
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	387	#if __ARM_MAX_ARCH__>=7
				388	.arch armv7-a
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	389	.fpu neon
				390
				391	.global gcm_init_neon
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	392	.type gcm_init_neon,%function
				393	.align 4
				394	gcm_init_neon:
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	395	vld1.64 $IN#hi,[r1]! @ load H
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	396	vmov.i8 $t0,#0xe1
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	397	vld1.64 $IN#lo,[r1]
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	398	vshl.i64 $t0#hi,#57
				399	vshr.u64 $t0#lo,#63 @ t0=0xc2....01
				400	vdup.8 $t1,$IN#hi[7]
				401	vshr.u64 $Hlo,$IN#lo,#63
				402	vshr.s8 $t1,#7 @ broadcast carry bit
				403	vshl.i64 $IN,$IN,#1
				404	vand $t0,$t0,$t1
				405	vorr $IN#hi,$Hlo @ H<<<=1
				406	veor $IN,$IN,$t0 @ twisted H
				407	vstmia r0,{$IN}
				408
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	409	ret @ bx lr
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	410	.size gcm_init_neon,.-gcm_init_neon
				411
				412	.global gcm_gmult_neon
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	413	.type gcm_gmult_neon,%function
				414	.align 4
				415	gcm_gmult_neon:
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	416	vld1.64 $IN#hi,[$Xi]! @ load Xi
				417	vld1.64 $IN#lo,[$Xi]!
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	418	vmov.i64 $k48,#0x0000ffffffffffff
				419	vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
				420	vmov.i64 $k32,#0x00000000ffffffff
				421	#ifdef __ARMEL__
				422	vrev64.8 $IN,$IN
				423	#endif
				424	vmov.i64 $k16,#0x000000000000ffff
				425	veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
				426	mov $len,#16
				427	b .Lgmult_neon
				428	.size gcm_gmult_neon,.-gcm_gmult_neon
				429
				430	.global gcm_ghash_neon
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	431	.type gcm_ghash_neon,%function
				432	.align 4
				433	gcm_ghash_neon:
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	434	vld1.64 $Xl#hi,[$Xi]! @ load Xi
				435	vld1.64 $Xl#lo,[$Xi]!
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	436	vmov.i64 $k48,#0x0000ffffffffffff
				437	vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
				438	vmov.i64 $k32,#0x00000000ffffffff
				439	#ifdef __ARMEL__
				440	vrev64.8 $Xl,$Xl
				441	#endif
				442	vmov.i64 $k16,#0x000000000000ffff
				443	veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
				444
				445	.Loop_neon:
				446	vld1.64 $IN#hi,[$inp]! @ load inp
				447	vld1.64 $IN#lo,[$inp]!
				448	#ifdef __ARMEL__
				449	vrev64.8 $IN,$IN
				450	#endif
				451	veor $IN,$Xl @ inp^=Xi
				452	.Lgmult_neon:
				453	___
				# Three clmul64x64 expansions (low halves, sum of halves, high
				# halves) produce $Xl, $Xm and $Xh.  The operand passed as "$IN#lo"
				# is overwritten with lo^hi between the first and second expansion.
Kenny Root	b849459	2015-09-25 02:29:14 +0000	[diff] [blame]	454	&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	455	$code.=<<___;
				456	veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
				457	___
Kenny Root	b849459	2015-09-25 02:29:14 +0000	[diff] [blame]	458	&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
				459	&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
				# Combine the three products, reduce, loop while $len is non-zero,
				# then write the result back to Xi ($Xi was advanced by 16 by the
				# post-incrementing loads, hence the "sub $Xi,#16").
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	460	$code.=<<___;
				461	veor $Xm,$Xm,$Xl @ Karatsuba post-processing
				462	veor $Xm,$Xm,$Xh
				463	veor $Xl#hi,$Xl#hi,$Xm#lo
				464	veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh\|Xl - 256-bit result
				465
				466	@ equivalent of reduction_avx from ghash-x86_64.pl
				467	vshl.i64 $t1,$Xl,#57 @ 1st phase
				468	vshl.i64 $t2,$Xl,#62
				469	veor $t2,$t2,$t1 @
				470	vshl.i64 $t1,$Xl,#63
				471	veor $t2, $t2, $t1 @
				472	veor $Xl#hi,$Xl#hi,$t2#lo @
				473	veor $Xh#lo,$Xh#lo,$t2#hi
				474
				475	vshr.u64 $t2,$Xl,#1 @ 2nd phase
				476	veor $Xh,$Xh,$Xl
				477	veor $Xl,$Xl,$t2 @
				478	vshr.u64 $t2,$t2,#6
				479	vshr.u64 $Xl,$Xl,#1 @
				480	veor $Xl,$Xl,$Xh @
				481	veor $Xl,$Xl,$t2 @
				482
				483	subs $len,#16
				484	bne .Loop_neon
				485
				486	#ifdef __ARMEL__
				487	vrev64.8 $Xl,$Xl
				488	#endif
Robert Sloan	a94fe05	2017-02-21 08:49:28 -0800	[diff] [blame]	489	sub $Xi,#16
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	490	vst1.64 $Xl#hi,[$Xi]! @ write out Xi
				491	vst1.64 $Xl#lo,[$Xi]
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	492
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	493	ret @ bx lr
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	494	.size gcm_ghash_neon,.-gcm_ghash_neon
				495	#endif
				496	___
				497	}
				498	$code.=<<___;
				499	.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
				500	.align 2
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	501	___
				502
				503	foreach (split("\n",$code)) {
				504	s/\`([^\`]*)\`/eval $1/geo;
				505
				506	s/\bq([0-9]+)#(lo\|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
Adam Langley	e9ada86	2015-05-11 17:20:37 -0700	[diff] [blame]	507	s/\bret\b/bx lr/go or
Adam Langley	d9e397b	2015-01-22 14:27:53 -0800	[diff] [blame]	508	s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
				509
				510	print $_,"\n";
				511	}
				512	close STDOUT; # enforce flush