// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
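// MemorySanitizer cannot see into hand-written assembly, so MSan builds fall
// back to the plain C implementations by forcing OPENSSL_NO_ASM above.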

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon

.align	4
_gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
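	// In outline: the two 64-bit halves of H are swapped, H is shifted
	// left by one bit across the full 128 bits, and, when the bit shifted
	// out of the top is set, the 0xc2...01 reduction constant is xor-ed
	// in, yielding the "twisted H" below. Only this one Htable entry is
	// stored; _gcm_gmult_neon and _gcm_ghash_neon reload just these
	// 16 bytes and derive the Karatsuba operand from them on the fly.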
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b		// H<<<=1
	eor	v5.16b, v3.16b, v16.16b		// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon

.align	4
_gcm_gmult_neon:
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

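	// x3 is set to a single 16-byte block so that the shared Lgmult_neon
	// body below runs exactly once: the subs/bne at the end of Loop_neon
	// falls through and the result is byteswapped and stored to Xi.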
	mov	x3, #16
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon

.align	4
_gcm_ghash_neon:
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
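	// Each 64x64-bit carry-less multiply below is built out of 8-bit
	// polynomial multiplies (pmull on .8b lanes): byte-rotated copies of
	// the operands (A1..A4, B1..B4) produce the partial products named
	// E..K in the comments, which are then masked and shifted into place.
	// This follows the ARMv4 NEON scheme this file notes it derives from.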
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
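	// Pairing layout: zip1/zip2 gather {t0.lo,t1.lo} into v20 and
	// {t0.hi,t1.hi} into v21 (likewise v22/v23 for t2/t3), while v24 and
	// v25 hold {k48,k32} and {k16,0} from Lmasks, so each eor/and/eor
	// below performs two of the 32-bit version's mask-and-fold steps.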
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b		// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b		// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b		// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
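	// The shift counts come from the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1 in this bit-reflected representation:
	// 57, 62 and 63 are 64-7, 64-2 and 64-1, and the second phase applies
	// the corresponding right shifts by 1, 2 (via 1+1) and 7 (via 1+6).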
	shl	v17.2d, v0.2d, #57	// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]	// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]	// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1	// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1	//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM