Blame - arch/arm64/crypto/speck-neon-core.S - kernel/msm-4.9

blob: b14463438b0966b6bc37f2f7784b0285c51ce290 [file] [log] [blame]

Eric Biggers	f152ce1	2018-03-05 11:17:07 -0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
				4	*
				5	* Copyright (c) 2018 Google, Inc
				6	*
				7	* Author: Eric Biggers <ebiggers@google.com>
				8	*/
				9
				10	#include <linux/linkage.h>
				11
				12	.text
				13
				14	// arguments, held in the registers aliased below (x0-x5)
				15	ROUND_KEYS .req x0 // const {u64,u32} *round_keys
				16	NROUNDS .req w1 // int nrounds
				17	NROUNDS_X .req x1 // 64-bit view of NROUNDS, used in address arithmetic
				18	DST .req x2 // void *dst
				19	SRC .req x3 // const void *src
				20	NBYTES .req w4 // unsigned int nbytes
				21	TWEAK .req x5 // void *tweak
				22
				23	// registers which hold the data being encrypted/decrypted
				24	// (underscores avoid a naming collision with ARM64 registers x0-x3)
				25	X_0 .req v0
				26	Y_0 .req v1
				27	X_1 .req v2
				28	Y_1 .req v3
				29	X_2 .req v4
				30	Y_2 .req v5
				31	X_3 .req v6
				32	Y_3 .req v7
				33
					// NOTE(review): v8-v15 (used below) are callee-saved in their low 64 bits
					// under AAPCS64, and nothing visible in this file saves or restores them.
					// Presumably callers bracket these routines with kernel_neon_begin() and
					// kernel_neon_end() -- confirm against the C glue code.
				34	// the round key, duplicated in all lanes
				35	ROUND_KEY .req v8
				36
				37	// index vector for tbl-based 8-bit rotates
				38	ROTATE_TABLE .req v9
				39	ROTATE_TABLE_Q .req q9 // 128-bit view, used to load the table with ldr
				40
				41	// temporary registers (clobbered by the round macros and tweak calculation)
				42	TMP0 .req v10
				43	TMP1 .req v11
				44	TMP2 .req v12
				45	TMP3 .req v13
				46
				47	// multiplication table for updating XTS tweaks
				48	GFMUL_TABLE .req v14
				49	GFMUL_TABLE_Q .req q14 // 128-bit view, used to load the table with ldr
				50
				51	// next XTS tweak value(s): the tweak(s) that start the following 128-byte chunk
				52	TWEAKV_NEXT .req v15
				53
				54	// XTS tweaks for the blocks currently being encrypted/decrypted
					// (one vector per 16 bytes of the current 128-byte chunk)
				55	TWEAKV0 .req v16
				56	TWEAKV1 .req v17
				57	TWEAKV2 .req v18
				58	TWEAKV3 .req v19
				59	TWEAKV4 .req v20
				60	TWEAKV5 .req v21
				61	TWEAKV6 .req v22
				62	TWEAKV7 .req v23
				63
					// 128-bit constants, loaded with PC-relative ldr into ROTATE_TABLE_Q and
					// GFMUL_TABLE_Q. Each .octa is stored little-endian, so the rightmost
					// byte of the literal is byte 0 of the vector.
				64	.align 4 // 2^4 = 16-byte alignment for the 128-bit constants
				65	.Lror64_8_table: // tbl indices: rotate each 64-bit lane right by 8 bits
				66	.octa 0x080f0e0d0c0b0a090007060504030201
				67	.Lror32_8_table: // tbl indices: rotate each 32-bit lane right by 8 bits
				68	.octa 0x0c0f0e0d080b0a090407060500030201
				69	.Lrol64_8_table: // tbl indices: rotate each 64-bit lane left by 8 bits
				70	.octa 0x0e0d0c0b0a09080f0605040302010007
				71	.Lrol32_8_table: // tbl indices: rotate each 32-bit lane left by 8 bits
				72	.octa 0x0e0d0c0f0a09080b0605040702010003
				73	.Lgf128mul_table: // low half 1 (carry), high half 0x87 (reduction), ANDed with a top-bit mask
				74	.octa 0x00000000000000870000000000000001
				75	.Lgf64mul_table: // bytes 0-3 = 0x00, 0x1b, 0x36, 0x2d: reduction value indexed by the bits shifted out
				76	.octa 0x0000000000000000000000002d361b00
				77
				78	/*
				79	* _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
				80	*
				81	* Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
				82	* Speck64) stored in X_0-X_3 and Y_0-Y_3, using the round key stored in all
					* lanes
				83	* of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
				84	* 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
					*
					* ROTATE_TABLE must already hold the rotate-right-by-8 tbl index vector for
					* lane size 'n'. Clobbers TMP0-TMP3. The four register pairs are processed
					* as independent streams, one instruction per stream at each step.
				85	*/
				86	.macro _speck_round_128bytes n, lanes
				87
				88	// x = ror(x, 8)
				89	tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
				90	tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
				91	tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
				92	tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
				93
				94	// x += y
				95	add X_0.\lanes, X_0.\lanes, Y_0.\lanes
				96	add X_1.\lanes, X_1.\lanes, Y_1.\lanes
				97	add X_2.\lanes, X_2.\lanes, Y_2.\lanes
				98	add X_3.\lanes, X_3.\lanes, Y_3.\lanes
				99
				100	// x ^= k
				101	eor X_0.16b, X_0.16b, ROUND_KEY.16b
				102	eor X_1.16b, X_1.16b, ROUND_KEY.16b
				103	eor X_2.16b, X_2.16b, ROUND_KEY.16b
				104	eor X_3.16b, X_3.16b, ROUND_KEY.16b
				105
				106	// y = rol(y, 3), built in TMP0-TMP3 so that Y_0-Y_3 stay intact as the source
				107	shl TMP0.\lanes, Y_0.\lanes, #3 // TMP = y << 3
				108	shl TMP1.\lanes, Y_1.\lanes, #3
				109	shl TMP2.\lanes, Y_2.\lanes, #3
				110	shl TMP3.\lanes, Y_3.\lanes, #3
				111	sri TMP0.\lanes, Y_0.\lanes, #(\n - 3) // insert y >> (n - 3) into the low 3 bits
				112	sri TMP1.\lanes, Y_1.\lanes, #(\n - 3)
				113	sri TMP2.\lanes, Y_2.\lanes, #(\n - 3)
				114	sri TMP3.\lanes, Y_3.\lanes, #(\n - 3)
				115
				116	// y ^= x (the rotated y is taken from TMP0-TMP3)
				117	eor Y_0.16b, TMP0.16b, X_0.16b
				118	eor Y_1.16b, TMP1.16b, X_1.16b
				119	eor Y_2.16b, TMP2.16b, X_2.16b
				120	eor Y_3.16b, TMP3.16b, X_3.16b
				121	.endm
				122
				123	/*
				124	* _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
				125	*
				126	* This is the inverse of _speck_round_128bytes().
					*
					* The steps of the encryption round are undone in reverse order. 'n' and
					* 'lanes' have the same meaning as in _speck_round_128bytes().
					* ROTATE_TABLE must already hold the rotate-left-by-8 tbl index vector for
					* lane size 'n'. Clobbers TMP0-TMP3.
				127	*/
				128	.macro _speck_unround_128bytes n, lanes
				129
				130	// y ^= x (result goes to TMP0-TMP3; it is rotated into Y_0-Y_3 below)
				131	eor TMP0.16b, Y_0.16b, X_0.16b
				132	eor TMP1.16b, Y_1.16b, X_1.16b
				133	eor TMP2.16b, Y_2.16b, X_2.16b
				134	eor TMP3.16b, Y_3.16b, X_3.16b
				135
				136	// y = ror(y, 3)
				137	ushr Y_0.\lanes, TMP0.\lanes, #3 // Y = tmp >> 3
				138	ushr Y_1.\lanes, TMP1.\lanes, #3
				139	ushr Y_2.\lanes, TMP2.\lanes, #3
				140	ushr Y_3.\lanes, TMP3.\lanes, #3
				141	sli Y_0.\lanes, TMP0.\lanes, #(\n - 3) // insert tmp << (n - 3) into the top 3 bits
				142	sli Y_1.\lanes, TMP1.\lanes, #(\n - 3)
				143	sli Y_2.\lanes, TMP2.\lanes, #(\n - 3)
				144	sli Y_3.\lanes, TMP3.\lanes, #(\n - 3)
				145
				146	// x ^= k
				147	eor X_0.16b, X_0.16b, ROUND_KEY.16b
				148	eor X_1.16b, X_1.16b, ROUND_KEY.16b
				149	eor X_2.16b, X_2.16b, ROUND_KEY.16b
				150	eor X_3.16b, X_3.16b, ROUND_KEY.16b
				151
				152	// x -= y (uses the already-restored y)
				153	sub X_0.\lanes, X_0.\lanes, Y_0.\lanes
				154	sub X_1.\lanes, X_1.\lanes, Y_1.\lanes
				155	sub X_2.\lanes, X_2.\lanes, Y_2.\lanes
				156	sub X_3.\lanes, X_3.\lanes, Y_3.\lanes
				157
				158	// x = rol(x, 8)
				159	tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
				160	tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
				161	tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
				162	tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
				163	.endm
				164
					/*
					* _next_xts_tweak - compute the XTS tweak(s) that follow those in 'cur'
					*
					* 'next' is the vector register that receives the new tweak(s).
					* 'cur' is the vector register holding the current tweak(s); it is only read.
					* 'tmp' is a scratch vector register and is clobbered.
					* 'n' is the lane size: 64 selects the Speck128 path (one 128-bit tweak per
					* register); any other value selects the Speck64 path (two consecutive
					* 64-bit tweaks packed per register).
					*
					* GFMUL_TABLE must already hold the multiplication table matching 'n'.
					*/
				165	.macro _next_xts_tweak next, cur, tmp, n
				166	.if \n == 64
				167	/*
				168	* Calculate the next tweak by multiplying the current one by x,
				169	* modulo p(x) = x^128 + x^7 + x^2 + x + 1.
				170	*/
				171	sshr \tmp\().2d, \cur\().2d, #63 // per 64-bit half: all-ones if its top bit is set, else 0
				172	and \tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b // low half: 1 or 0 (carry); high half: 0x87 or 0 (reduction)
				173	shl \next\().2d, \cur\().2d, #1 // shift each half left by one bit
				174	ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 // swap halves: carry to high half, reduction to low half
				175	eor \next\().16b, \next\().16b, \tmp\().16b
				176	.else
				177	/*
				178	* Calculate the next two tweaks by multiplying the current ones by x^2,
				179	* modulo p(x) = x^64 + x^4 + x^3 + x + 1.
				180	*/
				181	ushr \tmp\().2d, \cur\().2d, #62 // top two bits of each 64-bit tweak, as a table index 0-3
				182	shl \next\().2d, \cur\().2d, #2 // shift each tweak left by two bits
				183	tbl \tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b // reduction value for the two bits shifted out
				184	eor \next\().16b, \next\().16b, \tmp\().16b
				185	.endif
				186	.endm
				187
				188	/*
				189	* _speck_xts_crypt() - Speck-XTS encryption/decryption
				190	*
				191	* Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
				192	* using Speck-XTS, specifically the variant with a block size of '2n' and round
				193	* count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
				194	* the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
				195	* nonzero multiple of 128.
					*
					* 'n' is the lane size (64 or 32), 'lanes' is the matching lane specifier
					* ("2d" or "4s"), and 'decrypting' is nonzero to select decryption.
					*
					* Register effects visible in this macro: SRC and DST are post-incremented
					* past the data processed, NBYTES is counted down to zero, x6 and w7 are
					* used as the round-key cursor and round counter, and ROUND_KEYS is
					* repointed at the last round key when decrypting. The tweak for the data
					* that follows is written back through TWEAK. The expansion ends in 'ret'.
					*
					* The main loop subtracts 128 from NBYTES and repeats while the result is
					* nonzero, which is why NBYTES must be a nonzero multiple of 128.
				196	*/
				197	.macro _speck_xts_crypt n, lanes, decrypting
				198
				199	/*
				200	* If decrypting, modify the ROUND_KEYS parameter to point to the last
				201	* round key rather than the first, since for decryption the round keys
				202	* are used in reverse order.
				203	*/
				204	.if \decrypting
				205	mov NROUNDS, NROUNDS /* zero the high 32 bits of NROUNDS_X */
				206	.if \n == 64
				207	add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3 // 8-byte round keys
				208	sub ROUND_KEYS, ROUND_KEYS, #8 // step back to the last key
				209	.else
				210	add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2 // 4-byte round keys
				211	sub ROUND_KEYS, ROUND_KEYS, #4 // step back to the last key
				212	.endif
				213	.endif
				214
				215	// Load the index vector for tbl-based 8-bit rotates
					// (rotate-left for the inverse round, rotate-right for the forward round)
				216	.if \decrypting
				217	ldr ROTATE_TABLE_Q, .Lrol\n\()_8_table
				218	.else
				219	ldr ROTATE_TABLE_Q, .Lror\n\()_8_table
				220	.endif
				221
				222	// One-time XTS preparation
				223	.if \n == 64
				224	// Load first tweak (16 bytes)
				225	ld1 {TWEAKV0.16b}, [TWEAK]
				226
				227	// Load GF(2^128) multiplication table
				228	ldr GFMUL_TABLE_Q, .Lgf128mul_table
				229	.else
				230	// Load first tweak (8 bytes, into the low half of TWEAKV0)
				231	ld1 {TWEAKV0.8b}, [TWEAK]
				232
				233	// Load GF(2^64) multiplication table
				234	ldr GFMUL_TABLE_Q, .Lgf64mul_table
				235
				236	// Calculate second tweak, packing it together with the first
				237	ushr TMP0.2d, TWEAKV0.2d, #63 // top bit of the tweak, as a table index (0 or 1)
				238	shl TMP1.2d, TWEAKV0.2d, #1 // tweak << 1
				239	tbl TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b // reduction value for the bit shifted out
				240	eor TMP0.8b, TMP0.8b, TMP1.8b // second tweak = first tweak multiplied by x
				241	mov TWEAKV0.d[1], TMP0.d[0] // low half = first tweak, high half = second tweak
				242	.endif
				243
					// Main loop: one iteration per 128 bytes. On entry TWEAKV0 holds the
					// tweak(s) for the first 16 bytes of the chunk. The macro-counter suffix
					// keeps the label unique in each expansion.
				244	.Lnext_128bytes_\@:
				245
				246	// Calculate XTS tweaks for next 128 bytes
					// (each is derived from the previous one; TWEAKV_NEXT belongs to the
					// following chunk)
				247	_next_xts_tweak TWEAKV1, TWEAKV0, TMP0, \n
				248	_next_xts_tweak TWEAKV2, TWEAKV1, TMP0, \n
				249	_next_xts_tweak TWEAKV3, TWEAKV2, TMP0, \n
				250	_next_xts_tweak TWEAKV4, TWEAKV3, TMP0, \n
				251	_next_xts_tweak TWEAKV5, TWEAKV4, TMP0, \n
				252	_next_xts_tweak TWEAKV6, TWEAKV5, TMP0, \n
				253	_next_xts_tweak TWEAKV7, TWEAKV6, TMP0, \n
				254	_next_xts_tweak TWEAKV_NEXT, TWEAKV7, TMP0, \n
				255
				256	// Load the next source blocks into {X,Y}[0-3]
					// (v0-v3 then v4-v7; SRC is post-incremented by 64 each time)
				257	ld1 {X_0.16b-Y_1.16b}, [SRC], #64
				258	ld1 {X_2.16b-Y_3.16b}, [SRC], #64
				259
				260	// XOR the source blocks with their XTS tweaks
					// (the X_* results go to TMP0-TMP3 so the uzp pairs below can read both
					// inputs before either output register is overwritten)
				261	eor TMP0.16b, X_0.16b, TWEAKV0.16b
				262	eor Y_0.16b, Y_0.16b, TWEAKV1.16b
				263	eor TMP1.16b, X_1.16b, TWEAKV2.16b
				264	eor Y_1.16b, Y_1.16b, TWEAKV3.16b
				265	eor TMP2.16b, X_2.16b, TWEAKV4.16b
				266	eor Y_2.16b, Y_2.16b, TWEAKV5.16b
				267	eor TMP3.16b, X_3.16b, TWEAKV6.16b
				268	eor Y_3.16b, Y_3.16b, TWEAKV7.16b
				269
				270	/*
				271	* De-interleave the 'x' and 'y' elements of each block, i.e. make it so
				272	* that the X[0-3] registers contain only the second halves of blocks,
				273	* and the Y[0-3] registers contain only the first halves of blocks.
				274	* (Speck uses the order (y, x) rather than the more intuitive (x, y).)
				275	*/
				276	uzp2 X_0.\lanes, TMP0.\lanes, Y_0.\lanes // odd-numbered lanes: second halves
				277	uzp1 Y_0.\lanes, TMP0.\lanes, Y_0.\lanes // even-numbered lanes: first halves
				278	uzp2 X_1.\lanes, TMP1.\lanes, Y_1.\lanes
				279	uzp1 Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
				280	uzp2 X_2.\lanes, TMP2.\lanes, Y_2.\lanes
				281	uzp1 Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
				282	uzp2 X_3.\lanes, TMP3.\lanes, Y_3.\lanes
				283	uzp1 Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
				284
				285	// Do the cipher rounds
				286	mov x6, ROUND_KEYS // x6 = pointer to the round key for the next round
				287	mov w7, NROUNDS // w7 = rounds remaining
				288	.Lnext_round_\@:
				289	.if \decrypting
				290	ld1r {ROUND_KEY.\lanes}, [x6] // load one key and replicate it to all lanes
				291	sub x6, x6, #( \n / 8 ) // walk the key schedule backwards
				292	_speck_unround_128bytes \n, \lanes
				293	.else
				294	ld1r {ROUND_KEY.\lanes}, [x6], #( \n / 8 ) // load, replicate, advance to the next key
				295	_speck_round_128bytes \n, \lanes
				296	.endif
				297	subs w7, w7, #1
				298	bne .Lnext_round_\@
				299
				300	// Re-interleave the 'x' and 'y' elements of each block
					// (the zip1 results go to TMP0-TMP3 because Y_* is still needed by zip2)
				301	zip1 TMP0.\lanes, Y_0.\lanes, X_0.\lanes
				302	zip2 Y_0.\lanes, Y_0.\lanes, X_0.\lanes
				303	zip1 TMP1.\lanes, Y_1.\lanes, X_1.\lanes
				304	zip2 Y_1.\lanes, Y_1.\lanes, X_1.\lanes
				305	zip1 TMP2.\lanes, Y_2.\lanes, X_2.\lanes
				306	zip2 Y_2.\lanes, Y_2.\lanes, X_2.\lanes
				307	zip1 TMP3.\lanes, Y_3.\lanes, X_3.\lanes
				308	zip2 Y_3.\lanes, Y_3.\lanes, X_3.\lanes
				309
				310	// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
				311	eor X_0.16b, TMP0.16b, TWEAKV0.16b
				312	eor Y_0.16b, Y_0.16b, TWEAKV1.16b
				313	eor X_1.16b, TMP1.16b, TWEAKV2.16b
				314	eor Y_1.16b, Y_1.16b, TWEAKV3.16b
				315	eor X_2.16b, TMP2.16b, TWEAKV4.16b
				316	eor Y_2.16b, Y_2.16b, TWEAKV5.16b
				317	eor X_3.16b, TMP3.16b, TWEAKV6.16b
				318	eor Y_3.16b, Y_3.16b, TWEAKV7.16b
				319	mov TWEAKV0.16b, TWEAKV_NEXT.16b // carry the next tweak(s) into the following iteration
				320
				321	// Store the result (ciphertext, or plaintext when decrypting) in the destination buffer
				322	st1 {X_0.16b-Y_1.16b}, [DST], #64
				323	st1 {X_2.16b-Y_3.16b}, [DST], #64
				324
				325	// Continue if there are more 128-byte chunks remaining
				326	subs NBYTES, NBYTES, #128
				327	bne .Lnext_128bytes_\@
				328
				329	// Store the next tweak and return
					// (for the 64-bit tweak case only the low 8 bytes, i.e. the first of the
					// two packed tweaks, are written back)
				330	.if \n == 64
				331	st1 {TWEAKV_NEXT.16b}, [TWEAK]
				332	.else
				333	st1 {TWEAKV_NEXT.8b}, [TWEAK]
				334	.endif
				335	ret
				336	.endm
				337
					/*
					* speck128_xts_encrypt_neon() - Speck128-XTS encryption entry point
					*
					* Expands _speck_xts_crypt with 64-bit lanes in encryption mode. Arguments
					* are taken from the registers aliased as ROUND_KEYS, NROUNDS, DST, SRC,
					* NBYTES and TWEAK; the return instruction comes from the macro.
					*/
				338	ENTRY(speck128_xts_encrypt_neon)
				339	_speck_xts_crypt n=64, lanes=2d, decrypting=0
				340	ENDPROC(speck128_xts_encrypt_neon)
				341
					/*
					* speck128_xts_decrypt_neon() - Speck128-XTS decryption entry point
					*
					* Expands _speck_xts_crypt with 64-bit lanes in decryption mode (round keys
					* are consumed last-to-first). Arguments are taken from the registers
					* aliased as ROUND_KEYS, NROUNDS, DST, SRC, NBYTES and TWEAK; the return
					* instruction comes from the macro.
					*/
				342	ENTRY(speck128_xts_decrypt_neon)
				343	_speck_xts_crypt n=64, lanes=2d, decrypting=1
				344	ENDPROC(speck128_xts_decrypt_neon)
				345
					/*
					* speck64_xts_encrypt_neon() - Speck64-XTS encryption entry point
					*
					* Expands _speck_xts_crypt with 32-bit lanes in encryption mode. Arguments
					* are taken from the registers aliased as ROUND_KEYS, NROUNDS, DST, SRC,
					* NBYTES and TWEAK; the tweak read and written through TWEAK is 8 bytes.
					* The return instruction comes from the macro.
					*/
				346	ENTRY(speck64_xts_encrypt_neon)
				347	_speck_xts_crypt n=32, lanes=4s, decrypting=0
				348	ENDPROC(speck64_xts_encrypt_neon)
				349
					/*
					* speck64_xts_decrypt_neon() - Speck64-XTS decryption entry point
					*
					* Expands _speck_xts_crypt with 32-bit lanes in decryption mode (round keys
					* are consumed last-to-first). Arguments are taken from the registers
					* aliased as ROUND_KEYS, NROUNDS, DST, SRC, NBYTES and TWEAK; the tweak
					* read and written through TWEAK is 8 bytes. The return instruction comes
					* from the macro.
					*/
				350	ENTRY(speck64_xts_decrypt_neon)
				351	_speck_xts_crypt n=32, lanes=4s, decrypting=1
				352	ENDPROC(speck64_xts_decrypt_neon)