Blame - arch/arm/crypto/aes-neonbs-core.S - kernel/msm-4.19

blob: c9477044fbbaca7ae42ca70669a91d63aad41993 [file] [log] [blame]

Ard Biesheuvel	cc477bf	2017-01-11 16:41:54 +0000	[diff] [blame^]	1	/*
				2	* Bit sliced AES using NEON instructions
				3	*
				4	* Copyright (C) 2017 Linaro Ltd.
				5	* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
				6	*
				7	* This program is free software; you can redistribute it and/or modify
				8	* it under the terms of the GNU General Public License version 2 as
				9	* published by the Free Software Foundation.
				10	*/
				11
				12	/*
				13	* The algorithm implemented here is described in detail by the paper
				14	* 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
				15	* Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
				16	*
				17	* This implementation is based primarily on the OpenSSL implementation
				18	* for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
				19	*/
				20
				21	#include <linux/linkage.h>
				22	#include <asm/assembler.h>
				23
				/* Assembler setup: code goes to .text and NEON mnemonics are enabled. */
				24	.text
				25	.fpu neon
				26
				/* Core register aliases: 'rounds' names ip and 'bskey' names r4. */
				27	rounds .req ip
				28	bskey .req r4
				29
				/*
				 * qNl / qNh name the two d registers d(2N) / d(2N+1) listed against qN
				 * below. Macros in this file use them to build a half-register operand
				 * from a q register argument as \reg\()l or \reg\()h.
				 */
				30	q0l .req d0
				31	q0h .req d1
				32	q1l .req d2
				33	q1h .req d3
				34	q2l .req d4
				35	q2h .req d5
				36	q3l .req d6
				37	q3h .req d7
				38	q4l .req d8
				39	q4h .req d9
				40	q5l .req d10
				41	q5h .req d11
				42	q6l .req d12
				43	q6h .req d13
				44	q7l .req d14
				45	q7h .req d15
				46	q8l .req d16
				47	q8h .req d17
				48	q9l .req d18
				49	q9h .req d19
				50	q10l .req d20
				51	q10h .req d21
				52	q11l .req d22
				53	q11h .req d23
				54	q12l .req d24
				55	q12h .req d25
				56	q13l .req d26
				57	q13h .req d27
				58	q14l .req d28
				59	q14h .req d29
				60	q15l .req d30
				61	q15h .req d31
				62
				/*
				 * __tbl - byte-permute q register \tbl by the indices in \in, into \out
				 *
				 * Emitted as two vtbl.8 lookups, one per 64-bit half of \out, each using
				 * the whole of \tbl as the table and the matching half of \in as indices.
				 *
				 * If \out and \tbl are the same register, \tmp is mandatory: \tbl is
				 * copied to \tmp first and the second lookup reads the copy, because the
				 * first lookup has already overwritten the low half of \out.
				 */
				63	.macro __tbl, out, tbl, in, tmp
				64	.ifc \out, \tbl
				65	.ifb \tmp
				66	.error __tbl needs temp register if out == tbl
				67	.endif
				68	vmov \tmp, \out
				69	.endif
				70	vtbl.8 \out\()l, {\tbl}, \in\()l
				71	.ifc \out, \tbl
				72	vtbl.8 \out\()h, {\tmp}, \in\()h
				73	.else
				74	vtbl.8 \out\()h, {\tbl}, \in\()h
				75	.endif
				76	.endm
				77
				/*
				 * __ldr - load the 16 bytes at \sym into q register \out, as two 64-bit
				 * vldr loads (low half from \sym, high half from \sym + 8).
				 */
				78	.macro __ldr, out, sym
				79	vldr \out\()l, \sym
				80	vldr \out\()h, \sym + 8
				81	.endm
				82
				/*
				 * __adr - put the address of local label \lbl in \reg.
				 *
				 * The THUMB() line additionally sets bit 0 of the address. THUMB() is
				 * defined outside this file; presumably it emits its argument only in
				 * Thumb-2 builds, where bit 0 keeps a later 'bx \reg' in Thumb state.
				 */
				83	.macro __adr, reg, lbl
				84	adr \reg, \lbl
				85	THUMB( orr \reg, \reg, #1 )
				86	.endm
				87
				/*
				 * in_bs_ch - first stage of the sbox macro.
				 *
				 * An in-place, XOR-only transformation of the eight registers b0-b7.
				 * No scratch registers are used. The statement order matters: later XORs
				 * consume values updated by earlier ones.
				 */
				88	.macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
				89	veor \b2, \b2, \b1
				90	veor \b5, \b5, \b6
				91	veor \b3, \b3, \b0
				92	veor \b6, \b6, \b2
				93	veor \b5, \b5, \b0
				94	veor \b6, \b6, \b3
				95	veor \b3, \b3, \b7
				96	veor \b7, \b7, \b5
				97	veor \b3, \b3, \b4
				98	veor \b4, \b4, \b5
				99	veor \b2, \b2, \b7
				100	veor \b3, \b3, \b1
				101	veor \b1, \b1, \b5
				102	.endm
				103
				/*
				 * out_bs_ch - last stage of the sbox macro.
				 *
				 * An in-place, XOR-only transformation of the eight registers b0-b7,
				 * applied to the result of inv_gf256. No scratch registers are used.
				 */
				104	.macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
				105	veor \b0, \b0, \b6
				106	veor \b1, \b1, \b4
				107	veor \b4, \b4, \b6
				108	veor \b2, \b2, \b0
				109	veor \b6, \b6, \b1
				110	veor \b1, \b1, \b5
				111	veor \b5, \b5, \b3
				112	veor \b3, \b3, \b7
				113	veor \b7, \b7, \b5
				114	veor \b2, \b2, \b5
				115	veor \b4, \b4, \b7
				116	.endm
				117
				/*
				 * inv_in_bs_ch - first stage of the inv_sbox macro.
				 *
				 * An in-place, XOR-only transformation of eight registers. Note that the
				 * formal parameters are declared in the order b6, b1, b2, b4, b7, b0, b3,
				 * b5, so the name used in the body is not the positional index of the
				 * argument.
				 */
				118	.macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
				119	veor \b1, \b1, \b7
				120	veor \b4, \b4, \b7
				121	veor \b7, \b7, \b5
				122	veor \b1, \b1, \b3
				123	veor \b2, \b2, \b5
				124	veor \b3, \b3, \b7
				125	veor \b6, \b6, \b1
				126	veor \b2, \b2, \b0
				127	veor \b5, \b5, \b3
				128	veor \b4, \b4, \b6
				129	veor \b0, \b0, \b6
				130	veor \b1, \b1, \b4
				131	.endm
				132
				/*
				 * inv_out_bs_ch - last stage of the inv_sbox macro.
				 *
				 * An in-place, XOR-only transformation of eight registers, applied to the
				 * result of inv_gf256. As with inv_in_bs_ch, the formal parameters are
				 * declared in a permuted order (b6, b5, b0, b3, b7, b1, b4, b2).
				 */
				133	.macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
				134	veor \b1, \b1, \b5
				135	veor \b2, \b2, \b7
				136	veor \b3, \b3, \b1
				137	veor \b4, \b4, \b5
				138	veor \b7, \b7, \b5
				139	veor \b3, \b3, \b4
				140	veor \b5, \b5, \b0
				141	veor \b3, \b3, \b7
				142	veor \b6, \b6, \b2
				143	veor \b2, \b2, \b1
				144	veor \b6, \b6, \b3
				145	veor \b3, \b3, \b0
				146	veor \b5, \b5, \b6
				147	.endm
				148
				/*
				 * mul_gf4 - combine the pair (x0, x1) with the pair (y0, y1) in place.
				 *
				 * With all values taken before the macro runs, the results are:
				 *   x0 = ((x0 ^ x1) & y1) ^ (x1 & y0)
				 *   x1 = (x1 & y0) ^ ((y0 ^ y1) & x0)
				 * y0 and y1 are left unchanged; t0 and t1 are clobbered.
				 * (The name suggests a bit-sliced multiplication in GF(4).)
				 */
				149	.macro mul_gf4, x0, x1, y0, y1, t0, t1
				150	veor \t0, \y0, \y1
				151	vand \t0, \t0, \x0
				152	veor \x0, \x0, \x1
				153	vand \t1, \x1, \y0
				154	vand \x0, \x0, \y1
				155	veor \x1, \t1, \t0
				156	veor \x0, \x0, \t1
				157	.endm
				158
				/*
				 * mul_gf4_n_gf4 - two independent pair products, interleaved.
				 *
				 * With all values taken before the macro runs, the results are:
				 *   x0 = ((x0 ^ x1) & y1) ^ ((y0 ^ y1) & x0)
				 *   x1 = (x1 & y0) ^ ((x0 ^ x1) & y1)
				 *   x2 = ((x2 ^ x3) & y3) ^ (x3 & y2)
				 *   x3 = (x3 & y2) ^ ((y2 ^ y3) & x2)
				 * The (x2, x3) result has the same form as mul_gf4; the (x0, x1) result
				 * is a different combination of the same three partial products.
				 * y0-y3 are left unchanged; t0 and t1 are clobbered.
				 */
				159	.macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
				160	veor \t0, \y0, \y1
				161	veor \t1, \y2, \y3
				162	vand \t0, \t0, \x0
				163	vand \t1, \t1, \x2
				164	veor \x0, \x0, \x1
				165	veor \x2, \x2, \x3
				166	vand \x1, \x1, \y0
				167	vand \x3, \x3, \y2
				168	vand \x0, \x0, \y1
				169	vand \x2, \x2, \y3
				170	veor \x1, \x1, \x0
				171	veor \x2, \x2, \x3
				172	veor \x0, \x0, \t0
				173	veor \x3, \x3, \t1
				174	.endm
				175
				/*
				 * mul_gf16_2 - combine (x0-x3) and (x4-x7) with the same (y0-y3), in place.
				 *
				 * Built from mul_gf4 and mul_gf4_n_gf4. y0 and y1 are XORed with y2 and
				 * y3 for the middle part and XORed again afterwards, which restores their
				 * original values before the final mul_gf4. t0-t3 are clobbered.
				 */
				176	.macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				177	y0, y1, y2, y3, t0, t1, t2, t3
				178	veor \t0, \x0, \x2
				179	veor \t1, \x1, \x3
				180	mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
				181	veor \y0, \y0, \y2
				182	veor \y1, \y1, \y3
				183	mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
				184	veor \x0, \x0, \t0
				185	veor \x2, \x2, \t0
				186	veor \x1, \x1, \t1
				187	veor \x3, \x3, \t1
				/* second half: same y operands (y0/y1 still hold y0^y2 / y1^y3) */
				188	veor \t0, \x4, \x6
				189	veor \t1, \x5, \x7
				190	mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
				191	veor \y0, \y0, \y2
				192	veor \y1, \y1, \y3
				193	mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
				194	veor \x4, \x4, \t0
				195	veor \x6, \x6, \t0
				196	veor \x5, \x5, \t1
				197	veor \x7, \x7, \t1
				198	.endm
				199
				/*
				 * inv_gf256 - middle stage of both sbox and inv_sbox.
				 *
				 * Transforms x0-x7 in place. t0-t3 and s0-s3 are scratch registers: they
				 * are written before being read and their final contents are not
				 * meaningful to the caller. The macro first derives four values
				 * (s3, s2, s1, t1) from x0-x7 using AND/OR/XOR/select operations and then
				 * passes them as the y operands of mul_gf16_2, which updates x0-x7.
				 * (The name suggests a bit-sliced inversion in GF(2^8).)
				 */
				200	.macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				201	t0, t1, t2, t3, s0, s1, s2, s3
				202	veor \t3, \x4, \x6
				203	veor \t0, \x5, \x7
				204	veor \t1, \x1, \x3
				205	veor \s1, \x7, \x6
				206	veor \s0, \x0, \x2
				207	veor \s3, \t3, \t0
				208	vorr \t2, \t0, \t1
				209	vand \s2, \t3, \s0
				210	vorr \t3, \t3, \s0
				211	veor \s0, \s0, \t1
				212	vand \t0, \t0, \t1
				213	veor \t1, \x3, \x2
				214	vand \s3, \s3, \s0
				215	vand \s1, \s1, \t1
				216	veor \t1, \x4, \x5
				217	veor \s0, \x1, \x0
				218	veor \t3, \t3, \s1
				219	veor \t2, \t2, \s1
				220	vand \s1, \t1, \s0
				221	vorr \t1, \t1, \s0
				222	veor \t3, \t3, \s3
				223	veor \t0, \t0, \s1
				224	veor \t2, \t2, \s2
				225	veor \t1, \t1, \s3
				226	veor \t0, \t0, \s2
				227	vand \s0, \x7, \x3
				228	veor \t1, \t1, \s2
				229	vand \s1, \x6, \x2
				230	vand \s2, \x5, \x1
				231	vorr \s3, \x4, \x0
				232	veor \t3, \t3, \s0
				233	veor \t1, \t1, \s2
				234	veor \s0, \t0, \s3
				235	veor \t2, \t2, \s1
				236	vand \s2, \t3, \t1
				237	veor \s1, \t2, \s2
				238	veor \s3, \s0, \s2
				/* vbsl uses its destination as the bit-select mask */
				239	vbsl \s1, \t1, \s0
				240	vmvn \t0, \s0
				241	vbsl \s0, \s1, \s3
				242	vbsl \t0, \s1, \s3
				243	vbsl \s3, \t3, \t2
				244	veor \t3, \t3, \t2
				245	vand \s2, \s0, \s3
				246	veor \t1, \t1, \t0
				247	veor \s2, \s2, \t3
				248	mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
				249	\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
				250	.endm
				251
				/*
				 * sbox - apply in_bs_ch, inv_gf256 and out_bs_ch to b0-b7 in place.
				 *
				 * t0-t3 and s0-s3 are scratch. Each stage is invoked with the registers
				 * in a different order, so results do not end up in their input
				 * positions: aesbs_encrypt8 continues after sbox with the order
				 * b0, b1, b4, b6, b3, b7, b2, b5.
				 */
				252	.macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				253	t0, t1, t2, t3, s0, s1, s2, s3
				254	in_bs_ch \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
				255	inv_gf256 \b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
				256	\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
				257	out_bs_ch \b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
				258	.endm
				259
				/*
				 * inv_sbox - apply inv_in_bs_ch, inv_gf256 and inv_out_bs_ch to b0-b7 in
				 * place.
				 *
				 * t0-t3 and s0-s3 are scratch. As with sbox, results end up in permuted
				 * positions: aesbs_decrypt8 continues after inv_sbox with the order
				 * b0, b1, b6, b4, b2, b7, b3, b5.
				 */
				260	.macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				261	t0, t1, t2, t3, s0, s1, s2, s3
				262	inv_in_bs_ch \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
				263	inv_gf256 \b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
				264	\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
				265	inv_out_bs_ch \b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
				266	.endm
				267
				/*
				 * shift_rows - XOR the next round key into x0-x7, then byte-permute each.
				 *
				 * Eight 16-byte vectors are loaded from [bskey] in four 32-byte
				 * post-incrementing loads (so bskey advances by 128 bytes and must be
				 * 32-byte aligned, per the :256 hint). Vector i is XORed with xi into a
				 * temporary, and xi is then set to that temporary permuted by \mask via
				 * __tbl. t0-t3 are clobbered. Loads, XORs and lookups are interleaved.
				 */
				268	.macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				269	t0, t1, t2, t3, mask
				270	vld1.8 {\t0-\t1}, [bskey, :256]!
				271	veor \t0, \t0, \x0
				272	vld1.8 {\t2-\t3}, [bskey, :256]!
				273	veor \t1, \t1, \x1
				274	__tbl \x0, \t0, \mask
				275	veor \t2, \t2, \x2
				276	__tbl \x1, \t1, \mask
				277	vld1.8 {\t0-\t1}, [bskey, :256]!
				278	veor \t3, \t3, \x3
				279	__tbl \x2, \t2, \mask
				280	__tbl \x3, \t3, \mask
				281	vld1.8 {\t2-\t3}, [bskey, :256]!
				282	veor \t0, \t0, \x4
				283	veor \t1, \t1, \x5
				284	__tbl \x4, \t0, \mask
				285	veor \t2, \t2, \x6
				286	__tbl \x5, \t1, \mask
				287	veor \t3, \t3, \x7
				288	__tbl \x6, \t2, \mask
				289	__tbl \x7, \t3, \mask
				290	.endm
				291
				/*
				 * inv_shift_rows - byte-permute each of x0-x7 in place by \mask.
				 *
				 * Source and destination of every __tbl are the same register, so a
				 * temporary (t0-t3, reused for x4-x7) is passed to hold the table copy.
				 * Unlike shift_rows, no key material is loaded here.
				 */
				292	.macro inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				293	t0, t1, t2, t3, mask
				294	__tbl \x0, \x0, \mask, \t0
				295	__tbl \x1, \x1, \mask, \t1
				296	__tbl \x2, \x2, \mask, \t2
				297	__tbl \x3, \x3, \mask, \t3
				298	__tbl \x4, \x4, \mask, \t0
				299	__tbl \x5, \x5, \mask, \t1
				300	__tbl \x6, \x6, \mask, \t2
				301	__tbl \x7, \x7, \mask, \t3
				302	.endm
				303
				/*
				 * mix_cols - column mixing step on the eight bit-sliced registers x0-x7.
				 *
				 * Built only from byte rotations of each 128-bit register (vext.8 by 12
				 * and by 8 bytes) and XORs; t0-t7 are scratch.
				 *
				 * \inv selects where the final values are written. When it is blank, the
				 * first tail is used. When it is non-blank (inv_mix_cols passes 1), the
				 * second tail stores the same kind of results into a different
				 * assignment of x2, x3, x4 and x6.
				 *
				 * Only NEON instructions are used, so the condition flags set by the
				 * caller before invoking the macro are still valid afterwards.
				 */
				304	.macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				305	t0, t1, t2, t3, t4, t5, t6, t7, inv
				306	vext.8 \t0, \x0, \x0, #12
				307	vext.8 \t1, \x1, \x1, #12
				308	veor \x0, \x0, \t0
				309	vext.8 \t2, \x2, \x2, #12
				310	veor \x1, \x1, \t1
				311	vext.8 \t3, \x3, \x3, #12
				312	veor \x2, \x2, \t2
				313	vext.8 \t4, \x4, \x4, #12
				314	veor \x3, \x3, \t3
				315	vext.8 \t5, \x5, \x5, #12
				316	veor \x4, \x4, \t4
				317	vext.8 \t6, \x6, \x6, #12
				318	veor \x5, \x5, \t5
				319	vext.8 \t7, \x7, \x7, #12
				320	veor \x6, \x6, \t6
				321	veor \t1, \t1, \x0
				322	veor.8 \x7, \x7, \t7
				323	vext.8 \x0, \x0, \x0, #8
				324	veor \t2, \t2, \x1
				325	veor \t0, \t0, \x7
				326	veor \t1, \t1, \x7
				327	vext.8 \x1, \x1, \x1, #8
				328	veor \t5, \t5, \x4
				329	veor \x0, \x0, \t0
				330	veor \t6, \t6, \x5
				331	veor \x1, \x1, \t1
				332	vext.8 \t0, \x4, \x4, #8
				333	veor \t4, \t4, \x3
				334	vext.8 \t1, \x5, \x5, #8
				335	veor \t7, \t7, \x6
				336	vext.8 \x4, \x3, \x3, #8
				337	veor \t3, \t3, \x2
				338	vext.8 \x5, \x7, \x7, #8
				339	veor \t4, \t4, \x7
				340	vext.8 \x3, \x6, \x6, #8
				341	veor \t3, \t3, \x7
				342	vext.8 \x6, \x2, \x2, #8
				343	veor \x7, \t1, \t5
				/* tail for a blank \inv (the caller in this file is aesbs_encrypt8) */
				344	.ifb \inv
				345	veor \x2, \t0, \t4
				346	veor \x4, \x4, \t3
				347	veor \x5, \x5, \t7
				348	veor \x3, \x3, \t6
				349	veor \x6, \x6, \t2
				/* tail for a non-blank \inv (the caller in this file is inv_mix_cols) */
				350	.else
				351	veor \t3, \t3, \x4
				352	veor \x5, \x5, \t7
				353	veor \x2, \x3, \t6
				354	veor \x3, \t0, \t4
				355	veor \x4, \x6, \t2
				356	vmov \x6, \t3
				357	.endif
				358	.endm
				359
				/*
				 * inv_mix_cols - XOR a round key into x0-x7, then apply the inverse
				 * column mixing step.
				 *
				 * Eight 16-byte vectors are loaded from [bskey] into t0-t7 and XORed into
				 * x0-x7. The first three loads post-increment bskey by 32 bytes each, the
				 * fourth does not, and bskey is then reduced by 224: the net effect is
				 * bskey -= 128, so the next invocation reads the preceding 128-byte
				 * block.
				 *
				 * The state is then pre-processed with XORs of 8-byte rotations
				 * (vext.8 #8) and handed to mix_cols with a non-blank 'inv' argument.
				 * t0-t7 are clobbered. The condition flags are preserved, because only
				 * 'sub' (not 'subs') and NEON instructions are used.
				 */
				360	.macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				361	t0, t1, t2, t3, t4, t5, t6, t7
				362	vld1.8 {\t0-\t1}, [bskey, :256]!
				363	veor \x0, \x0, \t0
				364	vld1.8 {\t2-\t3}, [bskey, :256]!
				365	veor \x1, \x1, \t1
				366	vld1.8 {\t4-\t5}, [bskey, :256]!
				367	veor \x2, \x2, \t2
				368	vld1.8 {\t6-\t7}, [bskey, :256]
				369	sub bskey, bskey, #224
				370	veor \x3, \x3, \t3
				371	veor \x4, \x4, \t4
				372	veor \x5, \x5, \t5
				373	veor \x6, \x6, \t6
				374	veor \x7, \x7, \t7
				375	vext.8 \t0, \x0, \x0, #8
				376	vext.8 \t6, \x6, \x6, #8
				377	vext.8 \t7, \x7, \x7, #8
				378	veor \t0, \t0, \x0
				379	vext.8 \t1, \x1, \x1, #8
				380	veor \t6, \t6, \x6
				381	vext.8 \t2, \x2, \x2, #8
				382	veor \t7, \t7, \x7
				383	vext.8 \t3, \x3, \x3, #8
				384	veor \t1, \t1, \x1
				385	vext.8 \t4, \x4, \x4, #8
				386	veor \t2, \t2, \x2
				387	vext.8 \t5, \x5, \x5, #8
				388	veor \t3, \t3, \x3
				389	veor \t4, \t4, \x4
				390	veor \t5, \t5, \x5
				391	veor \x0, \x0, \t6
				392	veor \x1, \x1, \t6
				393	veor \x2, \x2, \t0
				394	veor \x4, \x4, \t2
				395	veor \x3, \x3, \t1
				396	veor \x1, \x1, \t7
				397	veor \x2, \x2, \t7
				398	veor \x4, \x4, \t6
				399	veor \x5, \x5, \t3
				400	veor \x3, \x3, \t6
				401	veor \x6, \x6, \t4
				402	veor \x4, \x4, \t7
				403	veor \x5, \x5, \t7
				404	veor \x7, \x7, \t5
				405	mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
				406	\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
				407	.endm
				408
				/*
				 * swapmove_2x - masked bit exchange between two register pairs at once.
				 *
				 * For each pair (a, b) = (a0, b0) and (a1, b1), with shifts applied per
				 * 64-bit lane:
				 *   t  = ((b >> n) ^ a) & mask
				 *   a ^= t
				 *   b ^= t << n
				 * This swaps the bits of a selected by mask with the bits of b that sit
				 * n positions higher. t0 and t1 are clobbered; the two pairs are
				 * interleaved instruction by instruction.
				 */
				409	.macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
				410	vshr.u64 \t0, \b0, #\n
				411	vshr.u64 \t1, \b1, #\n
				412	veor \t0, \t0, \a0
				413	veor \t1, \t1, \a1
				414	vand \t0, \t0, \mask
				415	vand \t1, \t1, \mask
				416	veor \a0, \a0, \t0
				417	vshl.s64 \t0, \t0, #\n
				418	veor \a1, \a1, \t1
				419	vshl.s64 \t1, \t1, #\n
				420	veor \b0, \b0, \t0
				421	veor \b1, \b1, \t1
				422	.endm
				423
				/*
				 * bitslice - transpose bits across the eight registers x0-x7.
				 *
				 * Three rounds of swapmove_2x: distance 1 with mask 0x55, distance 2 with
				 * mask 0x33 and distance 4 with mask 0x0f (all byte-replicated). Note
				 * that the formal parameters are declared in descending order
				 * (x7 first). t0-t3 are clobbered: t0 and t1 hold the masks, t2 and t3
				 * are the swapmove temporaries. t0 is reloaded with 0x0f once the 0x55
				 * mask is no longer needed.
				 */
				424	.macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
				425	vmov.i8 \t0, #0x55
				426	vmov.i8 \t1, #0x33
				427	swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
				428	swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
				429	vmov.i8 \t0, #0x0f
				430	swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
				431	swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
				432	swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
				433	swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
				434	.endm
				435
				/* Byte permutation applied to each inner round key before bit expansion. */
				436	.align 4
				437	M0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d
				438
				439	/*
				440	* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
				441	*/
				/*
				 * In:  r0 = out, r1 = rk, r2 = rounds
				 *
				 * Output layout written through r0:
				 *   16 bytes            round 0 key, copied unchanged
				 *   (rounds - 1) * 128  one block per inner round key: the key is permuted
				 *                       by M0, then each bit position 0..7 is expanded
				 *                       into its own 16-byte vector (a byte is 0xff where
				 *                       that bit is set). Vectors 0, 1, 5 and 6 are
				 *                       stored inverted; these are the bits set in 0x63.
				 *   16 bytes            last round key XORed with 0x63 in every byte
				 *
				 * r0 must be 16-byte aligned on entry (:128 hint on the first store); the
				 * loop stores then carry a :256 hint.
				 * Modifies r0-r2 and q0-q15 (no NEON register is saved here).
				 */
				442	ENTRY(aesbs_convert_key)
				443	vld1.32 {q7}, [r1]! // load round 0 key
				444	vld1.32 {q15}, [r1]! // load round 1 key
				445
				446	vmov.i8 q8, #0x01 // bit masks
				447	vmov.i8 q9, #0x02
				448	vmov.i8 q10, #0x04
				449	vmov.i8 q11, #0x08
				450	vmov.i8 q12, #0x10
				451	vmov.i8 q13, #0x20
				452	__ldr q14, M0
				453
				/* the loop below runs rounds - 1 times */
				454	sub r2, r2, #1
				455	vst1.8 {q7}, [r0, :128]! // save round 0 key
				456
				457	.Lkey_loop:
				458	__tbl q7, q15, q14
				/* q6 and q15 are reused every iteration, so their masks are rebuilt here */
				459	vmov.i8 q6, #0x40
				460	vmov.i8 q15, #0x80
				461
				/* vtst sets a byte to 0xff where the tested bit is set in q7 */
				462	vtst.8 q0, q7, q8
				463	vtst.8 q1, q7, q9
				464	vtst.8 q2, q7, q10
				465	vtst.8 q3, q7, q11
				466	vtst.8 q4, q7, q12
				467	vtst.8 q5, q7, q13
				468	vtst.8 q6, q7, q6
				469	vtst.8 q7, q7, q15
				470	vld1.32 {q15}, [r1]! // load next round key
				/* invert bit slices 0, 1, 5 and 6 */
				471	vmvn q0, q0
				472	vmvn q1, q1
				473	vmvn q5, q5
				474	vmvn q6, q6
				475
				/* the NEON stores do not touch the flags set here; bne tests them */
				476	subs r2, r2, #1
				477	vst1.8 {q0-q1}, [r0, :256]!
				478	vst1.8 {q2-q3}, [r0, :256]!
				479	vst1.8 {q4-q5}, [r0, :256]!
				480	vst1.8 {q6-q7}, [r0, :256]!
				481	bne .Lkey_loop
				482
				/* q15 now holds the last round key, loaded by the final iteration */
				483	vmov.i8 q7, #0x63 // compose .L63
				484	veor q15, q15, q7
				485	vst1.8 {q15}, [r0, :128]
				486	bx lr
				487	ENDPROC(aesbs_convert_key)
				488
				/* Byte permutation applied to the input blocks after the round 0 key XOR. */
				489	.align 4
				490	M0SR: .quad 0x0a0e02060f03070b, 0x0004080c05090d01
				491
				/*
				 * aesbs_encrypt8 - file-local routine that encrypts eight blocks at once.
				 *
				 * In:   q0-q7  = input blocks
				 *       bskey  = key schedule in the layout written by aesbs_convert_key
				 *       rounds = number of rounds
				 * Out:  results in q0, q1, q4, q6, q3, q7, q2, q5 (the order in which the
				 *       callers in this file store them)
				 * Modifies q8-q15, bskey, rounds and the flags. Returns with bx lr.
				 */
				492	aesbs_encrypt8:
				493	vld1.8 {q9}, [bskey, :128]! // round 0 key
				494	__ldr q8, M0SR
				495
				496	veor q10, q0, q9 // xor with round0 key
				497	veor q11, q1, q9
				498	__tbl q0, q10, q8
				499	veor q12, q2, q9
				500	__tbl q1, q11, q8
				501	veor q13, q3, q9
				502	__tbl q2, q12, q8
				503	veor q14, q4, q9
				504	__tbl q3, q13, q8
				505	veor q15, q5, q9
				506	__tbl q4, q14, q8
				507	veor q10, q6, q9
				508	__tbl q5, q15, q8
				509	veor q11, q7, q9
				510	__tbl q6, q10, q8
				511	__tbl q7, q11, q8
				512
				513	bitslice q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
				514
				/* the first pass enters at the sbox; shift_rows only runs from pass two */
				515	sub rounds, rounds, #1
				516	b .Lenc_sbox
				517
				/* permutation masks for shift_rows: SR in the loop, SRM0 on the last pass */
				518	.align 5
				519	SR: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
				520	SRM0: .quad 0x0304090e00050a0f, 0x01060b0c0207080d
				521
				522	.Lenc_last:
				523	__ldr q12, SRM0
				524	.Lenc_loop:
				525	shift_rows q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
				526	.Lenc_sbox:
				527	sbox q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
				528	q13, q14, q15
				/* carry clear means the counter was already 0: no rounds are left */
				529	subs rounds, rounds, #1
				530	bcc .Lenc_done
				531
				532	mix_cols q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
				533	q13, q14, q15
				534
				/* flags still come from the subs above (mix_cols is NEON only) */
				535	beq .Lenc_last
				536	__ldr q12, SR
				537	b .Lenc_loop
				538
				539	.Lenc_done:
				540	vld1.8 {q12}, [bskey, :128] // last round key
				541
				/* undo the bit-sliced representation, then add the final key */
				542	bitslice q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
				543
				544	veor q0, q0, q12
				545	veor q1, q1, q12
				546	veor q4, q4, q12
				547	veor q6, q6, q12
				548	veor q3, q3, q12
				549	veor q7, q7, q12
				550	veor q2, q2, q12
				551	veor q5, q5, q12
				552	bx lr
				553	ENDPROC(aesbs_encrypt8)
				554
				/* Byte permutation applied to the input blocks after the first key XOR. */
				555	.align 4
				556	M0ISR: .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
				557
				/*
				 * aesbs_decrypt8 - file-local routine that decrypts eight blocks at once.
				 *
				 * In:   q0-q7  = input blocks
				 *       bskey  = start of the key schedule written by aesbs_convert_key
				 *       rounds = number of rounds
				 * Out:  results in q0, q1, q6, q4, q2, q7, q3, q5 (the order in which the
				 *       callers in this file store them)
				 * Modifies q8-q15, bskey, rounds and the flags. Returns with bx lr.
				 *
				 * The schedule is walked backwards. The first key used sits at offset
				 * rounds * 128 - 112, which is the final 16-byte entry of the layout
				 * written by aesbs_convert_key. bskey is then moved to the last 128-byte
				 * inner block, and each inv_mix_cols steps it back by another 128 bytes.
				 */
				558	aesbs_decrypt8:
				559	add bskey, bskey, rounds, lsl #7
				560	sub bskey, bskey, #112
				561	vld1.8 {q9}, [bskey, :128] // round 0 key
				562	sub bskey, bskey, #128
				563	__ldr q8, M0ISR
				564
				565	veor q10, q0, q9 // xor with round0 key
				566	veor q11, q1, q9
				567	__tbl q0, q10, q8
				568	veor q12, q2, q9
				569	__tbl q1, q11, q8
				570	veor q13, q3, q9
				571	__tbl q2, q12, q8
				572	veor q14, q4, q9
				573	__tbl q3, q13, q8
				574	veor q15, q5, q9
				575	__tbl q4, q14, q8
				576	veor q10, q6, q9
				577	__tbl q5, q15, q8
				578	veor q11, q7, q9
				579	__tbl q6, q10, q8
				580	__tbl q7, q11, q8
				581
				582	bitslice q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
				583
				/* first pass enters at the inverse sbox */
				584	sub rounds, rounds, #1
				585	b .Ldec_sbox
				586
				/* masks for inv_shift_rows: ISR in the loop, ISRM0 on the last pass */
				587	.align 5
				588	ISR: .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
				589	ISRM0: .quad 0x01040b0e0205080f, 0x0306090c00070a0d
				590
				591	.Ldec_last:
				592	__ldr q12, ISRM0
				593	.Ldec_loop:
				594	inv_shift_rows q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
				595	.Ldec_sbox:
				596	inv_sbox q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
				597	q13, q14, q15
				/* carry clear means the counter was already 0: no rounds are left */
				598	subs rounds, rounds, #1
				599	bcc .Ldec_done
				600
				601	inv_mix_cols q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
				602	q13, q14, q15
				603
				/* flags still come from the subs above (inv_mix_cols preserves them) */
				604	beq .Ldec_last
				605	__ldr q12, ISR
				606	b .Ldec_loop
				607
				608	.Ldec_done:
				/* bskey is 112 bytes below the start of the schedule at this point */
				609	add bskey, bskey, #112
				610	vld1.8 {q12}, [bskey, :128] // last round key
				611
				612	bitslice q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
				613
				614	veor q0, q0, q12
				615	veor q1, q1, q12
				616	veor q6, q6, q12
				617	veor q4, q4, q12
				618	veor q2, q2, q12
				619	veor q7, q7, q12
				620	veor q3, q3, q12
				621	veor q5, q5, q12
				622	bx lr
				623	ENDPROC(aesbs_decrypt8)
				624
				625	/*
				626	* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
				627	* int blocks)
				628	* aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
				629	* int blocks)
				630	*/
				/*
				 * __ecb_crypt - body shared by the two ECB entry points.
				 *
				 * \do8    routine processing q0-q7 (aesbs_encrypt8 or aesbs_decrypt8)
				 * \o0-\o7 registers holding the results of \do8, in block order
				 *
				 * In: r0 = out, r1 = in, r2 = rk, r3 = rounds, [sp] = blocks.
				 * r5 counts the remaining blocks; each pass handles up to eight.
				 *
				 * Partial batches use computed gotos: when fewer than eight blocks
				 * remain, execution jumps (blocks & 7) instructions before the local
				 * label, so only the last (blocks & 7) loads or stores of the sequence
				 * run. This depends on every load and store in those sequences being
				 * exactly 4 bytes long.
				 *
				 * NOTE(review): r6 is saved and restored but not otherwise referenced in
				 * this macro; presumably it keeps the number of pushed registers even.
				 * Confirm before changing the push/pop lists.
				 */
				631	.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
				632	push {r4-r6, lr}
				633	ldr r5, [sp, #16] // number of blocks
				634
				635	99: __adr ip, 0f
				636	and lr, r5, #7
				637	cmp r5, #8
				638	sub ip, ip, lr, lsl #2
				639	bxlt ip // computed goto if blocks < 8
				640
				641	vld1.8 {q0}, [r1]!
				642	vld1.8 {q1}, [r1]!
				643	vld1.8 {q2}, [r1]!
				644	vld1.8 {q3}, [r1]!
				645	vld1.8 {q4}, [r1]!
				646	vld1.8 {q5}, [r1]!
				647	vld1.8 {q6}, [r1]!
				648	vld1.8 {q7}, [r1]!
				649
				/* \do8 modifies bskey and rounds, so both are reloaded for every batch */
				650	0: mov bskey, r2
				651	mov rounds, r3
				652	bl \do8
				653
				654	__adr ip, 1f
				655	and lr, r5, #7
				656	cmp r5, #8
				657	sub ip, ip, lr, lsl #2
				658	bxlt ip // computed goto if blocks < 8
				659
				660	vst1.8 {\o0}, [r0]!
				661	vst1.8 {\o1}, [r0]!
				662	vst1.8 {\o2}, [r0]!
				663	vst1.8 {\o3}, [r0]!
				664	vst1.8 {\o4}, [r0]!
				665	vst1.8 {\o5}, [r0]!
				666	vst1.8 {\o6}, [r0]!
				667	vst1.8 {\o7}, [r0]!
				668
				669	1: subs r5, r5, #8
				670	bgt 99b
				671
				672	pop {r4-r6, pc}
				673	.endm
				674
				/*
				 * ECB entry points. Each expands __ecb_crypt with the matching 8-block
				 * routine and the register order in which that routine leaves its
				 * results.
				 */
				675	.align 4
				676	ENTRY(aesbs_ecb_encrypt)
				677	__ecb_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
				678	ENDPROC(aesbs_ecb_encrypt)
				679
				680	.align 4
				681	ENTRY(aesbs_ecb_decrypt)
				682	__ecb_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
				683	ENDPROC(aesbs_ecb_decrypt)
				684
				685	/*
				686	* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
				687	* int rounds, int blocks, u8 iv[])
				688	*/
				/*
				 * In: r0 = out, r1 = in, r2 = rk, r3 = rounds, [sp] = blocks,
				 *     [sp, #4] = iv.
				 * r5 counts the remaining blocks and r6 holds the iv pointer.
				 *
				 * Each pass decrypts up to eight blocks, XORs block 0 with the current
				 * iv and every later block with the preceding ciphertext block, and
				 * writes the last ciphertext block of the pass back to [r6] as the next
				 * iv.
				 *
				 * The input is read twice: first through lr (a copy of r1) to load the
				 * blocks to decrypt, then through r1 itself to fetch the same ciphertext
				 * as chaining values.
				 *
				 * Partial batches (fewer than eight blocks left) use computed gotos into
				 * the load/xor/store sequences, which therefore rely on fixed 4-byte
				 * instructions. See the notes at labels 1 and 2.
				 */
				689	.align 4
				690	ENTRY(aesbs_cbc_decrypt)
				691	mov ip, sp
				692	push {r4-r6, lr}
				693	ldm ip, {r5-r6} // load args 4-5
				694
				695	99: __adr ip, 0f
				696	and lr, r5, #7
				697	cmp r5, #8
				698	sub ip, ip, lr, lsl #2
				699	mov lr, r1
				700	bxlt ip // computed goto if blocks < 8
				701
				702	vld1.8 {q0}, [lr]!
				703	vld1.8 {q1}, [lr]!
				704	vld1.8 {q2}, [lr]!
				705	vld1.8 {q3}, [lr]!
				706	vld1.8 {q4}, [lr]!
				707	vld1.8 {q5}, [lr]!
				708	vld1.8 {q6}, [lr]!
				709	vld1.8 {q7}, [lr]
				710
				711	0: mov bskey, r2
				712	mov rounds, r3
				713	bl aesbs_decrypt8
				714
				/*
				 * Seed all eight chaining registers with the iv. On a partial batch the
				 * first block processed may line up with any of q8-q15.
				 */
				715	vld1.8 {q8}, [r6]
				716	vmov q9, q8
				717	vmov q10, q8
				718	vmov q11, q8
				719	vmov q12, q8
				720	vmov q13, q8
				721	vmov q14, q8
				722	vmov q15, q8
				723
				724	__adr ip, 1f
				725	and lr, r5, #7
				726	cmp r5, #8
				727	sub ip, ip, lr, lsl #2
				728	bxlt ip // computed goto if blocks < 8
				729
				730	vld1.8 {q9}, [r1]!
				731	vld1.8 {q10}, [r1]!
				732	vld1.8 {q11}, [r1]!
				733	vld1.8 {q12}, [r1]!
				734	vld1.8 {q13}, [r1]!
				735	vld1.8 {q14}, [r1]!
				736	vld1.8 {q15}, [r1]!
				/*
				 * The nop fills an eighth 4-byte slot before label 1, so a jump of
				 * (blocks & 7) slots executes one load fewer than there are blocks.
				 */
				737	W(nop)
				738
				/* flags are still those of the cmp above; here each block takes 8 bytes */
				739	1: __adr ip, 2f
				740	sub ip, ip, lr, lsl #3
				741	bxlt ip // computed goto if blocks < 8
				742
				743	veor q0, q0, q8
				744	vst1.8 {q0}, [r0]!
				745	veor q1, q1, q9
				746	vst1.8 {q1}, [r0]!
				747	veor q6, q6, q10
				748	vst1.8 {q6}, [r0]!
				749	veor q4, q4, q11
				750	vst1.8 {q4}, [r0]!
				751	veor q2, q2, q12
				752	vst1.8 {q2}, [r0]!
				753	veor q7, q7, q13
				754	vst1.8 {q7}, [r0]!
				755	veor q3, q3, q14
				756	vst1.8 {q3}, [r0]!
				757	veor q5, q5, q15
				758	vld1.8 {q8}, [r1]! // load next round's iv
				/*
				 * Label 2 sits after the iv load so that the last veor/vld1 pair forms
				 * an 8-byte slot like the preceding veor/vst1 pairs.
				 */
				759	2: vst1.8 {q5}, [r0]!
				760
				761	subs r5, r5, #8
				762	vst1.8 {q8}, [r6] // store next round's iv
				763	bgt 99b
				764
				765	pop {r4-r6, pc}
				766	ENDPROC(aesbs_cbc_decrypt)
				767
				/*
				 * next_ctr - write the current counter into \q and advance the counter.
				 *
				 * The 128-bit counter is held in r7:r8:r9:r10, with r10 as the least
				 * significant word. Each word is moved into one 32-bit lane of \q
				 * (r7 -> low[0], r8 -> low[1], r9 -> high[0], r10 -> high[1]) while the
				 * counter is incremented by one with carry propagation from r10 up to
				 * r7. The final vrev32.8 byte-swaps every 32-bit word of \q.
				 *
				 * The lane moves carry an explicit .32 element size: GNU as 2.22 and
				 * older cannot infer it and reject the instruction with "bad type for
				 * scalar". The encoding is unchanged.
				 *
				 * The macro must stay at exactly nine 4-byte instructions: the computed
				 * goto in aesbs_ctr_encrypt steps over whole expansions using
				 * (n << 5) + (n << 2) = 36 * n bytes.
				 *
				 * Modifies r7-r10 and the flags.
				 */
				768	.macro next_ctr, q
				769	vmov.32 \q\()h[1], r10
				770	adds r10, r10, #1
				771	vmov.32 \q\()h[0], r9
				772	adcs r9, r9, #0
				773	vmov.32 \q\()l[1], r8
				774	adcs r8, r8, #0
				775	vmov.32 \q\()l[0], r7
				776	adc r7, r7, #0
				777	vrev32.8 \q, \q
				778	.endm
				779
				780	/*
				781	* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
				782	* int rounds, int blocks, u8 ctr[], bool final)
				783	*/
				/*
				 * In: r0 = out, r1 = in, r2 = rk, r3 = rounds, [sp] = blocks,
				 *     [sp, #4] = ctr, [sp, #8] = final.
				 *
				 * r5 = blocks + final (remaining count), r6 = ctr pointer.
				 * r7:r8:r9:r10 = counter value after vrev32.8, kept one ahead of the
				 * block held in q0 and advanced by next_ctr.
				 *
				 * Each pass encrypts up to eight counter blocks and XORs them with the
				 * input. When 'final' is set, the last keystream block of the last pass
				 * is not XORed with input: it is stored to [r6] at label 4 instead.
				 * Otherwise the next counter block is stored there.
				 *
				 * Partial batches use computed gotos, so the instruction sequences
				 * before labels 0, 1 and 3 must keep their exact sizes.
				 */
				784	ENTRY(aesbs_ctr_encrypt)
				785	mov ip, sp
				786	push {r4-r10, lr}
				787
				788	ldm ip, {r5-r7} // load args 4-6
				789	add r5, r5, r7 // one extra block if final == 1
				790
				791	vld1.8 {q0}, [r6] // load counter
				792	vrev32.8 q1, q0
				793	vmov r9, r10, d3
				794	vmov r7, r8, d2
				795
				/* 128-bit increment, least significant word (r10) first */
				796	adds r10, r10, #1
				797	adcs r9, r9, #0
				798	adcs r8, r8, #0
				799	adc r7, r7, #0
				800
				/* preload q1-q7 with q0 so that lanes skipped by the goto hold valid data */
				801	99: vmov q1, q0
				802	vmov q2, q0
				803	vmov q3, q0
				804	vmov q4, q0
				805	vmov q5, q0
				806	vmov q6, q0
				807	vmov q7, q0
				808
				/*
				 * Skip back ((r5 - 1) & 7) next_ctr expansions of 36 bytes each
				 * (lsl #5 plus lsl #2): q0 already holds the first counter block.
				 */
				809	__adr ip, 0f
				810	sub lr, r5, #1
				811	and lr, lr, #7
				812	cmp r5, #8
				813	sub ip, ip, lr, lsl #5
				814	sub ip, ip, lr, lsl #2
				815	bxlt ip // computed goto if blocks < 8
				816
				817	next_ctr q1
				818	next_ctr q2
				819	next_ctr q3
				820	next_ctr q4
				821	next_ctr q5
				822	next_ctr q6
				823	next_ctr q7
				824
				825	0: mov bskey, r2
				826	mov rounds, r3
				827	bl aesbs_encrypt8
				828
				/* r4 = 0 while more than 8 remain, else the 'final' argument */
				829	__adr ip, 1f
				830	and lr, r5, #7
				831	cmp r5, #8
				832	movgt r4, #0
				833	ldrle r4, [sp, #40] // load final in the last round
				834	sub ip, ip, lr, lsl #2
				835	bxlt ip // computed goto if blocks < 8
				836
				837	vld1.8 {q8}, [r1]!
				838	vld1.8 {q9}, [r1]!
				839	vld1.8 {q10}, [r1]!
				840	vld1.8 {q11}, [r1]!
				841	vld1.8 {q12}, [r1]!
				842	vld1.8 {q13}, [r1]!
				843	vld1.8 {q14}, [r1]!
				844	teq r4, #0 // skip last block if 'final'
				845	1: bne 2f
				846	vld1.8 {q15}, [r1]!
				847
				848	2: __adr ip, 3f
				849	cmp r5, #8
				850	sub ip, ip, lr, lsl #3
				851	bxlt ip // computed goto if blocks < 8
				852
				/* keystream comes back in the order q0, q1, q4, q6, q3, q7, q2, q5 */
				853	veor q0, q0, q8
				854	vst1.8 {q0}, [r0]!
				855	veor q1, q1, q9
				856	vst1.8 {q1}, [r0]!
				857	veor q4, q4, q10
				858	vst1.8 {q4}, [r0]!
				859	veor q6, q6, q11
				860	vst1.8 {q6}, [r0]!
				861	veor q3, q3, q12
				862	vst1.8 {q3}, [r0]!
				863	veor q7, q7, q13
				864	vst1.8 {q7}, [r0]!
				865	veor q2, q2, q14
				866	vst1.8 {q2}, [r0]!
				867	teq r4, #0 // skip last block if 'final'
				868	W(bne) 4f
				869	3: veor q5, q5, q15
				870	vst1.8 {q5}, [r0]!
				871
				/* q0 = first counter block of the next pass */
				872	next_ctr q0
				873
				874	subs r5, r5, #8
				875	bgt 99b
				876
				/* not 'final': the value written back to ctr[] is the next counter block */
				877	vmov q5, q0
				878
				879	4: vst1.8 {q5}, [r6]
				880	pop {r4-r10, pc}
				881	ENDPROC(aesbs_ctr_encrypt)
				882
				/*
				 * next_tweak - derive the next XTS tweak: \out = \in doubled, with
				 * feedback.
				 *
				 *   tmp = sign mask of each 64-bit lane of \in (all ones if its top bit
				 *         is set), ANDed with \const
				 *   out = \in + \in per 64-bit lane (a left shift by one)
				 *   tmp = tmp with its two 64-bit halves swapped
				 *   out ^= tmp
				 *
				 * With \const = .Lxts_mul_x (low half 1, high half 0x87) the low lane
				 * receives 0x87 when the top bit of the high lane was set, and the high
				 * lane receives the carry (1) out of the low lane. \tmp is clobbered.
				 */
				883	.macro next_tweak, out, in, const, tmp
				884	vshr.s64 \tmp, \in, #63
				885	vand \tmp, \tmp, \const
				886	vadd.u64 \out, \in, \in
				887	vext.8 \tmp, \tmp, \tmp, #8
				888	veor \out, \out, \tmp
				889	.endm
				890
				/* constant passed as \const to next_tweak */
				891	.align 4
				892	.Lxts_mul_x:
				893	.quad 1, 0x87
				894
				895	/*
				896	* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
				897	* int blocks, u8 iv[])
				898	* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
				899	* int blocks, u8 iv[])
				900	*/
				/*
				 * __xts_prepare8 - load up to eight blocks and XOR each with its tweak.
				 *
				 * In:  r1 = input pointer (advanced), r6 = blocks remaining,
				 *      r7 = iv pointer, sp = 16-byte aligned buffer with room for eight
				 *      tweaks (set up by __xts_crypt)
				 * Out: q0-q7 = input blocks XORed with their tweaks; the tweaks used are
				 *      stored consecutively from [sp]; the following tweak is written
				 *      back to [r7]
				 * Modifies r4, ip, q12-q15 and the flags.
				 *
				 * q12 and q14 alternate as current and next tweak. q12 starts as a copy
				 * of q14 because a partial batch may enter at a group that reads either.
				 *
				 * Each group below (load, next_tweak, xor, store) is eight 4-byte
				 * instructions, so the computed goto steps back 32 bytes per block.
				 */
				901	__xts_prepare8:
				902	vld1.8 {q14}, [r7] // load iv
				903	__ldr q15, .Lxts_mul_x // load tweak mask
				904	vmov q12, q14
				905
				906	__adr ip, 0f
				907	and r4, r6, #7
				908	cmp r6, #8
				909	sub ip, ip, r4, lsl #5
				910	mov r4, sp
				911	bxlt ip // computed goto if blocks < 8
				912
				913	vld1.8 {q0}, [r1]!
				914	next_tweak q12, q14, q15, q13
				915	veor q0, q0, q14
				916	vst1.8 {q14}, [r4, :128]!
				917
				918	vld1.8 {q1}, [r1]!
				919	next_tweak q14, q12, q15, q13
				920	veor q1, q1, q12
				921	vst1.8 {q12}, [r4, :128]!
				922
				923	vld1.8 {q2}, [r1]!
				924	next_tweak q12, q14, q15, q13
				925	veor q2, q2, q14
				926	vst1.8 {q14}, [r4, :128]!
				927
				928	vld1.8 {q3}, [r1]!
				929	next_tweak q14, q12, q15, q13
				930	veor q3, q3, q12
				931	vst1.8 {q12}, [r4, :128]!
				932
				933	vld1.8 {q4}, [r1]!
				934	next_tweak q12, q14, q15, q13
				935	veor q4, q4, q14
				936	vst1.8 {q14}, [r4, :128]!
				937
				938	vld1.8 {q5}, [r1]!
				939	next_tweak q14, q12, q15, q13
				940	veor q5, q5, q12
				941	vst1.8 {q12}, [r4, :128]!
				942
				943	vld1.8 {q6}, [r1]!
				944	next_tweak q12, q14, q15, q13
				945	veor q6, q6, q14
				946	vst1.8 {q14}, [r4, :128]!
				947
				948	vld1.8 {q7}, [r1]!
				949	next_tweak q14, q12, q15, q13
				950	veor q7, q7, q12
				951	vst1.8 {q12}, [r4, :128]
				952
				953	0: vst1.8 {q14}, [r7] // store next iv
				954	bx lr
				955	ENDPROC(__xts_prepare8)
				956
				/*
				 * __xts_crypt - body shared by the two XTS entry points.
				 *
				 * \do8    routine processing q0-q7 (aesbs_encrypt8 or aesbs_decrypt8)
				 * \o0-\o7 registers holding the results of \do8, in block order
				 *
				 * In: r0 = out, r1 = in, r2 = rk, r3 = rounds, [sp] = blocks,
				 *     [sp, #4] = iv.
				 * r5 preserves the incoming sp, r6 counts the remaining blocks and r7
				 * holds the iv pointer. A 128-byte, 16-byte aligned area is carved out
				 * below sp to hold the eight tweaks produced by __xts_prepare8.
				 *
				 * Per pass: __xts_prepare8 applies the tweaks to the input, \do8
				 * processes the blocks, and the stored tweaks are reloaded into q8-q15
				 * and XORed with the results before they are written to out. Partial
				 * batches use computed gotos (4 bytes per tweak load, 8 bytes per
				 * xor/store pair).
				 */
				957	.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
				958	push {r4-r8, lr}
				959	mov r5, sp // preserve sp
				960	ldrd r6, r7, [sp, #24] // get blocks and iv args
				961	sub ip, sp, #128 // make room for 8x tweak
				962	bic ip, ip, #0xf // align sp to 16 bytes
				963	mov sp, ip
				964
				965	99: bl __xts_prepare8
				966
				967	mov bskey, r2
				968	mov rounds, r3
				969	bl \do8
				970
				971	__adr ip, 0f
				972	and lr, r6, #7
				973	cmp r6, #8
				974	sub ip, ip, lr, lsl #2
				975	mov r4, sp
				976	bxlt ip // computed goto if blocks < 8
				977
				978	vld1.8 {q8}, [r4, :128]!
				979	vld1.8 {q9}, [r4, :128]!
				980	vld1.8 {q10}, [r4, :128]!
				981	vld1.8 {q11}, [r4, :128]!
				982	vld1.8 {q12}, [r4, :128]!
				983	vld1.8 {q13}, [r4, :128]!
				984	vld1.8 {q14}, [r4, :128]!
				985	vld1.8 {q15}, [r4, :128]
				986
				/* flags are still those of the cmp above */
				987	0: __adr ip, 1f
				988	sub ip, ip, lr, lsl #3
				989	bxlt ip // computed goto if blocks < 8
				990
				991	veor \o0, \o0, q8
				992	vst1.8 {\o0}, [r0]!
				993	veor \o1, \o1, q9
				994	vst1.8 {\o1}, [r0]!
				995	veor \o2, \o2, q10
				996	vst1.8 {\o2}, [r0]!
				997	veor \o3, \o3, q11
				998	vst1.8 {\o3}, [r0]!
				999	veor \o4, \o4, q12
				1000	vst1.8 {\o4}, [r0]!
				1001	veor \o5, \o5, q13
				1002	vst1.8 {\o5}, [r0]!
				1003	veor \o6, \o6, q14
				1004	vst1.8 {\o6}, [r0]!
				1005	veor \o7, \o7, q15
				1006	vst1.8 {\o7}, [r0]!
				1007
				1008	1: subs r6, r6, #8
				1009	bgt 99b
				1010
				/* drop the tweak buffer before restoring the saved registers */
				1011	mov sp, r5
				1012	pop {r4-r8, pc}
				1013	.endm
				1014
				/*
				 * XTS entry points. Each expands __xts_crypt with the matching 8-block
				 * routine and the register order in which that routine leaves its
				 * results.
				 */
				1015	ENTRY(aesbs_xts_encrypt)
				1016	__xts_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
				1017	ENDPROC(aesbs_xts_encrypt)
				1018
				1019	ENTRY(aesbs_xts_decrypt)
				1020	__xts_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
				1021	ENDPROC(aesbs_xts_decrypt)