/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction. The alternatives are, more or less:
 *
 * (a)  vshl.u32 + vsri.u32           (needs temporary register)
 * (b)  vshl.u32 + vshr.u32 + vorr    (needs temporary register)
 * (c)  vrev32.16                     (16-bit rotations only)
 * (d)  vtbl.8 + vtbl.8               (rotations by multiples of 8 bits only;
 *                                     needs index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri. Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */
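/*
 * For reference, this is what a left-rotate by 12 bits looks like with
 * method (a), using the same register choice as the quarter-round code
 * below (q4 holds the value to rotate, q1 receives the result):
 *
 *      vshl.u32        q1, q4, #12     // q1 = q4 << 12
 *      vsri.u32        q1, q4, #20     // shift-right-insert q4 >> (32 - 12)
 *                                      // into the low bits of q1
 */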

#include <linux/linkage.h>

        .text
        .fpu            neon
        .align          5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
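/*
 * Register layout note (as set up by the callers below): q0 holds state
 * words 0-3, q1 holds words 4-7, q2 holds words 8-11, and q3 holds words
 * 12-15, i.e. one row of the 4x4 state matrix per register.
 */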
chacha_permute:

        adr             ip, .Lrol8_table
        vld1.8          {d10}, [ip, :64]

.Ldoubleround:
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vrev32.16       q3, q3

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #12
        vsri.u32        q1, q4, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vtbl.8          d6, {d6}, d10
        vtbl.8          d7, {d7}, d10

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #7
        vsri.u32        q1, q4, #25

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vext.8          q1, q1, q1, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vext.8          q2, q2, q2, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vext.8          q3, q3, q3, #12

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vrev32.16       q3, q3

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #12
        vsri.u32        q1, q4, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vadd.i32        q0, q0, q1
        veor            q3, q3, q0
        vtbl.8          d6, {d6}, d10
        vtbl.8          d7, {d7}, d10

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vadd.i32        q2, q2, q3
        veor            q4, q1, q2
        vshl.u32        q1, q4, #7
        vsri.u32        q1, q4, #25

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vext.8          q1, q1, q1, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vext.8          q2, q2, q2, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vext.8          q3, q3, q3, #4

        subs            r3, r3, #2
        bne             .Ldoubleround

        bx              lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
        // r0: Input state matrix, s
        // r1: 1 data block output, o
        // r2: 1 data block input, i
        // r3: nrounds
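        //
        // The register usage above corresponds to a C prototype along the
        // lines of (assumed from this listing, not taken from a header):
        //   void chacha_block_xor_neon(const u32 *state, u8 *dst,
        //                              const u8 *src, int nrounds);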
        push            {lr}

        // x0..3 = s0..3
        add             ip, r0, #0x20
        vld1.32         {q0-q1}, [r0]
        vld1.32         {q2-q3}, [ip]

        vmov            q8, q0
        vmov            q9, q1
        vmov            q10, q2
        vmov            q11, q3

        bl              chacha_permute

        add             ip, r2, #0x20
        vld1.8          {q4-q5}, [r2]
        vld1.8          {q6-q7}, [ip]

        // o0 = i0 ^ (x0 + s0)
        vadd.i32        q0, q0, q8
        veor            q0, q0, q4

        // o1 = i1 ^ (x1 + s1)
        vadd.i32        q1, q1, q9
        veor            q1, q1, q5

        // o2 = i2 ^ (x2 + s2)
        vadd.i32        q2, q2, q10
        veor            q2, q2, q6

        // o3 = i3 ^ (x3 + s3)
        vadd.i32        q3, q3, q11
        veor            q3, q3, q7

        add             ip, r1, #0x20
        vst1.8          {q0-q1}, [r1]
        vst1.8          {q2-q3}, [ip]

        pop             {pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
        // r0: Input state matrix, s
        // r1: output (8 32-bit words)
        // r2: nrounds
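        //
        // HChaCha returns rows 0 and 3 of the permuted state (words 0-3 and
        // 12-15) and, unlike the full block function, skips the final
        // addition of the original state; this matches the q0/q3 stores
        // below.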
        push            {lr}

        vld1.32         {q0-q1}, [r0]!
        vld1.32         {q2-q3}, [r0]

        mov             r3, r2
        bl              chacha_permute

        vst1.32         {q0}, [r1]!
        vst1.32         {q3}, [r1]

        pop             {pc}
ENDPROC(hchacha_block_neon)

        .align          4
.Lctrinc:       .word   0, 1, 2, 3
.Lrol8_table:   .byte   3, 0, 1, 2, 7, 4, 5, 6
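// Used as a vtbl.8 index vector, .Lrol8_table rotates each 32-bit word left
// by 8 bits: destination byte i is taken from source byte table[i], and
// (3, 0, 1, 2) maps a little-endian word's bytes to their positions after a
// left-rotate by 8.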

        .align          5
ENTRY(chacha_4block_xor_neon)
        push            {r4-r5}
        mov             r4, sp                  // preserve the stack pointer
        sub             ip, sp, #0x20           // allocate a 32 byte buffer
        bic             ip, ip, #0x1f           // aligned to 32 bytes
        mov             sp, ip

        // r0: Input state matrix, s
        // r1: 4 data blocks output, o
        // r2: 4 data blocks input, i
        // r3: nrounds

        //
        // This function encrypts four consecutive ChaCha blocks by loading
        // the state matrix in NEON registers four times. The algorithm performs
        // each operation on the corresponding word of each state matrix, hence
        // requires no word shuffling. The words are re-interleaved before the
        // final addition of the original state and the XORing step.
        //
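        //
        // Concretely, after the vdup broadcasts below, register qN holds
        // word N of the state for all four blocks at once, e.g. q0 = x0 of
        // blocks 0-3 and q12 = x12 (the counter word) of blocks 0-3, which
        // is why the per-block counters can be added with a single vadd.
        //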

        // x0..15[0-3] = s0..15[0-3]
        add             ip, r0, #0x20
        vld1.32         {q0-q1}, [r0]
        vld1.32         {q2-q3}, [ip]

        adr             r5, .Lctrinc
        vdup.32         q15, d7[1]
        vdup.32         q14, d7[0]
        vld1.32         {q4}, [r5, :128]
        vdup.32         q13, d6[1]
        vdup.32         q12, d6[0]
        vdup.32         q11, d5[1]
        vdup.32         q10, d5[0]
        vadd.u32        q12, q12, q4            // x12 += counter values 0-3
        vdup.32         q9, d4[1]
        vdup.32         q8, d4[0]
        vdup.32         q7, d3[1]
        vdup.32         q6, d3[0]
        vdup.32         q5, d2[1]
        vdup.32         q4, d2[0]
        vdup.32         q3, d1[1]
        vdup.32         q2, d1[0]
        vdup.32         q1, d0[1]
        vdup.32         q0, d0[0]

        adr             ip, .Lrol8_table
        b               1f

.Ldoubleround4:
        vld1.32         {q8-q9}, [sp, :256]
1:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vadd.i32        q0, q0, q4
        vadd.i32        q1, q1, q5
        vadd.i32        q2, q2, q6
        vadd.i32        q3, q3, q7

        veor            q12, q12, q0
        veor            q13, q13, q1
        veor            q14, q14, q2
        veor            q15, q15, q3

        vrev32.16       q12, q12
        vrev32.16       q13, q13
        vrev32.16       q14, q14
        vrev32.16       q15, q15

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vadd.i32        q8, q8, q12
        vadd.i32        q9, q9, q13
        vadd.i32        q10, q10, q14
        vadd.i32        q11, q11, q15

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q4, q8
        veor            q9, q5, q9
        vshl.u32        q4, q8, #12
        vshl.u32        q5, q9, #12
        vsri.u32        q4, q8, #20
        vsri.u32        q5, q9, #20

        veor            q8, q6, q10
        veor            q9, q7, q11
        vshl.u32        q6, q8, #12
        vshl.u32        q7, q9, #12
        vsri.u32        q6, q8, #20
        vsri.u32        q7, q9, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vld1.8          {d16}, [ip, :64]
        vadd.i32        q0, q0, q4
        vadd.i32        q1, q1, q5
        vadd.i32        q2, q2, q6
        vadd.i32        q3, q3, q7

        veor            q12, q12, q0
        veor            q13, q13, q1
        veor            q14, q14, q2
        veor            q15, q15, q3

        vtbl.8          d24, {d24}, d16
        vtbl.8          d25, {d25}, d16
        vtbl.8          d26, {d26}, d16
        vtbl.8          d27, {d27}, d16
        vtbl.8          d28, {d28}, d16
        vtbl.8          d29, {d29}, d16
        vtbl.8          d30, {d30}, d16
        vtbl.8          d31, {d31}, d16

        vld1.32         {q8-q9}, [sp, :256]

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vadd.i32        q8, q8, q12
        vadd.i32        q9, q9, q13
        vadd.i32        q10, q10, q14
        vadd.i32        q11, q11, q15

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q4, q8
        veor            q9, q5, q9
        vshl.u32        q4, q8, #7
        vshl.u32        q5, q9, #7
        vsri.u32        q4, q8, #25
        vsri.u32        q5, q9, #25

        veor            q8, q6, q10
        veor            q9, q7, q11
        vshl.u32        q6, q8, #7
        vshl.u32        q7, q9, #7
        vsri.u32        q6, q8, #25
        vsri.u32        q7, q9, #25

        vld1.32         {q8-q9}, [sp, :256]

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vadd.i32        q0, q0, q5
        vadd.i32        q1, q1, q6
        vadd.i32        q2, q2, q7
        vadd.i32        q3, q3, q4

        veor            q15, q15, q0
        veor            q12, q12, q1
        veor            q13, q13, q2
        veor            q14, q14, q3

        vrev32.16       q15, q15
        vrev32.16       q12, q12
        vrev32.16       q13, q13
        vrev32.16       q14, q14

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vadd.i32        q10, q10, q15
        vadd.i32        q11, q11, q12
        vadd.i32        q8, q8, q13
        vadd.i32        q9, q9, q14

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q7, q8
        veor            q9, q4, q9
        vshl.u32        q7, q8, #12
        vshl.u32        q4, q9, #12
        vsri.u32        q7, q8, #20
        vsri.u32        q4, q9, #20

        veor            q8, q5, q10
        veor            q9, q6, q11
        vshl.u32        q5, q8, #12
        vshl.u32        q6, q9, #12
        vsri.u32        q5, q8, #20
        vsri.u32        q6, q9, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vld1.8          {d16}, [ip, :64]
        vadd.i32        q0, q0, q5
        vadd.i32        q1, q1, q6
        vadd.i32        q2, q2, q7
        vadd.i32        q3, q3, q4

        veor            q15, q15, q0
        veor            q12, q12, q1
        veor            q13, q13, q2
        veor            q14, q14, q3

        vtbl.8          d30, {d30}, d16
        vtbl.8          d31, {d31}, d16
        vtbl.8          d24, {d24}, d16
        vtbl.8          d25, {d25}, d16
        vtbl.8          d26, {d26}, d16
        vtbl.8          d27, {d27}, d16
        vtbl.8          d28, {d28}, d16
        vtbl.8          d29, {d29}, d16

        vld1.32         {q8-q9}, [sp, :256]

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vadd.i32        q10, q10, q15
        vadd.i32        q11, q11, q12
        vadd.i32        q8, q8, q13
        vadd.i32        q9, q9, q14

        vst1.32         {q8-q9}, [sp, :256]

        veor            q8, q7, q8
        veor            q9, q4, q9
        vshl.u32        q7, q8, #7
        vshl.u32        q4, q9, #7
        vsri.u32        q7, q8, #25
        vsri.u32        q4, q9, #25

        veor            q8, q5, q10
        veor            q9, q6, q11
        vshl.u32        q5, q8, #7
        vshl.u32        q6, q9, #7
        vsri.u32        q5, q8, #25
        vsri.u32        q6, q9, #25

        subs            r3, r3, #2
        bne             .Ldoubleround4

        // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
        // x8..9[0-3] are on the stack.

        // Re-interleave the words in the first two rows of each block (x0..7).
        // Also add the counter values 0-3 to x12[0-3].
        vld1.32         {q8}, [r5, :128]        // load counter values 0-3
        vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
        vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
        vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
        vzip.32         q6, q7                  // => (6 7 6 7) (6 7 6 7)
        vadd.u32        q12, q8                 // x12 += counter values 0-3
        vswp            d1, d4
        vswp            d3, d6
        vld1.32         {q8-q9}, [r0]!          // load s0..7
        vswp            d9, d12
        vswp            d11, d14

        // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
        // after XORing the first 32 bytes.
        vswp            q1, q4

        // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

        // x0..3[0-3] += s0..3[0-3]      (add orig state to 1st row of each block)
        vadd.u32        q0, q0, q8
        vadd.u32        q2, q2, q8
        vadd.u32        q4, q4, q8
        vadd.u32        q3, q3, q8

        // x4..7[0-3] += s4..7[0-3]      (add orig state to 2nd row of each block)
        vadd.u32        q1, q1, q9
        vadd.u32        q6, q6, q9
        vadd.u32        q5, q5, q9
        vadd.u32        q7, q7, q9

        // XOR first 32 bytes using keystream from first two rows of first block
        vld1.8          {q8-q9}, [r2]!
        veor            q8, q8, q0
        veor            q9, q9, q1
        vst1.8          {q8-q9}, [r1]!

        // Re-interleave the words in the last two rows of each block (x8..15).
        vld1.32         {q8-q9}, [sp, :256]
        vzip.32         q12, q13                // => (12 13 12 13) (12 13 12 13)
        vzip.32         q14, q15                // => (14 15 14 15) (14 15 14 15)
        vzip.32         q8, q9                  // => (8 9 8 9) (8 9 8 9)
        vzip.32         q10, q11                // => (10 11 10 11) (10 11 10 11)
        vld1.32         {q0-q1}, [r0]           // load s8..15
        vswp            d25, d28
        vswp            d27, d30
        vswp            d17, d20
        vswp            d19, d22

        // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

        // x8..11[0-3] += s8..11[0-3]    (add orig state to 3rd row of each block)
        vadd.u32        q8, q8, q0
        vadd.u32        q10, q10, q0
        vadd.u32        q9, q9, q0
        vadd.u32        q11, q11, q0

        // x12..15[0-3] += s12..15[0-3]  (add orig state to 4th row of each block)
        vadd.u32        q12, q12, q1
        vadd.u32        q14, q14, q1
        vadd.u32        q13, q13, q1
        vadd.u32        q15, q15, q1

        // XOR the rest of the data with the keystream

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q8
        veor            q1, q1, q12
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q2
        veor            q1, q1, q6
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q10
        veor            q1, q1, q14
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q4
        veor            q1, q1, q5
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q9
        veor            q1, q1, q13
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]!
        veor            q0, q0, q3
        veor            q1, q1, q7
        vst1.8          {q0-q1}, [r1]!

        vld1.8          {q0-q1}, [r2]
        mov             sp, r4                  // restore original stack pointer
        veor            q0, q0, q11
        veor            q1, q1, q15
        vst1.8          {q0-q1}, [r1]

        pop             {r4-r5}
        bx              lr
ENDPROC(chacha_4block_xor_neon)