Blame - arch/x86/crypto/twofish-avx-x86_64-asm_64.S - kernel/msm-4.19

blob: 35f45574390d1d0da07305ac031e88119d2eb9c7 [file] [log] [blame]

Johannes Goetzfried	107778b	2012-05-28 15:54:24 +0200	[diff] [blame]	1	/*
				2	* Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
				3	*
				4	* Copyright (C) 2012 Johannes Goetzfried
				5	* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
				6	*
				7	* This program is free software; you can redistribute it and/or modify
				8	* it under the terms of the GNU General Public License as published by
				9	* the Free Software Foundation; either version 2 of the License, or
				10	* (at your option) any later version.
				11	*
				12	* This program is distributed in the hope that it will be useful,
				13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				15	* GNU General Public License for more details.
				16	*
				17	* You should have received a copy of the GNU General Public License
				18	* along with this program; if not, write to the Free Software
				19	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
				20	* USA
				21	*
				22	*/
				23
				24	.file "twofish-avx-x86_64-asm_64.S"
				25	.text
				26
				27	/* structure of crypto context */
				28	#define s0 0
				29	#define s1 1024
				30	#define s2 2048
				31	#define s3 3072
				32	#define w 4096
				33	#define k 4128
				34
				35	/**********************************************************************
				36	8-way AVX twofish
				37	**********************************************************************/
				38	#define CTX %rdi
				39
				40	#define RA1 %xmm0
				41	#define RB1 %xmm1
				42	#define RC1 %xmm2
				43	#define RD1 %xmm3
				44
				45	#define RA2 %xmm4
				46	#define RB2 %xmm5
				47	#define RC2 %xmm6
				48	#define RD2 %xmm7
				49
				50	#define RX %xmm8
				51	#define RY %xmm9
				52
				53	#define RK1 %xmm10
				54	#define RK2 %xmm11
				55
				56	#define RID1 %rax
				57	#define RID1b %al
				58	#define RID2 %rbx
				59	#define RID2b %bl
				60
				61	#define RGI1 %rdx
				62	#define RGI1bl %dl
				63	#define RGI1bh %dh
				64	#define RGI2 %rcx
				65	#define RGI2bl %cl
				66	#define RGI2bh %ch
				67
				68	#define RGS1 %r8
				69	#define RGS1d %r8d
				70	#define RGS2 %r9
				71	#define RGS2d %r9d
				72	#define RGS3 %r10
				73	#define RGS3d %r10d
				74
				75
				76	#define lookup_32bit(t0, t1, t2, t3, src, dst) \
				77	movb src ## bl, RID1b; \
				78	movb src ## bh, RID2b; \
				79	movl t0(CTX, RID1, 4), dst ## d; \
				80	xorl t1(CTX, RID2, 4), dst ## d; \
				81	shrq $16, src; \
				82	movb src ## bl, RID1b; \
				83	movb src ## bh, RID2b; \
				84	xorl t2(CTX, RID1, 4), dst ## d; \
				85	xorl t3(CTX, RID2, 4), dst ## d;
				86
				87	#define G(a, x, t0, t1, t2, t3) \
				88	vmovq a, RGI1; \
				89	vpsrldq $8, a, x; \
				90	vmovq x, RGI2; \
				91	\
				92	lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
				93	shrq $16, RGI1; \
				94	lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
				95	shlq $32, RGS2; \
				96	orq RGS1, RGS2; \
				97	\
				98	lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
				99	shrq $16, RGI2; \
				100	lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
				101	shlq $32, RGS3; \
				102	orq RGS1, RGS3; \
				103	\
				104	vmovq RGS2, x; \
				105	vpinsrq $1, RGS3, x, x;
				106
				107	#define encround(a, b, c, d, x, y) \
				108	G(a, x, s0, s1, s2, s3); \
				109	G(b, y, s1, s2, s3, s0); \
				110	vpaddd x, y, x; \
				111	vpaddd y, x, y; \
				112	vpaddd x, RK1, x; \
				113	vpaddd y, RK2, y; \
				114	vpxor x, c, c; \
				115	vpsrld $1, c, x; \
				116	vpslld $(32 - 1), c, c; \
				117	vpor c, x, c; \
				118	vpslld $1, d, x; \
				119	vpsrld $(32 - 1), d, d; \
				120	vpor d, x, d; \
				121	vpxor d, y, d;
				122
				123	#define decround(a, b, c, d, x, y) \
				124	G(a, x, s0, s1, s2, s3); \
				125	G(b, y, s1, s2, s3, s0); \
				126	vpaddd x, y, x; \
				127	vpaddd y, x, y; \
				128	vpaddd y, RK2, y; \
				129	vpxor d, y, d; \
				130	vpsrld $1, d, y; \
				131	vpslld $(32 - 1), d, d; \
				132	vpor d, y, d; \
				133	vpslld $1, c, y; \
				134	vpsrld $(32 - 1), c, c; \
				135	vpor c, y, c; \
				136	vpaddd x, RK1, x; \
				137	vpxor x, c, c;
				138
				139	#define encrypt_round(n, a, b, c, d) \
				140	vbroadcastss (k+4(2(n)))(CTX), RK1; \
				141	vbroadcastss (k+4(2(n)+1))(CTX), RK2; \
				142	encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
				143	encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
				144
				145	#define decrypt_round(n, a, b, c, d) \
				146	vbroadcastss (k+4(2(n)))(CTX), RK1; \
				147	vbroadcastss (k+4(2(n)+1))(CTX), RK2; \
				148	decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
				149	decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
				150
				151	#define encrypt_cycle(n) \
				152	encrypt_round((2*n), RA, RB, RC, RD); \
				153	encrypt_round(((2*n) + 1), RC, RD, RA, RB);
				154
				155	#define decrypt_cycle(n) \
				156	decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
				157	decrypt_round((2*n), RA, RB, RC, RD);
				158
				159
				160	#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
				161	vpunpckldq x1, x0, t0; \
				162	vpunpckhdq x1, x0, t2; \
				163	vpunpckldq x3, x2, t1; \
				164	vpunpckhdq x3, x2, x3; \
				165	\
				166	vpunpcklqdq t1, t0, x0; \
				167	vpunpckhqdq t1, t0, x1; \
				168	vpunpcklqdq x3, t2, x2; \
				169	vpunpckhqdq x3, t2, x3;
				170
				171	#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
				172	vpxor (044)(in), wkey, x0; \
				173	vpxor (144)(in), wkey, x1; \
				174	vpxor (244)(in), wkey, x2; \
				175	vpxor (344)(in), wkey, x3; \
				176	\
				177	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
				178
				179	#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
				180	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
				181	\
				182	vpxor x0, wkey, x0; \
				183	vmovdqu x0, (044)(out); \
				184	vpxor x1, wkey, x1; \
				185	vmovdqu x1, (144)(out); \
				186	vpxor x2, wkey, x2; \
				187	vmovdqu x2, (244)(out); \
				188	vpxor x3, wkey, x3; \
				189	vmovdqu x3, (344)(out);
				190
				191	#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
				192	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
				193	\
				194	vpxor x0, wkey, x0; \
				195	vpxor (044)(out), x0, x0; \
				196	vmovdqu x0, (044)(out); \
				197	vpxor x1, wkey, x1; \
				198	vpxor (144)(out), x1, x1; \
				199	vmovdqu x1, (144)(out); \
				200	vpxor x2, wkey, x2; \
				201	vpxor (244)(out), x2, x2; \
				202	vmovdqu x2, (244)(out); \
				203	vpxor x3, wkey, x3; \
				204	vpxor (344)(out), x3, x3; \
				205	vmovdqu x3, (344)(out);
				206
				207	.align 8
				208	.global __twofish_enc_blk_8way
				209	.type __twofish_enc_blk_8way,@function;
				210
				211	__twofish_enc_blk_8way:
				212	/* input:
				213	* %rdi: ctx, CTX
				214	* %rsi: dst
				215	* %rdx: src
				216	* %rcx: bool, if true: xor output
				217	*/
				218
				219	pushq %rbx;
				220	pushq %rcx;
				221
				222	vmovdqu w(CTX), RK1;
				223
				224	leaq (444)(%rdx), %rax;
				225	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
				226	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
				227
				228	xorq RID1, RID1;
				229	xorq RID2, RID2;
				230
				231	encrypt_cycle(0);
				232	encrypt_cycle(1);
				233	encrypt_cycle(2);
				234	encrypt_cycle(3);
				235	encrypt_cycle(4);
				236	encrypt_cycle(5);
				237	encrypt_cycle(6);
				238	encrypt_cycle(7);
				239
				240	vmovdqu (w+4*4)(CTX), RK1;
				241
				242	popq %rcx;
				243	popq %rbx;
				244
				245	leaq (444)(%rsi), %rax;
Johannes Goetzfried	107778b	2012-05-28 15:54:24 +0200	[diff] [blame]	246
				247	testb %cl, %cl;
				248	jnz __enc_xor8;
				249
				250	outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
				251	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
				252
				253	ret;
				254
				255	__enc_xor8:
				256	outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
				257	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
				258
				259	ret;
				260
				261	.align 8
				262	.global twofish_dec_blk_8way
				263	.type twofish_dec_blk_8way,@function;
				264
				265	twofish_dec_blk_8way:
				266	/* input:
				267	* %rdi: ctx, CTX
				268	* %rsi: dst
				269	* %rdx: src
				270	*/
				271
				272	pushq %rbx;
				273
				274	vmovdqu (w+4*4)(CTX), RK1;
				275
				276	leaq (444)(%rdx), %rax;
				277	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
				278	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
				279
				280	xorq RID1, RID1;
				281	xorq RID2, RID2;
				282
				283	decrypt_cycle(7);
				284	decrypt_cycle(6);
				285	decrypt_cycle(5);
				286	decrypt_cycle(4);
				287	decrypt_cycle(3);
				288	decrypt_cycle(2);
				289	decrypt_cycle(1);
				290	decrypt_cycle(0);
				291
				292	vmovdqu (w)(CTX), RK1;
				293
				294	popq %rbx;
				295
				296	leaq (444)(%rsi), %rax;
				297	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
				298	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
				299
				300	ret;