Blame - arch/x86/crypto/twofish-x86_64-asm_64.S - kernel/msm-4.9

blob: 35974a58661589c4984073937a49ec2e8d0ce682 [file] [log] [blame]

Joachim Fritschi	eaf4408	2006-06-20 21:12:02 +1000	[diff] [blame]	1	/***************************************************************************
				2	* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
				3	* *
				4	* This program is free software; you can redistribute it and/or modify *
				5	* it under the terms of the GNU General Public License as published by *
				6	* the Free Software Foundation; either version 2 of the License, or *
				7	* (at your option) any later version. *
				8	* *
				9	* This program is distributed in the hope that it will be useful, *
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
				12	* GNU General Public License for more details. *
				13	* *
				14	* You should have received a copy of the GNU General Public License *
				15	* along with this program; if not, write to the *
				16	* Free Software Foundation, Inc., *
				17	* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
				18	***************************************************************************/
				19
				20	.file "twofish-x86_64-asm.S"
				21	.text
				22
				23	#include <asm/asm-offsets.h>
				24
				25	#define a_offset 0 /* byte offset of word a; the 64-bit whitening xor at this offset covers a and b */
				26	#define b_offset 4 /* byte offset of word b (not referenced in the code visible here) */
				27	#define c_offset 8 /* byte offset of word c; the 64-bit whitening xor at this offset covers c and d */
				28	#define d_offset 12 /* byte offset of word d (not referenced in the code visible here) */
				29
				30	/*
					* Layout of the crypto context struct, as byte offsets from the context
					* base (which the functions below keep in %r11).
					* The round macros index the S arrays with a scale of 4, i.e. as 32-bit
					* words; S0 sits at offset 0, so the macros address it without a
					* displacement instead of spelling out s0.
					*/
				31
				32	#define s0 0 /* S0 array, 256 words (1024 bytes) each */
				33	#define s1 1024 /* S1 array */
				34	#define s2 2048 /* S2 array */
				35	#define s3 3072 /* S3 array */
				36	#define w 4096 /* 8 whitening key words: input whitening reads w+0.., output whitening w+16.. */
				37	#define k 4128 /* 32 round subkey words; each round reads k+round and k+4+round */
				38
				39	/*
					* Register aliases that allow macro substitution.
					*
					* Each logical register Rn has four names: Rn (64 bit), RnD (low 32 bit),
					* RnB (lowest byte) and RnH (second-lowest byte). The round macros take
					* the plain Rn name and build the other forms by token pasting
					* (a ## D, a ## B, a ## H). Only %rax, %rbx, %rcx and %rdx are aliased;
					* they are the registers that have a high-byte form (%ah..%dh).
					*/
				40
				41	#define R0 %rax
				42	#define R0D %eax
				43	#define R0B %al
				44	#define R0H %ah
				45
				46	#define R1 %rbx
				47	#define R1D %ebx
				48	#define R1B %bl
				49	#define R1H %bh
				50
				51	#define R2 %rcx
				52	#define R2D %ecx
				53	#define R2B %cl
				54	#define R2H %ch
				55
				56	#define R3 %rdx
				57	#define R3D %edx
				58	#define R3B %dl
				59	#define R3H %dh
				60
				61
				62	/*
					* performs input whitening
					*
					* XORs the whitening key material found at w+offset in the context
					* into src. The callers below pass a 64-bit register as src, so one
					* use whitens two adjacent 32-bit words. Decryption applies this macro
					* to its result, undoing the whitening done at the start of encryption.
					*/
				63	#define input_whitening(src,context,offset)\
				64	xor w+offset(context), src;
				65
				66	/*
					* performs output whitening
					*
					* Same as input_whitening, but reads the key material 16 bytes further
					* into the whitening key area (w+16+offset). Decryption applies this
					* macro to its input, undoing the whitening done at the end of
					* encryption.
					*/
				67	#define output_whitening(src,context,offset)\
				68	xor w+16+offset(context), src;
				69
				70
				71	/*
				72	* a input register containing a (rotated 16)
				73	* b input register containing b
				74	* c input register containing c
				75	* d input register containing d (already rol $1)
				76	* operations on a and b are interleaved to increase performance
					*
					* round is the byte offset of this round's subkey pair: the two 32-bit
					* subkeys are read from k+round and k+4+round.
					* %r11 must hold the context base; %edi, %r8d and %r9d are scratch.
					*
					* %r9d collects the four table lookups indexed by the bytes of a,
					* %r8d the four lookups indexed by the bytes of b. Then %r9d += %r8d
					* and %r8d += %r9d, the subkeys are added, %r9d is XORed into c and
					* %r8d is XORed into d.
					*
					* On exit a has been rotated back by ror 16, b has been rotated by
					* ror 16 + ror 15 (= rol 1) and c by rol 15, so a following call with
					* the arguments swapped to (c,d,a,b) finds its inputs in the form
					* listed above.
				77	*/
				78	#define encrypt_round(a,b,c,d,round)\
				79	movzx b ## B, %edi;\
				80	mov s1(%r11,%rdi,4),%r8d;\
				81	movzx a ## B, %edi;\
				82	mov s2(%r11,%rdi,4),%r9d;\
				83	movzx b ## H, %edi;\
				84	ror $16, b ## D;\
				85	xor s2(%r11,%rdi,4),%r8d;\
				86	movzx a ## H, %edi;\
				87	ror $16, a ## D;\
				88	xor s3(%r11,%rdi,4),%r9d;\
				89	movzx b ## B, %edi;\
				90	xor s3(%r11,%rdi,4),%r8d;\
				91	movzx a ## B, %edi;\
				92	xor (%r11,%rdi,4), %r9d;\
				93	movzx b ## H, %edi;\
				94	ror $15, b ## D;\
				95	xor (%r11,%rdi,4), %r8d;\
				96	movzx a ## H, %edi;\
				97	xor s1(%r11,%rdi,4),%r9d;\
				98	add %r8d, %r9d;\
				99	add %r9d, %r8d;\
				100	add k+round(%r11), %r9d;\
				101	xor %r9d, c ## D;\
				102	rol $15, c ## D;\
				103	add k+4+round(%r11),%r8d;\
				104	xor %r8d, d ## D;
				105
				106	/*
				107	* a input register containing a (rotated 16)
				108	* b input register containing b
				109	* c input register containing c
				110	* d input register containing d (already rol $1)
				111	* operations on a and b are interleaved to increase performance
				112	* during the round a and b are prepared for the output whitening
					*
					* Same table lookups, additions and subkey use as encrypt_round, with
					* these differences:
					* - b is copied into the upper 32 bits of %r10 before it is rotated,
					*   and a is XORed into the lower half of %r10 after its ror 16, so
					*   %r10 ends up holding b:a ready for the 64-bit output whitening;
					* - b is not rotated by the extra ror 15;
					* - c is rotated by ror 1 instead of rol 15.
					* %r11 must hold the context base; %edi, %r8d, %r9d and %r10 are
					* overwritten.
				113	*/
				114	#define encrypt_last_round(a,b,c,d,round)\
				115	mov b ## D, %r10d;\
				116	shl $32, %r10;\
				117	movzx b ## B, %edi;\
				118	mov s1(%r11,%rdi,4),%r8d;\
				119	movzx a ## B, %edi;\
				120	mov s2(%r11,%rdi,4),%r9d;\
				121	movzx b ## H, %edi;\
				122	ror $16, b ## D;\
				123	xor s2(%r11,%rdi,4),%r8d;\
				124	movzx a ## H, %edi;\
				125	ror $16, a ## D;\
				126	xor s3(%r11,%rdi,4),%r9d;\
				127	movzx b ## B, %edi;\
				128	xor s3(%r11,%rdi,4),%r8d;\
				129	movzx a ## B, %edi;\
				130	xor (%r11,%rdi,4), %r9d;\
				131	xor a, %r10;\
				132	movzx b ## H, %edi;\
				133	xor (%r11,%rdi,4), %r8d;\
				134	movzx a ## H, %edi;\
				135	xor s1(%r11,%rdi,4),%r9d;\
				136	add %r8d, %r9d;\
				137	add %r9d, %r8d;\
				138	add k+round(%r11), %r9d;\
				139	xor %r9d, c ## D;\
				140	ror $1, c ## D;\
				141	add k+4+round(%r11),%r8d;\
				142	xor %r8d, d ## D
				143
				144	/*
				145	* a input register containing a
				146	* b input register containing b (rotated 16)
				147	* c input register containing c (already rol $1)
				148	* d input register containing d
				149	* operations on a and b are interleaved to increase performance
					*
					* round is the byte offset of this round's subkey pair: the two 32-bit
					* subkeys are read from k+round and k+4+round.
					* %r11 must hold the context base; %edi, %r8d and %r9d are scratch.
					*
					* %r9d collects the four table lookups indexed by the bytes of a,
					* %r8d the four lookups indexed by the bytes of b. Then %r9d += %r8d
					* and %r8d += %r9d, the subkeys are added, %r9d is XORed into c and
					* %r8d is XORed into d.
					*
					* On exit a has been rotated by ror 16 + ror 15 (= rol 1), b has been
					* rotated back by ror 16 and d by rol 15, so a following call with the
					* arguments swapped to (c,d,a,b) finds its inputs in the form listed
					* above.
				150	*/
				151	#define decrypt_round(a,b,c,d,round)\
				152	movzx a ## B, %edi;\
				153	mov (%r11,%rdi,4), %r9d;\
				154	movzx b ## B, %edi;\
				155	mov s3(%r11,%rdi,4),%r8d;\
				156	movzx a ## H, %edi;\
				157	ror $16, a ## D;\
				158	xor s1(%r11,%rdi,4),%r9d;\
				159	movzx b ## H, %edi;\
				160	ror $16, b ## D;\
				161	xor (%r11,%rdi,4), %r8d;\
				162	movzx a ## B, %edi;\
				163	xor s2(%r11,%rdi,4),%r9d;\
				164	movzx b ## B, %edi;\
				165	xor s1(%r11,%rdi,4),%r8d;\
				166	movzx a ## H, %edi;\
				167	ror $15, a ## D;\
				168	xor s3(%r11,%rdi,4),%r9d;\
				169	movzx b ## H, %edi;\
				170	xor s2(%r11,%rdi,4),%r8d;\
				171	add %r8d, %r9d;\
				172	add %r9d, %r8d;\
				173	add k+round(%r11), %r9d;\
				174	xor %r9d, c ## D;\
				175	add k+4+round(%r11),%r8d;\
				176	xor %r8d, d ## D;\
				177	rol $15, d ## D;
				178
				179	/*
				180	* a input register containing a
				181	* b input register containing b (rotated 16)
				182	* c input register containing c (already rol $1)
				183	* d input register containing d
				184	* operations on a and b are interleaved to increase performance
				185	* during the round a and b are prepared for the output whitening
					*
					* Same table lookups, additions and subkey use as decrypt_round, with
					* these differences:
					* - after b has been rotated back by ror 16 it is copied into the
					*   upper 32 bits of %r10, and a is XORed into the lower half of %r10
					*   before a is rotated, so %r10 ends up holding b:a ready for the
					*   64-bit whitening that follows the last round;
					* - a is only rotated by ror 16 (no extra ror 15);
					* - d is rotated by ror 1 instead of rol 15.
					* %r11 must hold the context base; %edi, %r8d, %r9d and %r10 are
					* overwritten.
				186	*/
				187	#define decrypt_last_round(a,b,c,d,round)\
				188	movzx a ## B, %edi;\
				189	mov (%r11,%rdi,4), %r9d;\
				190	movzx b ## B, %edi;\
				191	mov s3(%r11,%rdi,4),%r8d;\
				192	movzx b ## H, %edi;\
				193	ror $16, b ## D;\
				194	xor (%r11,%rdi,4), %r8d;\
				195	movzx a ## H, %edi;\
				196	mov b ## D, %r10d;\
				197	shl $32, %r10;\
				198	xor a, %r10;\
				199	ror $16, a ## D;\
				200	xor s1(%r11,%rdi,4),%r9d;\
				201	movzx b ## B, %edi;\
				202	xor s1(%r11,%rdi,4),%r8d;\
				203	movzx a ## B, %edi;\
				204	xor s2(%r11,%rdi,4),%r9d;\
				205	movzx b ## H, %edi;\
				206	xor s2(%r11,%rdi,4),%r8d;\
				207	movzx a ## H, %edi;\
				208	xor s3(%r11,%rdi,4),%r9d;\
				209	add %r8d, %r9d;\
				210	add %r9d, %r8d;\
				211	add k+round(%r11), %r9d;\
				212	xor %r9d, c ## D;\
				213	add k+4+round(%r11),%r8d;\
				214	xor %r8d, d ## D;\
				215	ror $1, d ## D;
				216
				217	.align 8
				218	.global twofish_enc_blk
				219	.global twofish_dec_blk
				220
				221	twofish_enc_blk:
					/*
					* Encrypts one 16-byte block: reads 16 bytes from the input address,
					* runs 16 rounds and writes 16 bytes to the output address.
					* Returns 1 in %rax. %rbx is saved and restored; %rcx, %rdx, %rdi and
					* %r8-%r11 are overwritten.
					*/
				222	pushq R1 /* R1 is %rbx; save it, it is restored by the popq before ret */
				223
				224	/* %rdi contains the crypto tfm address */
				225	/* %rsi contains the output address */
				226	/* %rdx contains the input address */
				227	add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
				228	/* ctx address is moved to free one non-rex register
				229	as target for the 8bit high operations */
				230	mov %rdi, %r11
				231
				232	movq (R3), R1 /* R1 = first 8 input bytes (words a and b) */
				233	movq 8(R3), R3 /* R3 = last 8 input bytes (words c and d); overwrites the input pointer */
				234	input_whitening(R1,%r11,a_offset)
				235	input_whitening(R3,%r11,c_offset)
				236	mov R1D, R0D /* R0 = a */
				237	rol $16, R0D /* encrypt_round expects a rotated by 16 */
				238	shr $32, R1 /* R1 = b */
				239	mov R3D, R2D /* R2 = c */
				240	shr $32, R3 /* R3 = d */
				241	rol $1, R3D /* encrypt_round expects d already rotated left by 1 */
				242
					/*
					* 16 rounds. The register pairs swap roles on every call and the last
					* argument is the byte offset of the round's subkey pair (8 bytes per
					* round).
					*/
				243	encrypt_round(R0,R1,R2,R3,0);
				244	encrypt_round(R2,R3,R0,R1,8);
				245	encrypt_round(R0,R1,R2,R3,2*8);
				246	encrypt_round(R2,R3,R0,R1,3*8);
				247	encrypt_round(R0,R1,R2,R3,4*8);
				248	encrypt_round(R2,R3,R0,R1,5*8);
				249	encrypt_round(R0,R1,R2,R3,6*8);
				250	encrypt_round(R2,R3,R0,R1,7*8);
				251	encrypt_round(R0,R1,R2,R3,8*8);
				252	encrypt_round(R2,R3,R0,R1,9*8);
				253	encrypt_round(R0,R1,R2,R3,10*8);
				254	encrypt_round(R2,R3,R0,R1,11*8);
				255	encrypt_round(R0,R1,R2,R3,12*8);
				256	encrypt_round(R2,R3,R0,R1,13*8);
				257	encrypt_round(R0,R1,R2,R3,14*8);
				258	encrypt_last_round(R2,R3,R0,R1,15*8);
				259
				260
					/* %r10 was filled by encrypt_last_round with R3 in the upper and R2 in the lower half */
				261	output_whitening(%r10,%r11,a_offset)
				262	movq %r10, (%rsi) /* store output bytes 0-7 */
				263
				264	shl $32, R1 /* pack R1 (upper half) and R0 (lower half) into one 64-bit value */
				265	xor R0, R1
				266
				267	output_whitening(R1,%r11,c_offset)
				268	movq R1, 8(%rsi) /* store output bytes 8-15 */
				269
				270	popq R1 /* restore %rbx */
				271	movq $1,%rax /* return value 1 */
				272	ret
				273
				274	twofish_dec_blk:
					/*
					* Decrypts one 16-byte block: reads 16 bytes from the input address,
					* runs the 16 rounds with the subkeys in reverse order and writes
					* 16 bytes to the output address.
					* Returns 1 in %rax. %rbx is saved and restored; %rcx, %rdx, %rdi and
					* %r8-%r11 are overwritten.
					*/
				275	pushq R1 /* R1 is %rbx; save it, it is restored by the popq before ret */
				276
				277	/* %rdi contains the crypto tfm address */
				278	/* %rsi contains the output address */
				279	/* %rdx contains the input address */
				280	add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
				281	/* ctx address is moved to free one non-rex register
				282	as target for the 8bit high operations */
				283	mov %rdi, %r11
				284
				285	movq (R3), R1 /* R1 = first 8 input bytes (words a and b) */
				286	movq 8(R3), R3 /* R3 = last 8 input bytes (words c and d); overwrites the input pointer */
				287	output_whitening(R1,%r11,a_offset) /* undo the whitening applied at the end of encryption */
				288	output_whitening(R3,%r11,c_offset)
				289	mov R1D, R0D /* R0 = a */
				290	shr $32, R1 /* R1 = b */
				291	rol $16, R1D /* decrypt_round expects b rotated by 16 */
				292	mov R3D, R2D /* R2 = c */
				293	shr $32, R3 /* R3 = d */
				294	rol $1, R2D /* decrypt_round expects c already rotated left by 1 */
				295
					/*
					* 16 rounds, subkey offsets counting down from 15*8 to 0. The register
					* pairs swap roles on every call.
					*/
				296	decrypt_round(R0,R1,R2,R3,15*8);
				297	decrypt_round(R2,R3,R0,R1,14*8);
				298	decrypt_round(R0,R1,R2,R3,13*8);
				299	decrypt_round(R2,R3,R0,R1,12*8);
				300	decrypt_round(R0,R1,R2,R3,11*8);
				301	decrypt_round(R2,R3,R0,R1,10*8);
				302	decrypt_round(R0,R1,R2,R3,9*8);
				303	decrypt_round(R2,R3,R0,R1,8*8);
				304	decrypt_round(R0,R1,R2,R3,7*8);
				305	decrypt_round(R2,R3,R0,R1,6*8);
				306	decrypt_round(R0,R1,R2,R3,5*8);
				307	decrypt_round(R2,R3,R0,R1,4*8);
				308	decrypt_round(R0,R1,R2,R3,3*8);
				309	decrypt_round(R2,R3,R0,R1,2*8);
				310	decrypt_round(R0,R1,R2,R3,1*8);
				311	decrypt_last_round(R2,R3,R0,R1,0);
				312
					/* %r10 was filled by decrypt_last_round with R3 in the upper and R2 in the lower half */
				313	input_whitening(%r10,%r11,a_offset)
				314	movq %r10, (%rsi) /* store output bytes 0-7 */
				315
				316	shl $32, R1 /* pack R1 (upper half) and R0 (lower half) into one 64-bit value */
				317	xor R0, R1
				318
				319	input_whitening(R1,%r11,c_offset)
				320	movq R1, 8(%rsi) /* store output bytes 8-15 */
				321
				322	popq R1 /* restore %rbx */
				323	movq $1,%rax /* return value 1 */
				324	ret