/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *		Pentium Pro/II routines:
 *		Alexander Kjeldaas <astor@guardian.no>
 *		Finn Arne Gangstad <finnag@guardian.no>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * Changes:	Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *			     handling.
 *		Andi Kleen,  add zeroing on error
 *			     converted to pure assembler
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 */
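
/*
 * For reference only (not part of the build): a plain C sketch of the
 * same partial checksum, without the alignment and unrolling tricks
 * used by the assembler below.  It is equivalent only up to the final
 * 16-bit fold, and it assumes a little-endian CPU that tolerates
 * unaligned 16-bit loads, which holds on x86.  The name is illustrative.
 *
 *	static unsigned int csum_partial_ref(const unsigned char *buff,
 *					     int len, unsigned int sum)
 *	{
 *		unsigned long long acc = sum;
 *
 *		while (len >= 2) {		// sum 16-bit words
 *			acc += *(const unsigned short *)buff;
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len > 0)			// trailing odd byte
 *			acc += *buff;
 *		while (acc >> 32)		// end-around carry fold
 *			acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */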

.text
.align 4
.globl csum_partial

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

	  /*
	   * Experiments with Ethernet and SLIP connections show that buff
	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
	   * alignment for the unrolled loop.
	   */
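
/*
 * That conversion amounts to the following, in rough C terms
 * (illustrative only; carries are handled as in the reference sketch
 * above):
 *
 *	if (((unsigned long)buff & 3) == 2 && len >= 2) {
 *		acc += *(const unsigned short *)buff;	// one halfword...
 *		buff += 2;				// ...now 4-byte aligned
 *		len -= 2;
 *	}
 *	// ... then run the unrolled 32-byte loop on the aligned rest ...
 */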
csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $3, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	testl $1, %esi		# Check alignment.
	jz 10f			# Jump if alignment is on a 2-byte boundary.

	# buf is odd
	dec %ecx
	jl 8f
	movzbl (%esi), %ebx
	adcl %ebx, %eax
	roll $8, %eax
	inc %esi
	testl $2, %esi
	jz 2f
10:
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2.  Deal with it.
	jmp 4f
1:	movw (%esi), %bx
	addl $2, %esi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, %edx
	shrl $5, %ecx
	jz 2f
	testl %esi, %esi
1:	movl (%esi), %ebx
	adcl %ebx, %eax
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx		# This clears CF
3:	adcl (%esi), %eax
	lea 4(%esi), %esi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx
	jz 7f
	cmpl $2, %ecx
	jb 5f
	movw (%esi),%cx
	leal 2(%esi),%esi
	je 6f
	shll $16,%ecx
5:	movb (%esi),%cl
6:	addl %ecx,%eax
	adcl $0, %eax
7:
	testl $1, 12(%esp)
	jz 8f
	roll $8, %eax
8:
	popl %ebx
	popl %esi
	ret

#else

/* Version for PentiumII/PPro */

csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: const unsigned char *buf

	testl $3, %esi
	jnz 25f
10:
	movl %ecx, %edx
	movl %ecx, %ebx
	andl $0x7c, %ebx
	shrl $7, %ecx
	addl %ebx,%esi
	shrl $2, %ebx
	negl %ebx
	lea 45f(%ebx,%ebx,2), %ebx
	testl %esi, %esi
	jmp *%ebx

	# Handle 2-byte-aligned regions
20:	addw (%esi), %ax
	lea 2(%esi), %esi
	adcl $0, %eax
	jmp 10b
25:
	testl $1, %esi
	jz 30f
	# buf is odd
	dec %ecx
	jl 90f
	movzbl (%esi), %ebx
	addl %ebx, %eax
	adcl $0, %eax
	roll $8, %eax
	inc %esi
	testl $2, %esi
	jz 10b

30:	subl $2, %ecx
	ja 20b
	je 32f
	addl $2, %ecx
	jz 80f
	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
	addl %ebx, %eax
	adcl $0, %eax
	jmp 80f
32:
	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
	adcl $0, %eax
	jmp 80f

40:
	addl -128(%esi), %eax
	adcl -124(%esi), %eax
	adcl -120(%esi), %eax
	adcl -116(%esi), %eax
	adcl -112(%esi), %eax
	adcl -108(%esi), %eax
	adcl -104(%esi), %eax
	adcl -100(%esi), %eax
	adcl -96(%esi), %eax
	adcl -92(%esi), %eax
	adcl -88(%esi), %eax
	adcl -84(%esi), %eax
	adcl -80(%esi), %eax
	adcl -76(%esi), %eax
	adcl -72(%esi), %eax
	adcl -68(%esi), %eax
	adcl -64(%esi), %eax
	adcl -60(%esi), %eax
	adcl -56(%esi), %eax
	adcl -52(%esi), %eax
	adcl -48(%esi), %eax
	adcl -44(%esi), %eax
	adcl -40(%esi), %eax
	adcl -36(%esi), %eax
	adcl -32(%esi), %eax
	adcl -28(%esi), %eax
	adcl -24(%esi), %eax
	adcl -20(%esi), %eax
	adcl -16(%esi), %eax
	adcl -12(%esi), %eax
	adcl -8(%esi), %eax
	adcl -4(%esi), %eax
45:
	lea 128(%esi), %esi
	adcl $0, %eax
	dec %ecx
	jge 40b
	movl %edx, %ecx
50:	andl $3, %ecx
	jz 80f

	# Handle the last 1-3 bytes without jumping
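	# Worked out for reference: shrl only uses the low five bits of
	# %cl, and it shifts the 0xffffff constant loaded into %ebx, so:
	#   len&3 == 1  ->  shift 16  ->  mask 0x0000ff  (keep 1 byte)
	#   len&3 == 2  ->  shift  8  ->  mask 0x00ffff  (keep 2 bytes)
	#   len&3 == 3  ->  shift  0  ->  mask 0xffffff  (keep 3 bytes)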
	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
	movl $0xffffff,%ebx	# by the shll and shrl instructions
	shll $3,%ecx
	shrl %cl,%ebx
	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
	addl %ebx,%eax
	adcl $0,%eax
80:
	testl $1, 12(%esp)
	jz 90f
	roll $8, %eax
90:
	popl %ebx
	popl %esi
	ret

#endif

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
 */
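
/*
 * For reference only (illustrative, not part of the build): a typical
 * call looks like the sketch below; the error slots are written by the
 * .fixup code further down when one of the accesses faults.
 *
 *	int src_err = 0, dst_err = 0;
 *	unsigned int csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	// fault while reading src:  -EFAULT stored in *src_err_ptr and
 *	//	the whole destination zeroed (see the fixup code below)
 *	// fault while writing dst:  -EFAULT stored in *dst_err_ptr
 */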

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can call a custom exception handler for each type of access.
 *
 * FIXME: could someone double-check whether I haven't mixed up some SRC and
 *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
 *	  them all but there's no guarantee.
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous
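
/*
 * Each SRC()/DST() expansion above emits a pair of addresses into the
 * __ex_table section: the address of the access itself and the address
 * of its fixup label (6001: for source reads, 6002: for destination
 * writes).  Conceptually an entry looks roughly like this (sketch only):
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	// address of the faulting access
 *		unsigned long fixup;	// where to resume, e.g. 6001f
 *	};
 *
 * On a fault inside one of these accesses the trap handler looks up the
 * faulting address in this table and, if found, resumes at the fixup
 * instead of oopsing; the fixup code then reports -EFAULT as set up below.
 */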

.align 4
.globl csum_partial_copy_generic

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP	12

csum_partial_copy_generic:
	subl  $4,%esp
	pushl %edi
	pushl %esi
	pushl %ebx
	movl ARGBASE+16(%esp),%eax	# sum
	movl ARGBASE+12(%esp),%ecx	# len
	movl ARGBASE+4(%esp),%esi	# src
	movl ARGBASE+8(%esp),%edi	# dst

	testl $2, %edi			# Check alignment.
	jz 2f				# Jump if alignment is ok.
	subl $2, %ecx			# Alignment uses up two bytes.
	jae 1f				# Jump if we had at least two bytes.
	addl $2, %ecx			# ecx was < 2.  Deal with it.
	jmp 4f
SRC(1:	movw (%esi), %bx	)
	addl $2, %esi
DST(	movw %bx, (%edi)	)
	addl $2, %edi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, FP(%esp)
	shrl $5, %ecx
	jz 2f
	testl %esi, %esi
SRC(1:	movl (%esi), %ebx	)
SRC(	movl 4(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 4(%edi)	)

SRC(	movl 8(%esi), %ebx	)
SRC(	movl 12(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 8(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 12(%edi)	)

SRC(	movl 16(%esi), %ebx	)
SRC(	movl 20(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 16(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 20(%edi)	)

SRC(	movl 24(%esi), %ebx	)
SRC(	movl 28(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 24(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 28(%edi)	)

	lea 32(%esi), %esi
	lea 32(%edi), %edi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl FP(%esp), %edx
	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx			# This clears CF
SRC(3:	movl (%esi), %ebx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	lea 4(%esi), %esi
	lea 4(%edi), %edi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx
	jz 7f
	cmpl $2, %ecx
	jb 5f
SRC(	movw (%esi), %cx	)
	leal 2(%esi), %esi
DST(	movw %cx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%ecx
SRC(5:	movb (%esi), %cl	)
DST(	movb %cl, (%edi)	)
6:	addl %ecx, %eax
	adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:
	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)

	# zero the complete destination - computing the rest
	# is too much work
	movl ARGBASE+8(%esp), %edi	# dst
	movl ARGBASE+12(%esp), %ecx	# len
	xorl %eax,%eax
	rep ; stosb

	jmp 5000b

6002:
	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT,(%ebx)
	jmp 5000b

.previous

	popl %ebx
	popl %esi
	popl %edi
	popl %ecx			# equivalent to addl $4,%esp
	ret

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
	SRC(movl x(%esi), %ebx	)	;	\
	addl %ebx, %eax			;	\
	DST(movl %ebx, x(%edi)	)	;

#define ROUND(x) \
	SRC(movl x(%esi), %ebx	)	;	\
	adcl %ebx, %eax			;	\
	DST(movl %ebx, x(%edi)	)	;

#define ARGBASE 12

csum_partial_copy_generic:
	pushl %ebx
	pushl %edi
	pushl %esi
	movl ARGBASE+4(%esp),%esi	#src
	movl ARGBASE+8(%esp),%edi	#dst
	movl ARGBASE+12(%esp),%ecx	#len
	movl ARGBASE+16(%esp),%eax	#sum
#	movl %ecx, %edx
	movl %ecx, %ebx
	movl %esi, %edx
	shrl $6, %ecx
	andl $0x3c, %ebx
	negl %ebx
	subl %ebx, %esi
	subl %ebx, %edi
	lea  -1(%esi),%edx
	andl $-32,%edx
	lea 3f(%ebx,%ebx), %ebx
	testl %esi, %esi
	jmp *%ebx
1:	addl $64,%esi
	addl $64,%edi
	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:	adcl $0,%eax
	addl $64, %edx
	dec %ecx
	jge 1b
4:	movl ARGBASE+12(%esp),%edx	#len
	andl $3, %edx
	jz 7f
	cmpl $2, %edx
	jb 5f
SRC(	movw (%esi), %dx	)
	leal 2(%esi), %esi
DST(	movw %dx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%edx
5:
SRC(	movb (%esi), %dl	)
DST(	movb %dl, (%edi)	)
6:	addl %edx, %eax
	adcl $0, %eax
7:
.section .fixup, "ax"
6001:	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)
	# zero the complete destination (computing the rest is too much work)
	movl ARGBASE+8(%esp),%edi	# dst
	movl ARGBASE+12(%esp),%ecx	# len
	xorl %eax,%eax
	rep; stosb
	jmp 7b
6002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT, (%ebx)
	jmp  7b
.previous

	popl %esi
	popl %edi
	popl %ebx
	ret

#undef ROUND
#undef ROUND1

#endif