/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
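/*
 * For reference, the C-level view of this routine is roughly (a sketch;
 * the authoritative declaration lives in asm/checksum.h):
 *
 *	__sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 *
 * i.e. the value returned in r3 is the folded, complemented 16-bit
 * checksum of the ihl 32-bit words at iph.
 */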
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 *	csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
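/*
 * In C terms this computes roughly (a sketch; fold_to_16_bits() is only a
 * name for the 1's complement fold done by the rldicl/rlwinm sequence
 * below, not a real helper):
 *
 *	u64 s = (u64)saddr + daddr + sum + (((u32)proto << 16) | (len & 0xffff));
 *	return ~fold_to_16_bits(s);
 */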
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
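/*
 * A C sketch of the idea, assuming an 8-byte aligned buffer whose length
 * is a multiple of 8 (the code below also handles the ragged ends):
 *
 *	u64 s = sum, v;
 *	for (i = 0; i < len; i += 8) {
 *		v = *(u64 *)(buff + i);
 *		s += v;
 *		if (s < v)
 *			s++;			(end-around carry, like adde)
 *	}
 *	return (s + rol64(s, 32)) >> 32;	(fold 64 -> 32 bits, as below)
 *
 * The returned value is a 32-bit partial checksum; it is not complemented.
 */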
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


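/*
 * Exception table helpers: each macro below emits a local label in front
 * of a single load from src ("source"/"srcnr") or store to dst
 * ("dest"/"dstnr"), plus an __ex_table entry, so that an access fault on
 * that instruction branches to the matching fixup label.  The "nr"
 * (no-restore) variants are used where r14-r16 have not been saved and no
 * stack frame is active, so their fixups skip the register and stack
 * restore done by .Lsrc_error/.Ldest_error.
 */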
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
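/*
 * For reference, the C declaration is usually along these lines (a sketch;
 * see asm/checksum.h for the authoritative prototype):
 *
 *	__wsum csum_partial_copy_generic(const void *src, void *dst,
 *					 int len, __wsum sum,
 *					 int *src_err, int *dst_err);
 */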
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr