/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in 32-bit words and is always >= 5 (a minimal 20-byte header).
 *
 * In practice len == 5, but this is not guaranteed, so this code does
 * not attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)	/* sum the first two words */
	lwzu	r5,4(r3)
	addic.	r4,r4,-2	/* len -= 2; sets CR0 for the early exit */
	addc	r0,r0,r5
	mtctr	r4		/* words remaining */
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two 16-bit halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
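
/*
 * Reference model in C (a hedged sketch for illustration only, not part
 * of the build; ip_fast_csum_ref is a hypothetical name).  It shows the
 * arithmetic the assembly above performs: sum the header words, fold
 * the 64-bit accumulator down to 16 bits, and return the complement.
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static uint16_t ip_fast_csum_ref(const void *buf, size_t ihl)
 *	{
 *		const uint32_t *p = buf;
 *		uint64_t sum = 0;
 *		size_t i;
 *
 *		for (i = 0; i < ihl; i++)		// accumulate 32-bit words
 *			sum += p[i];
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 32
 *		sum = (sum & 0xffffffff) + (sum >> 32);
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 16
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		return (uint16_t)~sum;
 *	}
 */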

/*
 * Compute the checksum of a TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 *
 * There is no real gain in doing this specially for 64-bit, but the
 * 32-bit additions may carry into the upper half of the doubleword,
 * so we must still fold the result down from 64 bits.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two 16-bit halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
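
/*
 * Illustrative C equivalent (a sketch, not part of the build; the
 * function name is hypothetical).  The pseudo-header sum is just the
 * one's complement sum of saddr, daddr, len|proto and the partial sum,
 * folded from 64 down to 16 bits.  len < 65536 is assumed, as it
 * always is for a TCP/UDP pseudo-header.
 *
 *	#include <stdint.h>
 *
 *	static uint16_t csum_tcpudp_ref(uint32_t saddr, uint32_t daddr,
 *					uint32_t len, uint32_t proto,
 *					uint32_t sum)
 *	{
 *		uint64_t s = (uint64_t)saddr + daddr +
 *			     ((proto << 16) | len) + sum;
 *
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffffffff) + (s >> 32);
 *		s = (s & 0xffff) + (s >> 16);		// fold 32 -> 16
 *		s = (s & 0xffff) + (s >> 16);
 *		return (uint16_t)~s;
 *	}
 */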

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3		/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword.  Since
	 * odd-aligned addresses should be rare and would require more
	 * work to checksum correctly, we ignore that case and accept
	 * the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2	/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6	/* halfwords needed to reach alignment */
	mtctr	r6

1:
	lhz	r6,0(r3)	/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
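	/*
	 * Worked example: with len = 200 the test below sees 200 >> 7 = 1,
	 * so we take the unrolled path; ctr = 200/64 - 1 = 2, the loop body
	 * runs twice (128 bytes), the exit limb consumes another 64 bytes,
	 * and 200 & 63 = 8 bytes are left for the tail code.
	 */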
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6		/* 64-byte chunks ... */
	subi	r6,r6,1		/* ... less the one done by the exit limb */
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back adde instructions take 2 cycles
	 * because of the XER dependency.  This means the fastest this loop
	 * can go is 16 cycles per iteration.  The scheduling of the loop
	 * below has been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: fold in the last 64 bytes loaded by the loop */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63	/* up to 63 bytes remain */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	/* Pad the byte out to 16 bits; on big-endian the odd final
	 * byte is the high-order byte of its 16-bit word. */
	sldi	r9,r6,8
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
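
/*
 * The final fold above is equivalent to this C sketch (illustration
 * only, not part of the build; csum_fold64 is a hypothetical name):
 * rotating the 64-bit sum by 32 and adding puts hi + lo, including the
 * carry out of the low halves, in the upper 32 bits, which the shift
 * then extracts.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t csum_fold64(uint64_t sum)
 *	{
 *		uint64_t rot = (sum << 32) | (sum >> 32); // rldicl ...,32,0
 *
 *		return (uint32_t)((sum + rot) >> 32);     // add; srdi 32
 *	}
 */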

.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
.endm

.macro srcnr
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error_nr
	.previous
.endm

.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
.endm

.macro dstnr
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error_nr
	.previous
.endm
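
/*
 * Each use of the macros above records the address of the immediately
 * following load or store in the __ex_table section.  If that access
 * faults, the exception handler looks the address up in the table and
 * resumes at the associated error label instead of the faulting
 * instruction.  source/dest are for accesses made while the stack
 * frame holding r14-r16 is live; srcnr/dstnr ("no restore") are for
 * accesses outside it, so each error path restores exactly the state
 * its fault site had.
 */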

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively.  The caller must take any
 * action required in this case (zeroing memory, recalculating a
 * partial checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3		/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword.  Since
	 * odd-aligned addresses should be rare and would require more
	 * work to checksum correctly, we ignore that case and accept
	 * the potential slowdown of unaligned loads.
	 *
	 * If the source and destination have different alignments we
	 * align only the source.  This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2	/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4		/* use r9: r7 holds src_err */
	sub	r6,r9,r6	/* halfwords needed to reach alignment */
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)	/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6		/* 64-byte chunks ... */
	subi	r6,r6,1		/* ... less the one done by the exit limb */
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back adde instructions take 2 cycles
	 * because of the XER dependency.  This means the fastest this loop
	 * can go is 16 cycles per iteration.  The scheduling of the loop
	 * below has been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: checksum and store the final 64 bytes */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63	/* up to 63 bytes remain */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	/* Pad the byte out to 16 bits; on big-endian the odd final
	 * byte is the high-order byte of its 16-bit word. */
	sldi	r9,r6,8
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	/*
	 * The fault hit inside the unrolled loop: restore the saved
	 * non-volatiles and pop the stack frame before reporting.
	 */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0		/* anywhere to report the fault? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
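
/*
 * Reference model in C (a hedged sketch, not part of the build; the
 * name csum_copy_ref is hypothetical).  It mirrors the arithmetic
 * only, assuming big-endian byte order as this file targets, and
 * omits the fault handling: the real routine reports faults through
 * *src_err and *dst_err rather than returning a status.  Its 32-bit
 * result agrees with the assembly after the final 16-bit fold, though
 * the intermediate 32-bit values may differ.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint32_t csum_copy_ref(const void *src, void *dst,
 *				      size_t len, uint32_t sum)
 *	{
 *		const uint8_t *p = src;
 *		uint64_t s = sum;
 *		size_t i;
 *
 *		memcpy(dst, src, len);
 *		for (i = 0; i + 1 < len; i += 2)	// 16-bit big-endian words
 *			s += ((uint64_t)p[i] << 8) | p[i + 1];
 *		if (len & 1)				// pad the final odd byte
 *			s += (uint64_t)p[len - 1] << 8;
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffffffff) + (s >> 32);
 *		return (uint32_t)s;
 *	}
 */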