Blame - arch/powerpc/lib/checksum_64.S - kernel/msm-4.9

blob: 18245af38aea7f747bd034dd5892c300ce73111c [file] [log] [blame]

Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	1	/*
				2	* This file contains assembly-language implementations
				3	* of IP-style 1's complement checksum routines.
				4	*
				5	* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
				6	*
				7	* This program is free software; you can redistribute it and/or
				8	* modify it under the terms of the GNU General Public License
				9	* as published by the Free Software Foundation; either version
				10	* 2 of the License, or (at your option) any later version.
				11	*
				12	* Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
				13	*/
				14
				15	#include <linux/sys.h>
				16	#include <asm/processor.h>
				17	#include <asm/errno.h>
				18	#include <asm/ppc_asm.h>
				19
				20	/*
				21	* ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
				22	* len is in 32-bit words and is always >= 5. The 16-bit result is returned in r3.
				23	*
				24	* In practice len == 5, but this is not guaranteed. So this code does not
				25	* attempt to use doubleword instructions.
				26	*/
				27	_GLOBAL(ip_fast_csum)
				28	lwz r0,0(r3) /* r0 = first word */
				29	lwzu r5,4(r3) /* r5 = second word; r3 now points at it */
				30	addic. r4,r4,-2 /* r4 = words still to add; sets CR0 for the blelr below */
				31	addc r0,r0,r5 /* start the carry chain (replaces the CA left by addic.) */
				32	mtctr r4 /* loop count = remaining words */
				33	blelr- /* len <= 2: return straight away (hinted as unlikely) */
				34	1: lwzu r4,4(r3) /* next word, advancing r3 */
				35	adde r0,r0,r4 /* accumulate with carry-in */
				36	bdnz 1b
				37	addze r0,r0 /* add in final carry */
				38	rldicl r4,r0,32,0 /* fold two 32-bit halves together */
				39	add r0,r0,r4 /* upper word = hi + lo plus the carry out of the lower word */
				40	srdi r0,r0,32 /* r0 = 32-bit sum */
				41	rlwinm r3,r0,16,0,31 /* fold two halves together */
				42	add r3,r0,r3 /* upper halfword of the low word = 16-bit folded sum */
				43	not r3,r3 /* one's complement */
				44	srwi r3,r3,16 /* move the upper halfword down into the result */
				45	blr
				46
				47	/*
				48	* Compute checksum of TCP or UDP pseudo-header:
				49	* csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
				50	* No real gain trying to do this specially for 64 bit, but
				51	* the 32 bit addition may spill into the upper bits of
				52	* the doubleword so we still must fold it down from 64.
				53	* The complemented 16-bit result is returned in r3. */
				54	_GLOBAL(csum_tcpudp_magic)
				55	rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
				56	addc r0,r3,r4 /* add 4 32-bit words together */
				57	adde r0,r0,r5 /* + combined proto/len word */
				58	adde r0,r0,r7 /* + caller-supplied sum */
				59	rldicl r4,r0,32,0 /* fold 64 bit value */
				60	add r0,r4,r0 /* upper word = hi + lo plus the carry out of the lower word */
				61	srdi r0,r0,32 /* r0 = 32-bit sum */
				62	rlwinm r3,r0,16,0,31 /* fold two halves together */
				63	add r3,r0,r3 /* upper halfword of the low word = 16-bit folded sum */
				64	not r3,r3 /* one's complement */
				65	srwi r3,r3,16 /* move the upper halfword down into the result */
				66	blr
				67
Anton Blanchard	9b83ecb	2010-08-02 20:08:34 +0000	[diff] [blame]	68	#define STACKFRAMESIZE 256 /* bytes of stack frame pushed by the stdu in the unrolled loops */
				69	#define STK_REG(i) (112 + ((i)-14)*8) /* frame offset of the save slot for register i: 112 for r14, 8 bytes per register */
				70
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	71	/*
				72	* Computes the checksum of a memory block at buff, length len,
				73	* and adds in "sum" (32-bit).
				74	* The sum is folded to 32 bits (not complemented) and returned in r3.
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	75	* csum_partial(r3=buff, r4=len, r5=sum)
				76	*/
				77	_GLOBAL(csum_partial)
Anton Blanchard	9b83ecb	2010-08-02 20:08:34 +0000	[diff] [blame]	78	addic r0,r5,0 /* r0 = sum, clear carry */
				79
				80	srdi. r6,r4,3 /* less than 8 bytes? */
				81	beq .Lcsum_tail_word
				82
				83	/*
				84	* If only halfword aligned, align to a double word. Since odd
				85	* aligned addresses should be rare and they would require more
				86	* work to calculate the correct checksum, we ignore that case
				87	* and take the potential slowdown of unaligned loads.
				88	*/
				89	rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
				90	beq .Lcsum_aligned
				91
				92	li r7,4
				93	sub r6,r7,r6 /* r6 = halfwords needed to reach doubleword alignment */
				94	mtctr r6
				95
				96	1:
				97	lhz r6,0(r3) /* align to doubleword */
				98	subi r4,r4,2
				99	addi r3,r3,2
				100	adde r0,r0,r6
				101	bdnz 1b
				102
				103	.Lcsum_aligned:
				104	/*
				105	* We unroll the loop such that each iteration is 64 bytes with an
				106	* entry and exit limb of 64 bytes, meaning a minimum size of
				107	* 128 bytes.
				108	*/
				109	srdi. r6,r4,7
				110	beq .Lcsum_tail_doublewords /* len < 128 */
				111
				112	srdi r6,r4,6 /* r6 = number of whole 64-byte chunks */
				113	subi r6,r6,1 /* the last chunk is summed by the exit limb after the loop */
				114	mtctr r6
				115
				116	stdu r1,-STACKFRAMESIZE(r1) /* push a frame so r14-r16 can be saved and used as temporaries */
				117	std r14,STK_REG(r14)(r1)
				118	std r15,STK_REG(r15)(r1)
				119	std r16,STK_REG(r16)(r1)
				120
				121	ld r6,0(r3) /* preload the first 32 bytes */
				122	ld r9,8(r3)
				123
				124	ld r10,16(r3)
				125	ld r11,24(r3)
				126
				127	/*
				128	* On POWER6 and POWER7 back to back addes take 2 cycles because of
				129	* the XER dependency. This means the fastest this loop can go is
				130	* 16 cycles per iteration. The scheduling of the loop below has
				131	* been shown to hit this on both POWER6 and POWER7.
				132	*/
				133	.align 5
				134	2:
				135	adde r0,r0,r6
				136	ld r12,32(r3)
				137	ld r14,40(r3)
				138
				139	adde r0,r0,r9
				140	ld r15,48(r3)
				141	ld r16,56(r3)
				142	addi r3,r3,64 /* advance to the next 64-byte chunk */
				143
				144	adde r0,r0,r10
				145
				146	adde r0,r0,r11
				147
				148	adde r0,r0,r12
				149
				150	adde r0,r0,r14
				151
				152	adde r0,r0,r15
				153	ld r6,0(r3) /* preload the first 32 bytes of the next chunk */
				154	ld r9,8(r3)
				155
				156	adde r0,r0,r16
				157	ld r10,16(r3)
				158	ld r11,24(r3)
				159	bdnz 2b
				160
				161
				162	adde r0,r0,r6 /* exit limb: sum the final 64-byte chunk, no further preload */
				163	ld r12,32(r3)
				164	ld r14,40(r3)
				165
				166	adde r0,r0,r9
				167	ld r15,48(r3)
				168	ld r16,56(r3)
				169	addi r3,r3,64
				170
				171	adde r0,r0,r10
				172	adde r0,r0,r11
				173	adde r0,r0,r12
				174	adde r0,r0,r14
				175	adde r0,r0,r15
				176	adde r0,r0,r16
				177
				178	ld r14,STK_REG(r14)(r1) /* restore the saved registers and pop the frame */
				179	ld r15,STK_REG(r15)(r1)
				180	ld r16,STK_REG(r16)(r1)
				181	addi r1,r1,STACKFRAMESIZE
				182
				183	andi. r4,r4,63 /* r4 = bytes left after the 64-byte chunks */
				184
				185	.Lcsum_tail_doublewords: /* Up to 127 bytes to go */
				186	srdi. r6,r4,3 /* r6 = whole doublewords left */
				187	beq .Lcsum_tail_word
				188
				189	mtctr r6
				190	3:
				191	ld r6,0(r3)
				192	addi r3,r3,8
				193	adde r0,r0,r6
				194	bdnz 3b
				195
				196	andi. r4,r4,7 /* r4 = bytes left after the doublewords */
				197
				198	.Lcsum_tail_word: /* Up to 7 bytes to go */
				199	srdi. r6,r4,2
				200	beq .Lcsum_tail_halfword
				201
				202	lwz r6,0(r3)
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	203	addi r3,r3,4
Anton Blanchard	9b83ecb	2010-08-02 20:08:34 +0000	[diff] [blame]	204	adde r0,r0,r6
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	205	subi r4,r4,4
Anton Blanchard	9b83ecb	2010-08-02 20:08:34 +0000	[diff] [blame]	206
				207	.Lcsum_tail_halfword: /* Up to 3 bytes to go */
				208	srdi. r6,r4,1
				209	beq .Lcsum_tail_byte
				210
				211	lhz r6,0(r3)
				212	addi r3,r3,2
				213	adde r0,r0,r6
				214	subi r4,r4,2
				215
				216	.Lcsum_tail_byte: /* Up to 1 byte to go */
				217	andi. r6,r4,1
				218	beq .Lcsum_finish
				219
				220	lbz r6,0(r3)
				221	sldi r9,r6,8 /* Pad the byte out to 16 bits */
				222	adde r0,r0,r9
				223
				224	.Lcsum_finish:
				225	addze r0,r0 /* add in final carry */
				226	rldicl r4,r0,32,0 /* fold two 32 bit halves together */
				227	add r3,r4,r0 /* upper word = hi + lo plus the carry out of the lower word */
				228	srdi r3,r3,32 /* return the 32-bit sum in r3 */
				229	blr
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	230
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	231
				232	.macro source
				233	100:
				234	.section __ex_table,"a"
				235	.align 3
				236	.llong 100b,.Lsrc_error
				237	.previous
				238	.endm
				239
				240	.macro dest
				241	200:
				242	.section __ex_table,"a"
				243	.align 3
				244	.llong 200b,.Ldest_error
				245	.previous
				246	.endm
				247
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	248	/*
				249	* Computes the checksum of a memory block at src, length len,
				250	* and adds in "sum" (32-bit), while copying the block to dst.
				251	* If an access exception occurs on src or dst, it stores -EFAULT
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	252	* to src_err or dst_err respectively. The caller must take any action
				253	* required in this case (zeroing memory, recalculating partial checksum etc).
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	254	*
				255	* csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
				256	*/
				257	_GLOBAL(csum_partial_copy_generic)
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	258	addic r0,r6,0 /* clear carry */
				259
				260	srdi. r6,r5,3 /* less than 8 bytes? */
				261	beq .Lcopy_tail_word
				262
				263	/*
				264	* If only halfword aligned, align to a double word. Since odd
				265	* aligned addresses should be rare and they would require more
				266	* work to calculate the correct checksum, we ignore that case
				267	* and take the potential slowdown of unaligned loads.
				268	*
				269	* If the source and destination are relatively unaligned we only
				270	* align the source. This keeps things simple.
				271	*/
				272	rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */
				273	beq .Lcopy_aligned
				274
				275	li r7,4
				276	sub r6,r7,r6
				277	mtctr r6
				278
				279	1:
				280	source; lhz r6,0(r3) /* align to doubleword */
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	281	subi r5,r5,2
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	282	addi r3,r3,2
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	283	adde r0,r0,r6
				284	dest; sth r6,0(r4)
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	285	addi r4,r4,2
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	286	bdnz 1b
				287
				288	.Lcopy_aligned:
				289	/*
				290	* We unroll the loop such that each iteration is 64 bytes with an
				291	* entry and exit limb of 64 bytes, meaning a minimum size of
				292	* 128 bytes.
				293	*/
				294	srdi. r6,r5,7
				295	beq .Lcopy_tail_doublewords /* len < 128 */
				296
				297	srdi r6,r5,6
				298	subi r6,r6,1
				299	mtctr r6
				300
				301	stdu r1,-STACKFRAMESIZE(r1)
				302	std r14,STK_REG(r14)(r1)
				303	std r15,STK_REG(r15)(r1)
				304	std r16,STK_REG(r16)(r1)
				305
				306	source; ld r6,0(r3)
				307	source; ld r9,8(r3)
				308
				309	source; ld r10,16(r3)
				310	source; ld r11,24(r3)
				311
				312	/*
				313	* On POWER6 and POWER7 back to back addes take 2 cycles because of
				314	* the XER dependency. This means the fastest this loop can go is
				315	* 16 cycles per iteration. The scheduling of the loop below has
				316	* been shown to hit this on both POWER6 and POWER7.
				317	*/
				318	.align 5
				319	2:
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	320	adde r0,r0,r6
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	321	source; ld r12,32(r3)
				322	source; ld r14,40(r3)
				323
				324	adde r0,r0,r9
				325	source; ld r15,48(r3)
				326	source; ld r16,56(r3)
				327	addi r3,r3,64
				328
				329	adde r0,r0,r10
				330	dest; std r6,0(r4)
				331	dest; std r9,8(r4)
				332
				333	adde r0,r0,r11
				334	dest; std r10,16(r4)
				335	dest; std r11,24(r4)
				336
				337	adde r0,r0,r12
				338	dest; std r12,32(r4)
				339	dest; std r14,40(r4)
				340
				341	adde r0,r0,r14
				342	dest; std r15,48(r4)
				343	dest; std r16,56(r4)
				344	addi r4,r4,64
				345
				346	adde r0,r0,r15
				347	source; ld r6,0(r3)
				348	source; ld r9,8(r3)
				349
				350	adde r0,r0,r16
				351	source; ld r10,16(r3)
				352	source; ld r11,24(r3)
				353	bdnz 2b
				354
				355
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	356	adde r0,r0,r6
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	357	source; ld r12,32(r3)
				358	source; ld r14,40(r3)
				359
				360	adde r0,r0,r9
				361	source; ld r15,48(r3)
				362	source; ld r16,56(r3)
				363	addi r3,r3,64
				364
				365	adde r0,r0,r10
				366	dest; std r6,0(r4)
				367	dest; std r9,8(r4)
				368
				369	adde r0,r0,r11
				370	dest; std r10,16(r4)
				371	dest; std r11,24(r4)
				372
				373	adde r0,r0,r12
				374	dest; std r12,32(r4)
				375	dest; std r14,40(r4)
				376
				377	adde r0,r0,r14
				378	dest; std r15,48(r4)
				379	dest; std r16,56(r4)
				380	addi r4,r4,64
				381
				382	adde r0,r0,r15
				383	adde r0,r0,r16
				384
				385	ld r14,STK_REG(r14)(r1)
				386	ld r15,STK_REG(r15)(r1)
				387	ld r16,STK_REG(r16)(r1)
				388	addi r1,r1,STACKFRAMESIZE
				389
				390	andi. r5,r5,63
				391
				392	.Lcopy_tail_doublewords: /* Up to 127 bytes to go */
				393	srdi. r6,r5,3
				394	beq .Lcopy_tail_word
				395
				396	mtctr r6
				397	3:
				398	source; ld r6,0(r3)
				399	addi r3,r3,8
				400	adde r0,r0,r6
				401	dest; std r6,0(r4)
				402	addi r4,r4,8
				403	bdnz 3b
				404
				405	andi. r5,r5,7
				406
				407	.Lcopy_tail_word: /* Up to 7 bytes to go */
				408	srdi. r6,r5,2
				409	beq .Lcopy_tail_halfword
				410
				411	source; lwz r6,0(r3)
				412	addi r3,r3,4
				413	adde r0,r0,r6
				414	dest; stw r6,0(r4)
				415	addi r4,r4,4
				416	subi r5,r5,4
				417
				418	.Lcopy_tail_halfword: /* Up to 3 bytes to go */
				419	srdi. r6,r5,1
				420	beq .Lcopy_tail_byte
				421
				422	source; lhz r6,0(r3)
				423	addi r3,r3,2
				424	adde r0,r0,r6
				425	dest; sth r6,0(r4)
				426	addi r4,r4,2
				427	subi r5,r5,2
				428
				429	.Lcopy_tail_byte: /* Up to 1 byte to go */
				430	andi. r6,r5,1
				431	beq .Lcopy_finish
				432
				433	source; lbz r6,0(r3)
				434	sldi r9,r6,8 /* Pad the byte out to 16 bits */
				435	adde r0,r0,r9
				436	dest; stb r6,0(r4)
				437
				438	.Lcopy_finish:
				439	addze r0,r0 /* add in final carry */
				440	rldicl r4,r0,32,0 /* fold two 32 bit halves together */
				441	add r3,r4,r0
				442	srdi r3,r3,32
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	443	blr
				444
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	445	.Lsrc_error:
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	446	cmpdi 0,r7,0
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	447	beqlr
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	448	li r6,-EFAULT
				449	stw r6,0(r7)
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	450	blr
				451
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	452	.Ldest_error:
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	453	cmpdi 0,r8,0
Anton Blanchard	fdd374b	2010-08-02 20:09:52 +0000	[diff] [blame]	454	beqlr
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	455	li r6,-EFAULT
				456	stw r6,0(r8)
Paul Mackerras	14cf11a	2005-09-26 16:04:21 +1000	[diff] [blame]	457	blr