/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * csum_partial(buff, len, sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0
	subi	r3,r3,4
	srwi.	r6,r4,2
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r5,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r5,4(r3)	/* do 2 bytes to get aligned */
	addi	r3,r3,2
	subi	r4,r4,2
	addc	r0,r0,r5
	srwi.	r6,r4,2		/* # words to do */
	beq	3f
1:	mtctr	r6
2:	lwzu	r5,4(r3)	/* the bdnz has zero overhead, so it should */
	adde	r0,r0,r5	/* be unnecessary to unroll this loop */
	bdnz	2b
	andi.	r4,r4,3
3:	cmpwi	0,r4,2
	blt+	4f
	lhz	r5,4(r3)
	addi	r3,r3,2
	subi	r4,r4,2
	adde	r0,r0,r5
4:	cmpwi	0,r4,1
	bne+	5f
	lbz	r5,4(r3)
	slwi	r5,r5,8		/* Upper byte of word */
	adde	r0,r0,r5
5:	addze	r3,r0		/* add in final carry */
	blr
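
/*
 * For reference, a rough C sketch of what csum_partial computes
 * (illustrative only, not part of the build; the name csum_partial_ref
 * and the u64 accumulator are inventions of this comment):
 *
 *	static u32 csum_partial_ref(const u8 *buff, u32 len, u32 sum)
 *	{
 *		u64 acc = sum;
 *
 *		for (; len >= 4; len -= 4, buff += 4)
 *			acc += *(const u32 *)buff;	// cf. the lwzu/adde loop
 *		if (len >= 2) {				// trailing halfword
 *			acc += *(const u16 *)buff;
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)				// odd byte goes in the
 *			acc += (u32)*buff << 8;		// upper half, cf. slwi
 *		while (acc >> 32)			// fold carries back in,
 *			acc = (u32)acc + (acc >> 32);	// cf. adde/addze
 *		return (u32)acc;
 *	}
 *
 * This assumes big-endian loads (as on 32-bit powerpc) and a word-aligned
 * buff; the assembly also handles buffers starting on a halfword boundary.
 */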

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
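
/*
 * Semantically, csum_partial_copy_generic behaves like the following
 * sketch (illustrative only, not part of the build; the wrapper name
 * csum_copy_ref is made up):
 *
 *	u32 csum_copy_ref(const u8 *src, u8 *dst, int len, u32 sum,
 *			  int *src_err, int *dst_err)
 *	{
 *		memcpy(dst, src, len);
 *		return csum_partial(dst, len, sum);
 *	}
 *
 * except that the copy and the checksum are fused into a single pass
 * over the data, and a fault on src or dst is caught through the
 * exception table and reported via *src_err / *dst_err (when non-NULL)
 * instead of propagating.
 */
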
/*
 * Copy and checksum 16 bytes.  Each load and store carries a numeric
 * label (8n0 .. 8n7) so that CSUM_COPY_16_BYTES_EXCODE(n) below can
 * emit an exception-table entry for every access that may fault:
 * loads are fixed up at src_error, stores at dst_error.
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,src_error;		\
	.long	8 ## n ## 1b,src_error;		\
	.long	8 ## n ## 2b,src_error;		\
	.long	8 ## n ## 3b,src_error;		\
	.long	8 ## n ## 4b,dst_error;		\
	.long	8 ## n ## 5b,dst_error;		\
	.long	8 ## n ## 6b,dst_error;		\
	.long	8 ## n ## 7b,dst_error;		\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)
	stw	r8,8(r1)

	andi.	r0,r4,1			/* is destination address even ? */
	cmplwi	cr7,r0,0
	addic	r12,r6,0
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
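
/*
 * In rough C, the prefetch-depth choice above is (illustrative only;
 * nr_lines stands for the cacheline count held in r0):
 *
 *	int depth = 0;			// a single line: no prefetch
 *	if (nr_lines > 1)
 *		depth = nr_lines > MAX_COPY_PREFETCH ?
 *			MAX_COPY_PREFETCH : 1;
 *
 * and then "depth" dcbt instructions warm the cache before the main
 * loop is entered.
 */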

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* swap bytes for odd destination */
	blr

/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error
	.text

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error