/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
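/*
 * Roughly, in C (an illustrative sketch only, not part of the build;
 * "ihl" is the header length in 32-bit words):
 *
 *	unsigned short ip_fast_csum(const unsigned int *iph,
 *				    unsigned int ihl)
 *	{
 *		unsigned long sum = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < ihl; i++)	// sum the header words
 *			sum += iph[i];
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 32
 *		sum = (sum & 0xffffffff) + (sum >> 32);
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 16
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;
 *	}
 */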
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0			/* add in final carry */
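	/*
	 * The folds below use the rotate-and-add trick: adding a value
	 * to a copy of itself rotated by half its width leaves, in the
	 * upper half of the result, the sum of the two halves plus the
	 * end-around carry, i.e. a ones'-complement fold. We fold
	 * 64 bits down to 32, then 32 down to 16, and return the
	 * complement of the final upper halfword.
	 */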
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31		/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
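/*
 * Roughly, in C (an illustrative sketch only, not part of the build):
 *
 *	unsigned short csum_tcpudp_magic(unsigned int saddr,
 *					 unsigned int daddr,
 *					 unsigned int len,
 *					 unsigned int proto,
 *					 unsigned int sum)
 *	{
 *		unsigned long s;
 *
 *		s = (unsigned long)saddr + daddr + sum +
 *		    ((proto << 16) + len);		// proto above len
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffffffff) + (s >> 32);
 *		s = (s & 0xffff) + (s >> 16);		// fold 32 -> 16
 *		s = (s & 0xffff) + (s >> 16);
 *		return ~s & 0xffff;
 *	}
 */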
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
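/*
 * Semantically (an illustrative big-endian C sketch only; the asm sums
 * doublewords, so the 32-bit intermediate need not be bit-identical,
 * but it folds to the same 16-bit checksum):
 *
 *	unsigned int csum_partial(const unsigned char *buff, int len,
 *				  unsigned int sum)
 *	{
 *		unsigned long s = sum;
 *
 *		for (; len > 1; len -= 2, buff += 2)	// whole halfwords
 *			s += (buff[0] << 8) | buff[1];
 *		if (len)				// trailing odd byte,
 *			s += buff[0] << 8;		// padded on the right
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffffffff) + (s >> 32);
 *		return s;
 *	}
 */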
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
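	/*
	 * For example, if buf % 8 == 2 then r6 below is 1 and we sum
	 * 4 - 1 = 3 halfwords before the buffer is doubleword aligned.
	 */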
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
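	/*
	 * For example, len = 200 takes this path: the ctr below is set
	 * to (200 >> 6) - 1 = 2, so the entry limb, two loop iterations
	 * and the exit limb consume 3 * 64 = 192 bytes, leaving
	 * 200 & 63 = 8 bytes to the tail code.
	 */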
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
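	/*
	 * (Eight dependent addes per 64-byte iteration at 2 cycles each
	 * account for the 16-cycle bound mentioned above.)
	 */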
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


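/*
 * The source and dest macros below tag the load or store that follows
 * them with an __ex_table entry pairing the instruction's address with
 * a fixup handler (.Lsrc_error or .Ldest_error), so that a fault on
 * that access branches to the handler instead of oopsing.
 */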
	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
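/*
 * The caller's side of the contract, roughly (an illustrative sketch
 * only):
 *
 *	int src_err = 0, dst_err = 0;
 *	unsigned int csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err)
 *		// fault: csum is not valid, recover as required
 *
 * Either error pointer may be NULL if the caller does not care about
 * faults on that side.
 */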
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0			/* is there a src_err to store to? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0			/* is there a dst_err to store to? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr