/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
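/*
 * For reference, the C-level view of this routine is roughly (a sketch;
 * the authoritative declaration lives in asm/checksum.h):
 *
 *	__sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 *
 * i.e. the value returned in r3 is the folded, complemented 16-bit
 * checksum of the ihl 32-bit words at iph.
 */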
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 *	csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
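/*
 * In C terms this computes roughly (a sketch; fold_to_16_bits() is only a
 * name for the 1's complement fold done by the rldicl/rlwinm sequence
 * below, not a real helper):
 *
 *	u64 s = (u64)saddr + daddr + sum + (((u32)proto << 16) | (len & 0xffff));
 *	return ~fold_to_16_bits(s);
 */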
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
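/*
 * A C sketch of the idea, assuming an 8-byte aligned buffer whose length
 * is a multiple of 8 (the code below also handles the ragged ends):
 *
 *	u64 s = sum, v;
 *	for (i = 0; i < len; i += 8) {
 *		v = *(u64 *)(buff + i);
 *		s += v;
 *		if (s < v)
 *			s++;			(end-around carry, like adde)
 *	}
 *	return (s + rol64(s, 32)) >> 32;	(fold 64 -> 32 bits, as below)
 *
 * The returned value is a 32-bit partial checksum; it is not complemented.
 */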
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


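/*
 * Exception table helpers: each macro below emits a local label in front
 * of a single load from src ("source"/"srcnr") or store to dst
 * ("dest"/"dstnr"), plus an __ex_table entry, so that an access fault on
 * that instruction branches to the matching fixup label.  The "nr"
 * (no-restore) variants are used where r14-r16 have not been saved and no
 * stack frame is active, so their fixups skip the register and stack
 * restore done by .Lsrc_error/.Ldest_error.
 */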
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
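/*
 * For reference, the C declaration is usually along these lines (a sketch;
 * see asm/checksum.h for the authoritative prototype):
 *
 *	__wsum csum_partial_copy_generic(const void *src, void *dst,
 *					 int len, __wsum sum,
 *					 int *src_err, int *dst_err);
 */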
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr