/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */
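
/*
 * Background: the IP checksum is the 16-bit one's complement of the
 * one's complement sum of the data (RFC 1071); carries are folded back
 * into the sum, e.g. 0xffff + 0x0002 = 0x10001, which folds to 0x0002.
 * The routines below return a 32-bit partial sum in r3; callers are
 * expected to reduce it to 16 bits (typically with csum_fold()).
 */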

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
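/*
 * Illustrative C sketch of the doubleword accumulation below (for
 * reference only, not built).  It assumes buff is 8-byte aligned and
 * len is a multiple of 8; the real code also handles the misaligned
 * head and the sub-doubleword tail.  The "if (s < v) s++;" mirrors the
 * way adde/addze feed each carry back into the running sum:
 *
 *	u64 s = sum;
 *	const u64 *p = buff;
 *
 *	while (len >= 8) {
 *		u64 v = *p++;
 *
 *		s += v;
 *		if (s < v)
 *			s++;
 *		len -= 8;
 *	}
 *
 * The 64-bit sum is then folded to 32 bits at .Lcsum_finish.
 */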
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */
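
	/*
	 * ctr is set to len/64 - 1: the first loads are issued before the
	 * loop is entered, and the adds for the final 64-byte block are
	 * completed by the exit limb after the bdnz.
	 */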

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency.  With eight adde per 64-byte
	 * iteration, the fastest this loop can go is 16 cycles per
	 * iteration.  The scheduling of the loop below has been shown to
	 * hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

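	/*
	 * Fold the 64-bit sum to 32 bits: rotating by 32 swaps the two
	 * halves, the add leaves (high + low) in the upper word with any
	 * carry out of the lower word folded into it, and the shift
	 * extracts that upper word.  In C this is roughly
	 * (u32)((sum + rol64(sum, 32)) >> 32).
	 */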
.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


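/*
 * Exception table helpers for the copy routine below.  Each macro puts
 * a local label on the access that follows it and records a __ex_table
 * entry so that a fault at that access branches to the matching error
 * handler.  "source" and "dest" are used inside the unrolled loop,
 * where r14-r16 and the stack frame must be restored before returning;
 * the "srcnr"/"dstnr" forms are used everywhere else and skip the
 * restore.
 */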
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
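/*
 * Typical use from C (illustrative only; the actual prototype lives in
 * asm/checksum.h):
 *
 *	int err = 0;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, csum, &err, NULL);
 *	if (err)
 *		... the copy faulted, recover as appropriate ...
 *
 * Passing a NULL error pointer suppresses the -EFAULT store for that
 * side, as the cmpdi/beqlr tests in the error paths below show.
 */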
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are misaligned relative to each
	 * other, we only align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
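	/*
	 * The unrolled loop below mirrors the one in __csum_partial, with
	 * each 64-byte block also stored to the destination; the loads are
	 * tagged with "source" and the stores with "dest" so that faults
	 * reach the error handlers at the end of the function.
	 */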
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

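/*
 * Error exits, reached via the __ex_table entries created by the macros
 * above.  .Lsrc_error and .Ldest_error handle faults taken inside the
 * unrolled loop, so they restore r14-r16 and pop the stack frame before
 * falling into the common code; the _nr variants handle faults taken
 * elsewhere.  -EFAULT is stored only when the corresponding error
 * pointer is non-NULL.
 */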
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)