/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */
14
15#include <linux/sys.h>
16#include <asm/processor.h>
17#include <asm/errno.h>
18#include <asm/ppc_asm.h>
19
/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 *
 * Returns the accumulated sum, folded down to 32 bits, in r3.
 *
 * Register use:
 *   r0		64-bit accumulator; carries are chained through XER[CA]
 *		by adde and collected by the final addze
 *   r3		running source pointer (overwritten with the result)
 *   r4		bytes remaining
 *   r6,r9-r12	scratch / loaded data
 *   r7		scratch for the alignment count
 *   r14-r16	loaded data in the unrolled loop only; saved to and
 *		restored from a temporary stack frame around that loop
 *   ctr	loop counter
 *
 * Between the initial addic and the final addze only adde writes
 * XER[CA]; the pointer/length bookkeeping uses addi/subi/srdi./andi.,
 * which leave the carry alone.
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * r6 is the halfword index of buff within its doubleword (0..3).
	 * When it is non-zero, 4 - r6 halfwords (at most 6 bytes, and
	 * len >= 8 here) bring r3 up to the next doubleword boundary.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	/* ctr = len/64 - 1: the exit limb below handles the last 64 bytes */
	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	/* r14-r16 are used as extra data registers; preserve them */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* Entry limb: preload the first 32 bytes */
	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)		/* preload the next 64-byte chunk */
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	/* Exit limb: sum the final 64 bytes without preloading further */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63		/* bytes left after the 64-byte chunks */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	/*
	 * The odd trailing byte is the first (lowest-addressed) byte of a
	 * zero-padded 16-bit word.  On big-endian that is the high byte,
	 * so shift it up.
	 */
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	/* On little-endian the first byte is the low byte: add it as is */
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	/*
	 * r4 = r0 with its 32-bit halves swapped.  Adding it to r0 leaves
	 * hi + lo, plus the carry out of the low-half addition, in the
	 * upper 32 bits; shift that down into the result.
	 */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


/*
 * Exception-table annotations used by csum_partial_copy_generic.
 *
 * Each macro is written immediately in front of a load or store
 * ("source; ld r6,0(r3)").  It drops a local numeric label at that
 * instruction and emits an __ex_table entry pairing the instruction's
 * address with a fixup label:
 *
 *   source	load,  fixup .Lsrc_error
 *   dest	store, fixup .Ldest_error
 *   srcnr	load,  fixup .Lsrc_error_nr
 *   dstnr	store, fixup .Ldest_error_nr
 *
 * .Lsrc_error/.Ldest_error first reload r14-r16 and pop the temporary
 * stack frame, so "source"/"dest" belong only on accesses made while
 * that frame is live (the 64-byte unrolled loop).  The "nr" labels skip
 * that restore and are for accesses made with no frame allocated.
 */
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 *
 * On success returns the accumulated sum, folded down to 32 bits, in r3.
 * On a faulting access the fixup code at the end of the routine returns
 * directly; r3 is not set to a checksum on that path.
 *
 * Register use:
 *   r0		64-bit accumulator; carries are chained through XER[CA]
 *   r3,r4	running source / destination pointers
 *   r5		bytes remaining
 *   r6,r9-r12	scratch / data in flight (r6 is free once sum is in r0)
 *   r7,r8	src_err / dst_err; must stay intact for the fixup code
 *   r14-r16	data in flight in the unrolled loop only; saved to and
 *		restored from a temporary stack frame around that loop
 *   ctr	loop counter
 *
 * Loads and stores are tagged with source/dest while the stack frame is
 * live, and with srcnr/dstnr everywhere else, so the matching fixup
 * knows whether r14-r16 and r1 need restoring.
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 *
	 * r6 is the halfword index of src within its doubleword (0..3).
	 * When it is non-zero, 4 - r6 halfwords (at most 6 bytes, and
	 * len >= 8 here) bring r3 up to the next doubleword boundary.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4			/* r9, not r7: r7 holds src_err */
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	/* ctr = len/64 - 1: the exit limb below handles the last 64 bytes */
	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	/* r14-r16 are used as extra data registers; preserve them */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* Entry limb: preload the first 32 bytes */
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)		/* preload the next 64-byte chunk */
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	/* Exit limb: sum and store the final 64 bytes, no further preload */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63		/* bytes left after the 64-byte chunks */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	/*
	 * The odd trailing byte is the first (lowest-addressed) byte of a
	 * zero-padded 16-bit word.  On big-endian that is the high byte,
	 * so shift it up.
	 */
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	/* On little-endian the first byte is the low byte: add it as is */
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	/*
	 * r4 = r0 with its 32-bit halves swapped.  Adding it to r0 leaves
	 * hi + lo, plus the carry out of the low-half addition, in the
	 * upper 32 bits; shift that down into the result.
	 */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

/*
 * Fixup for a faulting load tagged "source": the stack frame is live,
 * so restore r14-r16 and pop it before reporting the error.
 */
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
/* Fixup for a faulting load tagged "srcnr": no frame to undo */
.Lsrc_error_nr:
	cmpdi	0,r7,0			/* src_err == NULL? */
	beqlr				/* nothing to report into */
	li	r6,-EFAULT
	stw	r6,0(r7)		/* *src_err = -EFAULT */
	blr

/*
 * Fixup for a faulting store tagged "dest": the stack frame is live,
 * so restore r14-r16 and pop it before reporting the error.
 */
.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
/* Fixup for a faulting store tagged "dstnr": no frame to undo */
.Ldest_error_nr:
	cmpdi	0,r8,0			/* dst_err == NULL? */
	beqlr				/* nothing to report into */
	li	r6,-EFAULT
	stw	r6,0(r8)		/* *dst_err = -EFAULT */
	blr