/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */
| 14 | |
| 15 | #include <linux/sys.h> |
| 16 | #include <asm/processor.h> |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 17 | #include <asm/cache.h> |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 18 | #include <asm/errno.h> |
| 19 | #include <asm/ppc_asm.h> |
| 20 | |
| 21 | .text |
| 22 | |
| 23 | /* |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 24 | * computes the checksum of a memory block at buff, length len, |
| 25 | * and adds in "sum" (32-bit) |
| 26 | * |
| 27 | * csum_partial(buff, len, sum) |
| 28 | */ |
| 29 | _GLOBAL(csum_partial) |
| 30 | addic r0,r5,0 |
| 31 | subi r3,r3,4 |
| 32 | srwi. r6,r4,2 |
| 33 | beq 3f /* if we're doing < 4 bytes */ |
| 34 | andi. r5,r3,2 /* Align buffer to longword boundary */ |
| 35 | beq+ 1f |
| 36 | lhz r5,4(r3) /* do 2 bytes to get aligned */ |
| 37 | addi r3,r3,2 |
| 38 | subi r4,r4,2 |
| 39 | addc r0,r0,r5 |
| 40 | srwi. r6,r4,2 /* # words to do */ |
| 41 | beq 3f |
| 42 | 1: mtctr r6 |
| 43 | 2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ |
| 44 | adde r0,r0,r5 /* be unnecessary to unroll this loop */ |
| 45 | bdnz 2b |
| 46 | andi. r4,r4,3 |
| 47 | 3: cmpwi 0,r4,2 |
| 48 | blt+ 4f |
| 49 | lhz r5,4(r3) |
| 50 | addi r3,r3,2 |
| 51 | subi r4,r4,2 |
| 52 | adde r0,r0,r5 |
| 53 | 4: cmpwi 0,r4,1 |
| 54 | bne+ 5f |
| 55 | lbz r5,4(r3) |
| 56 | slwi r5,r5,8 /* Upper byte of word */ |
| 57 | adde r0,r0,r5 |
| 58 | 5: addze r3,r0 /* add in final carry */ |
| 59 | blr |
| 60 | |
| 61 | /* |
| 62 | * Computes the checksum of a memory block at src, length len, |
| 63 | * and adds in "sum" (32-bit), while copying the block to dst. |
| 64 | * If an access exception occurs on src or dst, it stores -EFAULT |
| 65 | * to *src_err or *dst_err respectively, and (for an error on |
| 66 | * src) zeroes the rest of dst. |
| 67 | * |
| 68 | * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) |
| 69 | */ |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 70 | #define CSUM_COPY_16_BYTES_WITHEX(n) \ |
| 71 | 8 ## n ## 0: \ |
| 72 | lwz r7,4(r4); \ |
| 73 | 8 ## n ## 1: \ |
| 74 | lwz r8,8(r4); \ |
| 75 | 8 ## n ## 2: \ |
| 76 | lwz r9,12(r4); \ |
| 77 | 8 ## n ## 3: \ |
| 78 | lwzu r10,16(r4); \ |
| 79 | 8 ## n ## 4: \ |
| 80 | stw r7,4(r6); \ |
| 81 | adde r12,r12,r7; \ |
| 82 | 8 ## n ## 5: \ |
| 83 | stw r8,8(r6); \ |
| 84 | adde r12,r12,r8; \ |
| 85 | 8 ## n ## 6: \ |
| 86 | stw r9,12(r6); \ |
| 87 | adde r12,r12,r9; \ |
| 88 | 8 ## n ## 7: \ |
| 89 | stwu r10,16(r6); \ |
| 90 | adde r12,r12,r10 |
| 91 | |
| 92 | #define CSUM_COPY_16_BYTES_EXCODE(n) \ |
| 93 | .section __ex_table,"a"; \ |
| 94 | .align 2; \ |
| 95 | .long 8 ## n ## 0b,src_error; \ |
| 96 | .long 8 ## n ## 1b,src_error; \ |
| 97 | .long 8 ## n ## 2b,src_error; \ |
| 98 | .long 8 ## n ## 3b,src_error; \ |
| 99 | .long 8 ## n ## 4b,dst_error; \ |
| 100 | .long 8 ## n ## 5b,dst_error; \ |
| 101 | .long 8 ## n ## 6b,dst_error; \ |
| 102 | .long 8 ## n ## 7b,dst_error; \ |
| 103 | .text |
| 104 | |
| 105 | .text |
| 106 | .stabs "arch/powerpc/lib/",N_SO,0,0,0f |
| 107 | .stabs "checksum_32.S",N_SO,0,0,0f |
| 108 | 0: |
| 109 | |
| 110 | CACHELINE_BYTES = L1_CACHE_BYTES |
| 111 | LG_CACHELINE_BYTES = L1_CACHE_SHIFT |
| 112 | CACHELINE_MASK = (L1_CACHE_BYTES-1) |
| 113 | |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 114 | _GLOBAL(csum_partial_copy_generic) |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 115 | stwu r1,-16(r1) |
| 116 | stw r7,12(r1) |
| 117 | stw r8,8(r1) |
| 118 | |
| 119 | andi. r0,r4,1 /* is destination address even ? */ |
| 120 | cmplwi cr7,r0,0 |
| 121 | addic r12,r6,0 |
| 122 | addi r6,r4,-4 |
| 123 | neg r0,r4 |
| 124 | addi r4,r3,-4 |
| 125 | andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ |
| 126 | beq 58f |
| 127 | |
| 128 | cmplw 0,r5,r0 /* is this more than total to do? */ |
| 129 | blt 63f /* if not much to do */ |
| 130 | andi. r8,r0,3 /* get it word-aligned first */ |
| 131 | mtctr r8 |
| 132 | beq+ 61f |
| 133 | li r3,0 |
| 134 | 70: lbz r9,4(r4) /* do some bytes */ |
| 135 | addi r4,r4,1 |
| 136 | slwi r3,r3,8 |
| 137 | rlwimi r3,r9,0,24,31 |
| 138 | 71: stb r9,4(r6) |
| 139 | addi r6,r6,1 |
| 140 | bdnz 70b |
| 141 | adde r12,r12,r3 |
| 142 | 61: subf r5,r0,r5 |
| 143 | srwi. r0,r0,2 |
| 144 | mtctr r0 |
| 145 | beq 58f |
| 146 | 72: lwzu r9,4(r4) /* do some words */ |
| 147 | adde r12,r12,r9 |
| 148 | 73: stwu r9,4(r6) |
| 149 | bdnz 72b |
| 150 | |
| 151 | 58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ |
| 152 | clrlwi r5,r5,32-LG_CACHELINE_BYTES |
| 153 | li r11,4 |
| 154 | beq 63f |
| 155 | |
| 156 | /* Here we decide how far ahead to prefetch the source */ |
| 157 | li r3,4 |
| 158 | cmpwi r0,1 |
| 159 | li r7,0 |
| 160 | ble 114f |
| 161 | li r7,1 |
| 162 | #if MAX_COPY_PREFETCH > 1 |
| 163 | /* Heuristically, for large transfers we prefetch |
| 164 | MAX_COPY_PREFETCH cachelines ahead. For small transfers |
| 165 | we prefetch 1 cacheline ahead. */ |
| 166 | cmpwi r0,MAX_COPY_PREFETCH |
| 167 | ble 112f |
| 168 | li r7,MAX_COPY_PREFETCH |
| 169 | 112: mtctr r7 |
| 170 | 111: dcbt r3,r4 |
| 171 | addi r3,r3,CACHELINE_BYTES |
| 172 | bdnz 111b |
| 173 | #else |
| 174 | dcbt r3,r4 |
| 175 | addi r3,r3,CACHELINE_BYTES |
| 176 | #endif /* MAX_COPY_PREFETCH > 1 */ |
| 177 | |
| 178 | 114: subf r8,r7,r0 |
| 179 | mr r0,r7 |
| 180 | mtctr r8 |
| 181 | |
| 182 | 53: dcbt r3,r4 |
| 183 | 54: dcbz r11,r6 |
| 184 | /* the main body of the cacheline loop */ |
| 185 | CSUM_COPY_16_BYTES_WITHEX(0) |
| 186 | #if L1_CACHE_BYTES >= 32 |
| 187 | CSUM_COPY_16_BYTES_WITHEX(1) |
| 188 | #if L1_CACHE_BYTES >= 64 |
| 189 | CSUM_COPY_16_BYTES_WITHEX(2) |
| 190 | CSUM_COPY_16_BYTES_WITHEX(3) |
| 191 | #if L1_CACHE_BYTES >= 128 |
| 192 | CSUM_COPY_16_BYTES_WITHEX(4) |
| 193 | CSUM_COPY_16_BYTES_WITHEX(5) |
| 194 | CSUM_COPY_16_BYTES_WITHEX(6) |
| 195 | CSUM_COPY_16_BYTES_WITHEX(7) |
| 196 | #endif |
| 197 | #endif |
| 198 | #endif |
| 199 | bdnz 53b |
| 200 | cmpwi r0,0 |
| 201 | li r3,4 |
| 202 | li r7,0 |
| 203 | bne 114b |
| 204 | |
| 205 | 63: srwi. r0,r5,2 |
| 206 | mtctr r0 |
| 207 | beq 64f |
| 208 | 30: lwzu r0,4(r4) |
| 209 | adde r12,r12,r0 |
| 210 | 31: stwu r0,4(r6) |
| 211 | bdnz 30b |
| 212 | |
| 213 | 64: andi. r0,r5,2 |
| 214 | beq+ 65f |
| 215 | 40: lhz r0,4(r4) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 216 | addi r4,r4,2 |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 217 | 41: sth r0,4(r6) |
| 218 | adde r12,r12,r0 |
| 219 | addi r6,r6,2 |
| 220 | 65: andi. r0,r5,1 |
| 221 | beq+ 66f |
| 222 | 50: lbz r0,4(r4) |
| 223 | 51: stb r0,4(r6) |
| 224 | slwi r0,r0,8 |
| 225 | adde r12,r12,r0 |
| 226 | 66: addze r3,r12 |
| 227 | addi r1,r1,16 |
| 228 | beqlr+ cr7 |
| 229 | rlwinm r3,r3,8,0,31 /* swap bytes for odd destination */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 230 | blr |
| 231 | |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 232 | /* read fault */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 233 | src_error: |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 234 | lwz r7,12(r1) |
| 235 | addi r1,r1,16 |
| 236 | cmpwi cr0,r7,0 |
| 237 | beqlr |
| 238 | li r0,-EFAULT |
| 239 | stw r0,0(r7) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 240 | blr |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 241 | /* write fault */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 242 | dst_error: |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 243 | lwz r8,8(r1) |
| 244 | addi r1,r1,16 |
| 245 | cmpwi cr0,r8,0 |
| 246 | beqlr |
| 247 | li r0,-EFAULT |
| 248 | stw r0,0(r8) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 249 | blr |
| 250 | |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame^] | 251 | .section __ex_table,"a" |
| 252 | .align 2 |
| 253 | .long 70b,src_error |
| 254 | .long 71b,dst_error |
| 255 | .long 72b,src_error |
| 256 | .long 73b,dst_error |
| 257 | .long 54b,dst_error |
| 258 | .text |
| 259 | |
| 260 | /* |
| 261 | * this stuff handles faults in the cacheline loop and branches to either |
| 262 | * src_error (if in read part) or dst_error (if in write part) |
| 263 | */ |
| 264 | CSUM_COPY_16_BYTES_EXCODE(0) |
| 265 | #if L1_CACHE_BYTES >= 32 |
| 266 | CSUM_COPY_16_BYTES_EXCODE(1) |
| 267 | #if L1_CACHE_BYTES >= 64 |
| 268 | CSUM_COPY_16_BYTES_EXCODE(2) |
| 269 | CSUM_COPY_16_BYTES_EXCODE(3) |
| 270 | #if L1_CACHE_BYTES >= 128 |
| 271 | CSUM_COPY_16_BYTES_EXCODE(4) |
| 272 | CSUM_COPY_16_BYTES_EXCODE(5) |
| 273 | CSUM_COPY_16_BYTES_EXCODE(6) |
| 274 | CSUM_COPY_16_BYTES_EXCODE(7) |
| 275 | #endif |
| 276 | #endif |
| 277 | #endif |
| 278 | |
| 279 | .section __ex_table,"a" |
| 280 | .align 2 |
| 281 | .long 30b,src_error |
| 282 | .long 31b,dst_error |
| 283 | .long 40b,src_error |
| 284 | .long 41b,dst_error |
| 285 | .long 50b,src_error |
| 286 | .long 51b,dst_error |