Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 1 | /* |
| 2 | * This file contains assembly-language implementations |
| 3 | * of IP-style 1's complement checksum routines. |
| 4 | * |
| 5 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) |
| 6 | * |
| 7 | * This program is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU General Public License |
| 9 | * as published by the Free Software Foundation; either version |
| 10 | * 2 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). |
| 13 | */ |
| 14 | |
| 15 | #include <linux/sys.h> |
| 16 | #include <asm/processor.h> |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 17 | #include <asm/cache.h> |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 18 | #include <asm/errno.h> |
| 19 | #include <asm/ppc_asm.h> |
| 20 | |
| 21 | .text |
| 22 | |
/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 *
 * In:  r3 = buff, r4 = len (bytes), r5 = sum
 * Out: r3 = 32-bit partial checksum (sum plus the block)
 *
 * The word loops accumulate with adde, so XER[CA] must be clear when
 * the chain starts (srawi. below guarantees that) and the last carry
 * is folded back in with addze at label 5.
 */
_GLOBAL(__csum_partial)
	subi	r3,r3,4		/* bias pointer: all accesses use the 4(r3)/lwzu update forms */
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f		/* word count already a multiple of 4 */
	mtctr	r6
2:	lwzu	r0,4(r3)	/* odd leading words, one at a time */
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)	/* unrolled: checksum 4 words per iteration */
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)	/* lwzu advances r3 by the full 16 bytes */
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2		/* trailing halfword? */
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1		/* trailing byte? */
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word (big-endian placement) */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
| 71 | |
| 72 | /* |
| 73 | * Computes the checksum of a memory block at src, length len, |
| 74 | * and adds in "sum" (32-bit), while copying the block to dst. |
| 75 | * If an access exception occurs on src or dst, it stores -EFAULT |
| 76 | * to *src_err or *dst_err respectively, and (for an error on |
| 77 | * src) zeroes the rest of dst. |
| 78 | * |
| 79 | * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) |
| 80 | */ |
/*
 * CSUM_COPY_16_BYTES_WITHEX(n): copy-and-checksum 16 bytes (4 words)
 * from 4(r4) to 4(r6).  The lwzu/stwu forms advance both pointers by
 * 16 bytes per expansion, and each word is folded into the running
 * checksum in r12 with adde (carry-chained).
 * Every faulting access gets a numeric label 8<n>0..8<n>7 so that the
 * matching CSUM_COPY_16_BYTES_EXCODE(n) expansion can attach
 * exception-table entries for it.
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:				\
	lwz	r7,4(r4);		\
8 ## n ## 1:				\
	lwz	r8,8(r4);		\
8 ## n ## 2:				\
	lwz	r9,12(r4);		\
8 ## n ## 3:				\
	lwzu	r10,16(r4);		\
8 ## n ## 4:				\
	stw	r7,4(r6);		\
	adde	r12,r12,r7;		\
8 ## n ## 5:				\
	stw	r8,8(r6);		\
	adde	r12,r12,r8;		\
8 ## n ## 6:				\
	stw	r9,12(r6);		\
	adde	r12,r12,r9;		\
8 ## n ## 7:				\
	stwu	r10,16(r6);		\
	adde	r12,r12,r10
| 102 | |
/*
 * CSUM_COPY_16_BYTES_EXCODE(n): exception-table entries for the labels
 * created by CSUM_COPY_16_BYTES_WITHEX(n).  The four loads (8<n>0b..
 * 8<n>3b) recover through src_error, the four stores (8<n>4b..8<n>7b)
 * through dst_error.  Ends with .text to return to the code section.
 */
#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,src_error;		\
	.long	8 ## n ## 1b,src_error;		\
	.long	8 ## n ## 2b,src_error;		\
	.long	8 ## n ## 3b,src_error;		\
	.long	8 ## n ## 4b,dst_error;		\
	.long	8 ## n ## 5b,dst_error;		\
	.long	8 ## n ## 6b,dst_error;		\
	.long	8 ## n ## 7b,dst_error;		\
	.text
| 115 | |
	.text
	/* stabs debug entries naming this source file; 0: is the anchor */
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

/* cache geometry, derived from asm/cache.h */
CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
| 124 | |
/*
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 *
 * In:  r3 = src, r4 = dst, r5 = len (bytes), r6 = sum,
 *      r7 = src_err pointer (or NULL), r8 = dst_err pointer (or NULL)
 * Out: r3 = checksum; on a fault, src_error/dst_error below store
 *      -EFAULT through the saved error pointer.
 *
 * r12 accumulates the checksum while the data is copied.  The bulk of
 * the copy goes cacheline by cacheline, using dcbz to establish each
 * destination line and dcbt to prefetch the source ahead.
 */
_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)	/* small frame to keep the error pointers */
	stw	r7,12(r1)	/* src_err: reloaded by src_error */
	stw	r8,8(r1)	/* dst_err: reloaded by dst_error */

	andi.	r0,r4,1		/* is destination address even ? */
	cmplwi	cr7,r0,0	/* cr7 remembers it for the final byte swap */
	addic	r12,r6,0	/* r12 = sum, and addic clears XER[CA] for the adde chain */
	addi	r6,r4,-4	/* bias dst for the stwu/4(r6) forms */
	neg	r0,r4		/* r0 = -dst (before r4 is reused for src) */
	addi	r4,r3,-4	/* bias src for the lwzu/4(r4) forms */
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f		/* dst already cacheline aligned */

	cmplw	0,r5,r0		/* is this more than total to do? */
	blt	63f		/* if not much to do */
	andi.	r8,r0,3		/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0		/* gather up to 3 leading bytes here */
70:	lbz	r9,4(r4)	/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31	/* r3 = (r3 << 8) | byte */
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3	/* fold the gathered bytes into the sum */
61:	subf	r5,r0,r5	/* account for the alignment prologue */
	srwi.	r0,r0,2		/* # words up to the first full cacheline */
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)	/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES /* r5 = leftover bytes after them */
	li	r11,4		/* dcbz offset: r6 is biased by -4 */
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4		/* r3 = dcbt offset into the source */
	cmpwi	r0,1
	li	r7,0		/* r7 = prefetch distance, in cachelines */
	ble	114f		/* a single line: no prefetch */
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4		/* warm the first r7 lines of source */
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0	/* copy (lines - distance) now, prefetching ahead */
	mr	r0,r7		/* r0 = lines deferred to the second pass */
	mtctr	r8

53:	dcbt	r3,r4		/* prefetch source r3 bytes ahead */
54:	dcbz	r11,r6		/* zero the dst line before overwriting it;
				   a fault here goes to dst_error (ex_table) */
	/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0		/* second pass: copy the last r0 (already-prefetched) lines */
	li	r3,4
	li	r7,0		/* distance 0 so 114 consumes all remaining lines */
	bne	114b

63:	srwi.	r0,r5,2		/* leftover full words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2		/* leftover halfword */
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1		/* leftover byte */
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8		/* big-endian: place in upper byte of halfword */
	adde	r12,r12,r0
66:	addze	r3,r12		/* add in final carry */
	addi	r1,r1,16	/* pop frame */
	beqlr+	cr7		/* done if the destination was even */
	rlwinm	r3,r3,8,0,31	/* swap bytes for odd destination */
	blr
| 242 | |
/* read fault: reached via __ex_table from any faulting load above */
src_error:
	lwz	r7,12(r1)	/* recover saved src_err pointer */
	addi	r1,r1,16	/* pop frame */
	cmpwi	cr0,r7,0
	beqlr			/* NULL: caller doesn't want the error */
	li	r0,-EFAULT
	stw	r0,0(r7)	/* *src_err = -EFAULT */
	blr
/* NOTE(review): the routine's header comment says a src fault zeroes the
   rest of dst; this handler does not do that — confirm callers (or a
   wrapper) perform the clearing. */
/* write fault: reached via __ex_table from any faulting store/dcbz above */
dst_error:
	lwz	r8,8(r1)	/* recover saved dst_err pointer */
	addi	r1,r1,16	/* pop frame */
	cmpwi	cr0,r8,0
	beqlr			/* NULL: caller doesn't want the error */
	li	r0,-EFAULT
	stw	r0,0(r8)	/* *dst_err = -EFAULT */
	blr
| 261 | |
/*
 * Exception-table entries for the non-macro faulting accesses in
 * csum_partial_copy_generic: loads recover via src_error, stores (and
 * the dcbz at 54) via dst_error.
 */
	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error	/* dcbz on the destination line */
	.text
| 270 | |
/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 * -- one EXCODE expansion per WITHEX expansion, same #if structure.
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* exception entries for the trailing word/halfword/byte copies */
	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error