Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 1 | /* |
| 2 | * This file contains assembly-language implementations |
| 3 | * of IP-style 1's complement checksum routines. |
| 4 | * |
| 5 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) |
| 6 | * |
| 7 | * This program is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU General Public License |
| 9 | * as published by the Free Software Foundation; either version |
| 10 | * 2 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). |
| 13 | */ |
| 14 | |
| 15 | #include <linux/sys.h> |
| 16 | #include <asm/processor.h> |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 17 | #include <asm/cache.h> |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 18 | #include <asm/errno.h> |
| 19 | #include <asm/ppc_asm.h> |
Al Viro | 9445aa1 | 2016-01-13 23:33:46 -0500 | [diff] [blame] | 20 | #include <asm/export.h> |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 21 | |
| 22 | .text |
| 23 | |
| 24 | /* |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 25 | * computes the checksum of a memory block at buff, length len, |
| 26 | * and adds in "sum" (32-bit) |
| 27 | * |
Christophe Leroy | 7e39322 | 2016-03-07 18:44:37 +0100 | [diff] [blame] | 28 | * __csum_partial(buff, len, sum) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 29 | */ |
/*
 * Register usage, following the argument order in the header comment
 * (assumes the usual convention of passing the first arguments in r3, r4, r5
 * -- TODO confirm against the C prototype):
 *   r3 = buff (biased by -4 below), r4 = len, r5 = running 32-bit sum.
 * The folded result is left in r3 by the final addze.
 *
 * The sum is accumulated with adde, so the carry out of each add is fed into
 * the next one; do not insert carry-modifying instructions into this sequence.
 */
Christophe Leroy | 7e39322 | 2016-03-07 18:44:37 +0100 | [diff] [blame] | 30 | _GLOBAL(__csum_partial) |
/* Bias the pointer by -4 so every access below can use a 4(r3)-style offset. */
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 31 | subi r3,r3,4 |
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 32 | srawi. r6,r4,2 /* Divide len by 4 and also clear carry */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 33 | beq 3f /* if we're doing < 4 bytes */ |
/* If bit 1 of the address is set, consume one halfword first and recompute the word count in r6. */
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 34 | andi. r0,r3,2 /* Align buffer to longword boundary */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 35 | beq+ 1f |
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 36 | lhz r0,4(r3) /* do 2 bytes to get aligned */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 37 | subi r4,r4,2 |
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 38 | addi r3,r3,2 |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 39 | srwi. r6,r4,2 /* # words to do */ |
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 40 | adde r5,r5,r0 |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 41 | beq 3f |
/* 1: (word count mod 4) words are summed one at a time in loop 2 so that the rest is a multiple of four words. */
Christophe Leroy | f867d55 | 2015-09-22 16:34:32 +0200 | [diff] [blame] | 42 | 1: andi. r6,r6,3 /* Prepare to handle words 4 by 4 */ |
| 43 | beq 21f |
| 44 | mtctr r6 |
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 45 | 2: lwzu r0,4(r3) |
| 46 | adde r5,r5,r0 |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 47 | bdnz 2b |
/* 21/22: main loop, unrolled to four words (16 bytes) per iteration; r6 = len >> 4 iterations. */
Christophe Leroy | f867d55 | 2015-09-22 16:34:32 +0200 | [diff] [blame] | 48 | 21: srwi. r6,r4,4 /* # blocks of 4 words to do */ |
| 49 | beq 3f |
| 50 | mtctr r6 |
| 51 | 22: lwz r0,4(r3) |
| 52 | lwz r6,8(r3) |
| 53 | lwz r7,12(r3) |
| 54 | lwzu r8,16(r3) |
| 55 | adde r5,r5,r0 |
| 56 | adde r5,r5,r6 |
| 57 | adde r5,r5,r7 |
| 58 | adde r5,r5,r8 |
| 59 | bdnz 22b |
/* 3: tail handling -- one trailing halfword if bit 1 of len is set. */
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 60 | 3: andi. r0,r4,2 |
| 61 | beq+ 4f |
| 62 | lhz r0,4(r3) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 63 | addi r3,r3,2 |
Christophe Leroy | 48821a3 | 2015-09-22 16:34:29 +0200 | [diff] [blame] | 64 | adde r5,r5,r0 |
/* 4: one trailing byte if bit 0 of len is set; it is shifted left by 8 before being added. */
| 65 | 4: andi. r0,r4,1 |
| 66 | beq+ 5f |
| 67 | lbz r0,4(r3) |
| 68 | slwi r0,r0,8 /* Upper byte of word */ |
| 69 | adde r5,r5,r0 |
/* 5: fold the last pending carry into the sum and return it in r3. */
| 70 | 5: addze r3,r5 /* add in final carry */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 71 | blr |
Al Viro | 9445aa1 | 2016-01-13 23:33:46 -0500 | [diff] [blame] | 72 | EXPORT_SYMBOL(__csum_partial) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 73 | |
| 74 | /* |
| 75 | * Computes the checksum of a memory block at src, length len, |
| 76 | * and adds in "sum" (32-bit), while copying the block to dst. |
| 77 | * If an access exception occurs on src or dst, it stores -EFAULT |
| 78 | * to *src_err or *dst_err respectively (when that pointer is |
| 79 | * non-NULL) and returns; this routine does not itself zero the rest of dst. |
| 80 | * |
| 81 | * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) |
| 82 | */ |
/*
 * CSUM_COPY_16_BYTES_WITHEX(n): copy and checksum one 16-byte group.
 *
 * Loads four words from 4..16(r4) into r7-r10 and stores them to 4..16(r6);
 * the last load and the last store use the update forms, so r4 and r6 each
 * advance by 16.  Every stored word is added into r12 with adde, so the carry
 * chains from one add to the next.
 *
 * Each load and store gets a numeric label 8<n>0 .. 8<n>7 (0-3 are the loads,
 * 4-7 the stores) so that CSUM_COPY_16_BYTES_EXCODE(n) can refer to them.
 */
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 83 | #define CSUM_COPY_16_BYTES_WITHEX(n) \ |
| 84 | 8 ## n ## 0: \ |
| 85 | lwz r7,4(r4); \ |
| 86 | 8 ## n ## 1: \ |
| 87 | lwz r8,8(r4); \ |
| 88 | 8 ## n ## 2: \ |
| 89 | lwz r9,12(r4); \ |
| 90 | 8 ## n ## 3: \ |
| 91 | lwzu r10,16(r4); \ |
| 92 | 8 ## n ## 4: \ |
| 93 | stw r7,4(r6); \ |
| 94 | adde r12,r12,r7; \ |
| 95 | 8 ## n ## 5: \ |
| 96 | stw r8,8(r6); \ |
| 97 | adde r12,r12,r8; \ |
| 98 | 8 ## n ## 6: \ |
| 99 | stw r9,12(r6); \ |
| 100 | adde r12,r12,r9; \ |
| 101 | 8 ## n ## 7: \ |
| 102 | stwu r10,16(r6); \ |
| 103 | adde r12,r12,r10 |
| 104 | |
/*
 * CSUM_COPY_16_BYTES_EXCODE(n): exception-table entries for the matching
 * CSUM_COPY_16_BYTES_WITHEX(n) expansion.  Labels 8<n>0-8<n>3 (the loads)
 * are paired with src_error, labels 8<n>4-8<n>7 (the stores) with dst_error.
 * The "b" suffix means the labels are looked up backwards, so this macro
 * must be expanded after the corresponding WITHEX(n).
 * NOTE(review): EX_TABLE itself is defined outside this file.
 */
| 105 | #define CSUM_COPY_16_BYTES_EXCODE(n) \ |
Nicholas Piggin | 24bfa6a | 2016-10-13 16:42:53 +1100 | [diff] [blame] | 106 | EX_TABLE(8 ## n ## 0b, src_error); \ |
| 107 | EX_TABLE(8 ## n ## 1b, src_error); \ |
| 108 | EX_TABLE(8 ## n ## 2b, src_error); \ |
| 109 | EX_TABLE(8 ## n ## 3b, src_error); \ |
| 110 | EX_TABLE(8 ## n ## 4b, dst_error); \ |
| 111 | EX_TABLE(8 ## n ## 5b, dst_error); \ |
| 112 | EX_TABLE(8 ## n ## 6b, dst_error); \ |
| 113 | EX_TABLE(8 ## n ## 7b, dst_error); |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 114 | |
/* N_SO stabs entries carrying this file's directory and name, followed by the label 0 they reference. */
| 115 | .text |
| 116 | .stabs "arch/powerpc/lib/",N_SO,0,0,0f |
| 117 | .stabs "checksum_32.S",N_SO,0,0,0f |
| 118 | 0: |
| 119 | |
/*
 * Cache-line geometry used by csum_partial_copy_generic, derived from the
 * L1_CACHE_BYTES / L1_CACHE_SHIFT definitions: line size in bytes, log2 of
 * the line size, and the mask selecting the offset within a line.
 */
| 120 | CACHELINE_BYTES = L1_CACHE_BYTES |
| 121 | LG_CACHELINE_BYTES = L1_CACHE_SHIFT |
| 122 | CACHELINE_MASK = (L1_CACHE_BYTES-1) |
| 123 | |
/*
 * Register usage, following the argument order in the header comment
 * (assumes arguments arrive in r3..r8 -- TODO confirm against the prototype):
 *   r3 = src, r4 = dst, r5 = len, r6 = sum, r7 = src_err, r8 = dst_err.
 * After the prologue: r4 = src - 4, r6 = dst - 4 (so 4(rX) offsets address
 * the next bytes), r12 = running sum, r5 = bytes still to copy.
 * r7 and r8 are saved in a 16-byte stack frame and reloaded by the
 * src_error / dst_error handlers.
 *
 * The sum is accumulated with adde, so the carry out of each add is fed into
 * the next one; do not insert carry-modifying instructions into the sequence.
 */
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 124 | _GLOBAL(csum_partial_copy_generic) |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 125 | stwu r1,-16(r1) |
| 126 | stw r7,12(r1) |
| 127 | stw r8,8(r1) |
| 128 | |
/* addic with 0 copies the initial sum into r12; adding zero produces no carry out, so the adde chain starts with carry clear. */
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 129 | addic r12,r6,0 |
| 130 | addi r6,r4,-4 |
| 131 | neg r0,r4 |
| 132 | addi r4,r3,-4 |
| 133 | andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ |
/* Preset cr7.eq so the final "beqlr+ cr7" returns without the extra rotate unless the odd-destination path below clears it. */
Christophe Leroy | 8540571 | 2016-08-26 16:45:13 +0200 | [diff] [blame] | 134 | crset 4*cr7+eq |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 135 | beq 58f |
| 136 | |
| 137 | cmplw 0,r5,r0 /* is this more than total to do? */ |
| 138 | blt 63f /* if not much to do */ |
/* r7 = 8 if the (biased) destination address is odd, else 0; the sum is rotated left by r7 bits and cr7 records whether r7 is zero. */
Christophe Leroy | 8540571 | 2016-08-26 16:45:13 +0200 | [diff] [blame] | 139 | rlwinm r7,r6,3,0x8 |
| 140 | rlwnm r12,r12,r7,0,31 /* odd destination address: rotate one byte */ |
| 141 | cmplwi cr7,r7,0 /* is destination address even ? */ |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 142 | andi. r8,r0,3 /* get it word-aligned first */ |
| 143 | mtctr r8 |
| 144 | beq+ 61f |
/* Byte loop: copy r8 bytes, packing them into r3 (shift left 8, insert the new byte in the low 8 bits); r3 is added to the sum after the loop. */
| 145 | li r3,0 |
| 146 | 70: lbz r9,4(r4) /* do some bytes */ |
| 147 | addi r4,r4,1 |
| 148 | slwi r3,r3,8 |
| 149 | rlwimi r3,r9,0,24,31 |
| 150 | 71: stb r9,4(r6) |
| 151 | addi r6,r6,1 |
| 152 | bdnz 70b |
| 153 | adde r12,r12,r3 |
/* 61: subtract the bytes up to the cache-line boundary from the remaining length, then copy the whole words among them. */
| 154 | 61: subf r5,r0,r5 |
| 155 | srwi. r0,r0,2 |
| 156 | mtctr r0 |
| 157 | beq 58f |
| 158 | 72: lwzu r9,4(r4) /* do some words */ |
| 159 | adde r12,r12,r9 |
| 160 | 73: stwu r9,4(r6) |
| 161 | bdnz 72b |
| 162 | |
/* 58: r0 = number of complete cache lines, r5 = leftover bytes; r11 = 4 is the dcbz offset that compensates for the -4 bias of r6. */
| 163 | 58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ |
| 164 | clrlwi r5,r5,32-LG_CACHELINE_BYTES |
| 165 | li r11,4 |
| 166 | beq 63f |
| 167 | |
| 168 | /* Here we decide how far ahead to prefetch the source */ |
/* r3 is reused as the dcbt offset from r4; r7 = number of lines to prefetch ahead (0 when there is at most one line). */
| 169 | li r3,4 |
| 170 | cmpwi r0,1 |
| 171 | li r7,0 |
| 172 | ble 114f |
| 173 | li r7,1 |
| 174 | #if MAX_COPY_PREFETCH > 1 |
| 175 | /* Heuristically, for large transfers we prefetch |
| 176 | MAX_COPY_PREFETCH cachelines ahead. For small transfers |
| 177 | we prefetch 1 cacheline ahead. */ |
| 178 | cmpwi r0,MAX_COPY_PREFETCH |
| 179 | ble 112f |
| 180 | li r7,MAX_COPY_PREFETCH |
| 181 | 112: mtctr r7 |
| 182 | 111: dcbt r3,r4 |
| 183 | addi r3,r3,CACHELINE_BYTES |
| 184 | bdnz 111b |
| 185 | #else |
| 186 | dcbt r3,r4 |
| 187 | addi r3,r3,CACHELINE_BYTES |
| 188 | #endif /* MAX_COPY_PREFETCH > 1 */ |
| 189 | |
/*
 * 114: run the line loop (r0 - r7) times, leaving r0 = r7 lines for later.
 * When that count is non-zero after the loop, the code comes back here with
 * r3 = 4 and r7 = 0, so the second pass copies the remaining lines with the
 * dcbt offset reset to 4 instead of pointing further ahead.
 */
| 190 | 114: subf r8,r7,r0 |
| 191 | mr r0,r7 |
| 192 | mtctr r8 |
| 193 | |
/* 53/54: dcbt on the source at offset r3 from r4, dcbz on the destination line, then one cache line copied in 16-byte groups. */
| 194 | 53: dcbt r3,r4 |
| 195 | 54: dcbz r11,r6 |
| 196 | /* the main body of the cacheline loop */ |
| 197 | CSUM_COPY_16_BYTES_WITHEX(0) |
| 198 | #if L1_CACHE_BYTES >= 32 |
| 199 | CSUM_COPY_16_BYTES_WITHEX(1) |
| 200 | #if L1_CACHE_BYTES >= 64 |
| 201 | CSUM_COPY_16_BYTES_WITHEX(2) |
| 202 | CSUM_COPY_16_BYTES_WITHEX(3) |
| 203 | #if L1_CACHE_BYTES >= 128 |
| 204 | CSUM_COPY_16_BYTES_WITHEX(4) |
| 205 | CSUM_COPY_16_BYTES_WITHEX(5) |
| 206 | CSUM_COPY_16_BYTES_WITHEX(6) |
| 207 | CSUM_COPY_16_BYTES_WITHEX(7) |
| 208 | #endif |
| 209 | #endif |
| 210 | #endif |
| 211 | bdnz 53b |
| 212 | cmpwi r0,0 |
| 213 | li r3,4 |
| 214 | li r7,0 |
| 215 | bne 114b |
| 216 | |
/* 63: copy the remaining whole words (also the entry point when len is smaller than the distance to the cache-line boundary). */
| 217 | 63: srwi. r0,r5,2 |
| 218 | mtctr r0 |
| 219 | beq 64f |
| 220 | 30: lwzu r0,4(r4) |
| 221 | adde r12,r12,r0 |
| 222 | 31: stwu r0,4(r6) |
| 223 | bdnz 30b |
| 224 | |
/* 64: one trailing halfword if bit 1 of the remaining length is set. */
| 225 | 64: andi. r0,r5,2 |
| 226 | beq+ 65f |
| 227 | 40: lhz r0,4(r4) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 228 | addi r4,r4,2 |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 229 | 41: sth r0,4(r6) |
| 230 | adde r12,r12,r0 |
| 231 | addi r6,r6,2 |
/* 65: one trailing byte if bit 0 is set; it is stored as-is and shifted left by 8 before being added to the sum. */
| 232 | 65: andi. r0,r5,1 |
| 233 | beq+ 66f |
| 234 | 50: lbz r0,4(r4) |
| 235 | 51: stb r0,4(r6) |
| 236 | slwi r0,r0,8 |
| 237 | adde r12,r12,r0 |
/* 66: fold the last pending carry into r3, pop the stack frame, and return; if cr7.eq is clear (odd destination path taken) rotate the result left by 8 bits first. */
| 238 | 66: addze r3,r12 |
| 239 | addi r1,r1,16 |
| 240 | beqlr+ cr7 |
Christophe Leroy | 1bc8b81 | 2016-08-02 10:07:05 +0200 | [diff] [blame] | 241 | rlwinm r3,r3,8,0,31 /* odd destination address: rotate one byte */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 242 | blr |
| 243 | |
/*
 * Fault handlers for csum_partial_copy_generic, reached through the
 * exception-table entries below.  Each one reloads the error pointer that the
 * prologue saved on the stack (r7 at 12(r1) for source faults, r8 at 8(r1)
 * for destination faults), pops the 16-byte frame, and returns.  -EFAULT is
 * stored through the pointer only when it is non-zero.
 * NOTE(review): neither handler writes r3, so the return value after a fault
 * is whatever r3 held when the fault was taken -- confirm callers ignore it.
 */
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 244 | /* read fault */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 245 | src_error: |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 246 | lwz r7,12(r1) |
| 247 | addi r1,r1,16 |
| 248 | cmpwi cr0,r7,0 |
| 249 | beqlr |
| 250 | li r0,-EFAULT |
| 251 | stw r0,0(r7) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 252 | blr |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 253 | /* write fault */ |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 254 | dst_error: |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 255 | lwz r8,8(r1) |
| 256 | addi r1,r1,16 |
| 257 | cmpwi cr0,r8,0 |
| 258 | beqlr |
| 259 | li r0,-EFAULT |
| 260 | stw r0,0(r8) |
Paul Mackerras | 14cf11a | 2005-09-26 16:04:21 +1000 | [diff] [blame] | 261 | blr |
| 262 | |
/*
 * Exception-table entries for csum_partial_copy_generic.  Each entry pairs a
 * numeric label on a load or store with a handler: loads from the source
 * (70, 72, 30, 40, 50) go to src_error; stores to the destination (71, 73,
 * 31, 41, 51) and the dcbz at 54 go to dst_error.  The labels are referenced
 * backwards ("b"), so these entries must stay after the code they name.
 * NOTE(review): EX_TABLE is defined outside this file.
 */
Nicholas Piggin | 24bfa6a | 2016-10-13 16:42:53 +1100 | [diff] [blame] | 263 | EX_TABLE(70b, src_error); |
| 264 | EX_TABLE(71b, dst_error); |
| 265 | EX_TABLE(72b, src_error); |
| 266 | EX_TABLE(73b, dst_error); |
| 267 | EX_TABLE(54b, dst_error); |
Christophe Leroy | 7aef4136 | 2015-09-22 16:34:27 +0200 | [diff] [blame] | 268 | |
| 269 | /* |
| 270 | * this stuff handles faults in the cacheline loop and branches to either |
| 271 | * src_error (if in read part) or dst_error (if in write part) |
| 272 | */ |
/* The #if nesting here mirrors the CSUM_COPY_16_BYTES_WITHEX expansions in the cacheline loop, one EXCODE(n) per WITHEX(n). */
| 273 | CSUM_COPY_16_BYTES_EXCODE(0) |
| 274 | #if L1_CACHE_BYTES >= 32 |
| 275 | CSUM_COPY_16_BYTES_EXCODE(1) |
| 276 | #if L1_CACHE_BYTES >= 64 |
| 277 | CSUM_COPY_16_BYTES_EXCODE(2) |
| 278 | CSUM_COPY_16_BYTES_EXCODE(3) |
| 279 | #if L1_CACHE_BYTES >= 128 |
| 280 | CSUM_COPY_16_BYTES_EXCODE(4) |
| 281 | CSUM_COPY_16_BYTES_EXCODE(5) |
| 282 | CSUM_COPY_16_BYTES_EXCODE(6) |
| 283 | CSUM_COPY_16_BYTES_EXCODE(7) |
| 284 | #endif |
| 285 | #endif |
| 286 | #endif |
| 287 | |
/* Entries for the word, halfword and byte tail copies that follow the cacheline loop. */
Nicholas Piggin | 24bfa6a | 2016-10-13 16:42:53 +1100 | [diff] [blame] | 288 | EX_TABLE(30b, src_error); |
| 289 | EX_TABLE(31b, dst_error); |
| 290 | EX_TABLE(40b, src_error); |
| 291 | EX_TABLE(41b, dst_error); |
| 292 | EX_TABLE(50b, src_error); |
| 293 | EX_TABLE(51b, dst_error); |
| 294 | |
Al Viro | 9445aa1 | 2016-01-13 23:33:46 -0500 | [diff] [blame] | 295 | EXPORT_SYMBOL(csum_partial_copy_generic) |