/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP; \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif
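
/* With the default definitions above these wrappers are transparent.
 * For illustration (editor's note, assuming MEMCPY_DEBUG is not defined):
 *
 *	EX_LD(LOAD(ldx, %o1, %g2))	expands to	ldx	[%o1], %g2
 *	EX_ST(STORE_INIT(%g3, %o0))	expands to	stxa	%g3, [%o0] %asi
 *
 * Other users of this file (for example the user-copy variants) may
 * redefine EX_LD/EX_ST to attach exception-table entries to each access.
 */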

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, GLOBAL_SPARE
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
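	/* Worked example of the bound above (editor's note): with
	 * len >= 2 * 64 and at most 64 - 1 bytes consumed by the
	 * destination-alignment loop below, at least
	 *
	 *	128 - 63 = 65 > 64
	 *
	 * bytes remain, so the 64-byte block loop runs at least once.
	 */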
	cmp		%o2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %o4/%o5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %o1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary. */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
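	/* The dispatch below, in C-like pseudocode (illustrative only;
	 * the numbers are the local labels that follow):
	 *
	 *	if ((src & 15) == 0)		goto 50;  // direct twin loads
	 *	else if ((src & 15) == 8)	goto 10;  // twin loads offset by -8
	 *	else				fall through; // shift-and-mask copy
	 */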
2:
	andcc		%o1, (16 - 1), %o4
	andn		%o2, (64 - 1), %g1	! block copy loop iterator
	sub		%o2, %g1, %o2		! final sub-block copy bytes
	be,pt		%XCC, 50f
	 cmp		%o4, 8
	be,a,pt		%XCC, 10f
	 sub		%o1, 0x8, %o1

	/* Neither 8-byte nor 16-byte aligned, shift and mask. */
	mov		%g1, %o4
	and		%o1, 0x7, %g1
	sll		%g1, 3, %g1
	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	sllx		%g2, %g1, %g2

#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
	EX_LD(LOAD(ldx, SRC, TMP1)); \
	srlx		TMP1, PRE_SHIFT, TMP2; \
	or		TMP2, PRE_VAL, TMP2; \
	EX_ST(STORE_INIT(TMP2, DST)); \
	sllx		TMP1, POST_SHIFT, PRE_VAL;
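
/* What SWIVEL_ONE_DWORD computes, as a C sketch (editor's illustration;
 * 'sh' stands for the source misalignment in bits, (src & 7) * 8, which
 * is non-zero on this path so neither shift count reaches 64):
 *
 *	u64 next = *aligned_src++;		// the ldx load
 *	*dst++   = prev | (next >> (64 - sh));	// big-endian merge, init store
 *	prev     = next << sh;			// carried into the next dword
 */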

1:	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
	add		%o1, 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32 - 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
	subcc		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

#undef SWIVEL_ONE_DWORD

	srl		%g1, 3, %g1
	ba,pt		%XCC, 60f
	 add		%o1, %g1, %o1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned.  8 has already been subtracted from the source
	 * pointer, so we perform one twin load ahead and then add
	 * 8 back into the source when we finish the loop.
	 */
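	/* How the pipelining works here (editor's note): the initial
	 * twin load below reads [src - 8] and [src] into %o4/%o5, so
	 * %o4 is a throwaway dword and %o5 is the first real source
	 * dword.  From then on every store lags its load by one
	 * register, keeping one dword in flight across each 64-byte
	 * block; the trailing "add %o1, 0x8, %o1" after the loop
	 * undoes the initial -8 adjustment.
	 */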
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
1:	add		%o1, 16, %o1
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32, %o1
	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%g2, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%g3, %o0 + 0x10))
	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
	EX_ST(STORE_INIT(%g2, %o0 + 0x28))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	EX_ST(STORE_INIT(%g3, %o0 + 0x30))
	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%o1, 0x8, %o1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
1:	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add		%o1, 16, %o1
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32, %o1
	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%g2, %o0 + 0x10))
	EX_ST(STORE_INIT(%g3, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
	EX_ST(STORE_INIT(%g2, %o0 + 0x30))
	EX_ST(STORE_INIT(%g3, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0
	/* fall through */

60:
	/* %o2 contains any final bytes still to be copied over.
	 * If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%o3)
	brz,pt		%o2, 85f
	 sub		%o0, %o1, %o3
	ba,a,pt		%XCC, 90f

	.align		64
70:	/* 16 <= len < 128 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	EX_LD(LOAD(ldx, %o1, %o5))
	add		%o1, 0x08, %o1
	EX_LD(LOAD(ldx, %o1, %g1))
	sub		%o1, 0x08, %o1
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5))
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	EX_LD(LOAD(ldx, %o1, %g3))
	subcc		%o4, 0x8, %o4
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len < 16 */
	andcc		%o3, 0x3, %g0	! %o3 == dst | src | len
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1))
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(GLOBAL_SPARE), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(GLOBAL_SPARE), %o0

	.size		FUNC_NAME, .-FUNC_NAME