blob: c657de59abca8c8dfd48417e12ecb49c3fd75c53 [file] [log] [blame]
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
Paul Mackerras14cf11a2005-09-26 16:04:21 +100011#include <asm/processor.h>
12#include <asm/cache.h>
13#include <asm/errno.h>
14#include <asm/ppc_asm.h>
15
/*
 * Copy 16 bytes, as four words, from 4(r4)..19(r4) to 4(r6)..19(r6).
 * The last load and the last store use the update form, so both r4 and
 * r6 end up advanced by 16.  r7-r10 are used as scratch.
 */
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
25
/*
 * Same 16-byte copy as COPY_16_BYTES, but each instruction gets its own
 * numeric label so that COPY_16_BYTES_EXCODE(n) can name it in
 * __ex_table.  For a given n the labels are 8<n>0 .. 8<n>7:
 * 8<n>0-8<n>3 are the four loads, 8<n>4-8<n>7 are the four stores.
 */
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)
43
/*
 * Fault fixup code and __ex_table entries for COPY_16_BYTES_WITHEX(n).
 *
 * 9<n>0 is entered for a fault on one of the loads (8<n>0-8<n>3) and
 * 9<n>1 for a fault on one of the stores (8<n>4-8<n>7).  Both subtract
 * 16 * n from r5 and then branch to the common read-fault (104) or
 * write-fault (105) handler, which must be defined after the expansion.
 */
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
62
63 .text
64 .stabs "arch/powerpc/lib/",N_SO,0,0,0f
65 .stabs "copy32.S",N_SO,0,0,0f
660:
67
Stephen Rothwell7dffb722005-10-17 11:50:32 +100068CACHELINE_BYTES = L1_CACHE_BYTES
69LG_CACHELINE_BYTES = L1_CACHE_SHIFT
70CACHELINE_MASK = (L1_CACHE_BYTES-1)
Paul Mackerras14cf11a2005-09-26 16:04:21 +100071
/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * In:   r3 = destination, r4 = number of bytes to clear
 * Uses: r0, r4-r9, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(cacheable_memzero)
	mr	r5,r4			/* r5 = byte count */
	li	r4,0			/* r4 = the value to store */
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f			/* fewer than 4 bytes: byte loop only */
	stwu	r4,4(r6)		/* first word, at the original r3 */
	beqlr				/* exactly 4 bytes (cr0 from cmplwi) */
	andi.	r0,r6,3			/* round r6 down to a word boundary... */
	add	r5,r0,r5		/* ...and grow the count to match */
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 in its line */
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1		/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2			/* words left in the current line */
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
	/*
	 * dcbz is not used on 8xx: store a full line of zero words instead.
	 * The number of stores is keyed on L1_CACHE_BYTES so that it always
	 * matches the CACHELINE_BYTES stride applied to r6 below.
	 */
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
#if L1_CACHE_BYTES >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* L1_CACHE_BYTES */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)		/* remaining whole words */
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3			/* bias r6 so that stbu 1(r6) hits the next byte */
8:	stbu	r4,1(r6)		/* trailing bytes */
	bdnz	8b
	blr
132
/*
 * memset: fill r5 bytes at r3 with the low byte of r4.
 *
 * In:   r3 = destination, r4 = fill value (low byte), r5 = byte count
 * Uses: r0, r4-r6, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23		/* replicate the byte into the low halfword */
	rlwimi	r4,r4,16,0,15		/* ...and the halfword into the full word */
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f			/* fewer than 4 bytes: byte loop only */
	stwu	r4,4(r6)		/* first word, at the original r3 */
	beqlr				/* exactly 4 bytes (cr0 from cmplwi) */
	andi.	r0,r6,3			/* round r6 down to a word boundary... */
	add	r5,r0,r5		/* ...and grow the count to match */
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)		/* whole words */
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3			/* bias r6 so that stbu 1(r6) hits the next byte */
8:	stbu	r4,1(r6)		/* trailing bytes */
	bdnz	8b
	blr
157
/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Uses: r0, r4-r11, ctr, cr0, cr1.  r3 is not modified.
 * If the two regions overlap, control is passed to memcpy instead.
 */
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
	/* one destination cache line per iteration */
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6
#endif
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr
238
/*
 * memmove: copy r5 bytes from r4 to r3.
 * If the destination address is above the source (unsigned compare)
 * the copy is done from the end by backwards_memcpy; otherwise
 * execution falls through into memcpy, which must directly follow.
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */
243
/*
 * memcpy: copy r5 bytes from r4 to r3, working upwards in memory.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Uses: r0, r4-r8, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(memcpy)
	srwi.	r7,r5,3			/* r7 = number of 8-byte chunks */
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)		/* 8 bytes per iteration */
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)		/* one more whole word */
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3			/* bias pointers for the lbzu/stbu 1(rX) loop */
	addi	r6,r6,3
4:	lbzu	r0,1(r4)		/* trailing bytes */
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4			/* r0 = bytes needed to word-align dest */
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b
284
/*
 * backwards_memcpy: copy r5 bytes from r4 to r3, starting at the end of
 * both regions and working downwards in memory.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Uses: r0, r4-r8, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5		/* r6 = one past the end of dest */
	add	r4,r4,r5		/* r4 = one past the end of source */
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* is the end of dest word aligned? */
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)		/* 8 bytes per iteration */
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)		/* one more whole word */
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)		/* leading bytes */
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0			/* r0 = bytes above the last word boundary */
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b
320
/*
 * __copy_tofrom_user: copy r5 bytes from r4 to r3, with every load and
 * store (and the dcbz) listed in __ex_table so that a fault is handled
 * by the fixup code at the end of this routine.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Out:  r3 = 0 if everything was copied, otherwise the number of bytes
 *       that were not copied.  After a read fault the copy is retried
 *       one byte at a time and whatever still cannot be read is zeroed
 *       in the destination.
 * Uses: r0, r4-r11, ctr, cr0
 *
 * Numeric labels 111, 112 and 114 are each defined twice in this
 * routine; every reference uses the nearest definition in the stated
 * direction, so the order of the code below must be preserved.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0			/* the fault fixup at 92: adds ctr to r0 */
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * r7 = lines already prefetched.  Copy r0 - r7 lines now and keep
	 * r7 in r0 as the number of lines left for the second pass.
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0			/* any lines left for a second pass? */
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0			/* success: nothing left uncopied */
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8		/* lines left = ctr + lines held back in r0 */
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
542 .text