/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

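/*
 * Copy 16 bytes through r7-r10.  The copy loops keep r4 and r6
 * pointing 4 bytes below the next source/destination word, so the
 * loads/stores use offsets 4..16 and the trailing lwzu/stwu advance
 * both pointers by 16 for the next expansion.
 */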
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

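/*
 * As above, but each load and store carries a numeric label
 * 8<n>0 - 8<n>7 so that COPY_16_BYTES_EXCODE(n) can emit matching
 * exception-table entries for the user-copy path.
 */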
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

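/*
 * Fixup code for COPY_16_BYTES_WITHEX(n): a fault on one of the four
 * loads lands at 9<n>0 and continues at the read-fault path (104f);
 * a fault on one of the four stores lands at 9<n>1 and continues at
 * the write-fault path (105f).  Both first credit r5 with the 16 * n
 * bytes already completed earlier in the faulting cache line.
 */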
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
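/*
 * memset: r3 = dest, r4 = fill byte, r5 = count; r3 is never touched,
 * so dest is returned unchanged.  The two rlwimi instructions below
 * replicate the low byte of r4 into all four bytes of the word.
 */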
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
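/*
 * memmove: r3 = dest, r4 = src, r5 = count.  When dest > src an
 * overlapping forward copy would corrupt the tail of the source,
 * so copy backwards; otherwise fall straight through into memcpy.
 */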
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	add	r7,r3,r5	/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4		/* cr0.lt &= cr1.lt */
	blt	generic_memcpy	/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0		/* is this more than total to do? */
	blt	63f		/* if not much to do */
	andi.	r8,r0,3		/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)	/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)	/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr

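/*
 * generic_memcpy: plain forward copy, two words per loop iteration,
 * no dcbz.  Used when the source and destination regions overlap,
 * where dcbz on the destination could clear source bytes that have
 * not been copied yet.
 */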
_GLOBAL(generic_memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f	/* if less than 8 bytes to do */
	andi.	r0,r6,3	/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

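/*
 * backwards_memcpy: same structure as generic_memcpy, but starts at
 * the end of each buffer and runs towards the start, so it is safe
 * when dest > src and the regions overlap.
 */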
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

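/*
 * __copy_tofrom_user: r3 = dest, r4 = src, r5 = count.  Same cacheline
 * strategy as memcpy, but every load, store and dcbz is covered by an
 * __ex_table entry, and the return value in r3 is the number of bytes
 * NOT copied (0 on success).
 */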
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0		/* is this more than total to do? */
	blt	63f		/* if not much to do */
	andi.	r8,r0,3		/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)	/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)	/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

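/*
 * Fault fixups.  Each handler records the fault direction in r9
 * (0 = read fault, 1 = write fault) and puts in r3 the log2 of the
 * granule the interrupted loop was copying (0 = bytes, 2 = words,
 * LG_CACHELINE_BYTES = whole lines), so the common code at 99/106
 * can compute the bytes still uncopied as r5 + (ctr << r3).
 */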
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f		/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text