/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
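
/*
 * COPY_16_BYTES moves one 16-byte chunk with four word loads and four
 * word stores.  The trailing lwzu/stwu update r4 and r6 by 16, so each
 * expansion carries on where the previous one stopped (the pointers stay
 * biased 4 bytes below the next word, matching the 4..16 displacements).
 */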

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
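
/*
 * COPY_16_BYTES_WITHEX(n) is the same 16-byte copy with every load and
 * store carrying a numeric label (8n0..8n7).  COPY_16_BYTES_EXCODE(n)
 * supplies the matching fixup code: 9n0 for a faulting load and 9n1 for
 * a faulting store, each crediting the 16*n bytes of the current cache
 * line already copied by earlier expansions (addi r5,r5,-(16*n)) before
 * branching to the common fault handlers at 104/105.  The __ex_table
 * entries pair each possibly-faulting instruction with its fixup.
 */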

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable. -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz.  This jump is
 * replaced by a nop once cache is active.  This is done in machine_init().
 */
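/*
 * A rough C-style sketch of the strategy above (illustration only, not
 * part of the build):
 *
 *	word = fill byte replicated into all 4 bytes;
 *	store words until the destination reaches a cache-line boundary;
 *	if (word == 0 && cache is up)
 *		dcbz each complete cache line;	   (zero a whole line at a time)
 *	else
 *		store the body one word at a time;
 *	store the trailing 0-3 bytes individually.
 */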
_GLOBAL(memset)
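	/* replicate the low byte of r4 (the fill value) into all four bytes */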
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */
_GLOBAL(memset_nocache_branch)
	b	2f	/* Skip optimised block until cache is enabled */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
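	/*
	 * r6 is kept 4 bytes below the next store address (to suit stwu),
	 * so dcbz with an offset of 4 in r7 zeroes the line actually being
	 * filled.
	 */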
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz.  This jump is
 * replaced by a nop once cache is active.  This is done in machine_init().
 */
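/*
 * Shape of the cache-friendly path below: copy odd bytes and words until
 * the destination reaches a cache-line boundary, then for each complete
 * line issue dcbz (so the line is established in the cache without being
 * fetched from memory) followed by COPY_16_BYTES expansions, and finally
 * copy the remaining words and bytes.
 */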
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy
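	/*
	 * Overlap check: cr0.lt = (src < dest + len) and
	 * cr1.lt = (dest < src + len); if both are true the regions
	 * overlap and we fall back to generic_memcpy.
	 */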
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr

_GLOBAL(generic_memcpy)
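	/*
	 * r3 = dest, r4 = src, r5 = count; r3 is preserved and returned.
	 * Plain word-at-a-time copy with no dcbz, so it is also safe to
	 * use before the data cache is enabled.
	 */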
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
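	/*
	 * Copy r5 bytes from r4 to r3 working downwards from the end;
	 * memmove branches here when dest > src, so overlapping regions
	 * are handled correctly.
	 */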
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(__copy_tofrom_user)
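	/*
	 * r3 = dest, r4 = src, r5 = count; returns in r3 the number of
	 * bytes that could not be copied (0 on complete success).
	 */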
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
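
	/*
	 * From here on, r3 is the byte offset (from r4) of the next line
	 * to prefetch and r7 is how many lines have been prefetched ahead.
	 * The loop at 53 copies r0-r7 lines, touching one line ahead each
	 * time, then loops back through 114 with r7 = 0 to copy the
	 * remaining lines that have already been prefetched.
	 */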

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
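/*
 * r3 at this point is log2 of the unit size of the loop that faulted
 * (0 for the byte loops, 2 for the word loops, LG_CACHELINE_BYTES for
 * the cacheline loop), so the slw below converts the remaining loop
 * count into bytes before adding the leftover count in r5.
 */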
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text