/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

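/*
 * COPY_16_BYTES copies 16 bytes per expansion using r7-r10 as scratch
 * registers.  r4 and r6 point 4 bytes below the current source and
 * destination; the final lwzu/stwu pair advances both by 16.
 */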
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

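/*
 * Same as COPY_16_BYTES, but each load and store carries a numbered
 * local label (8<n>0 .. 8<n>7) so that COPY_16_BYTES_EXCODE can emit
 * exception-table entries for every access.
 */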
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

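/*
 * Fixup code for COPY_16_BYTES_WITHEX(n): on a fault, subtract from r5
 * the 16*n bytes already handled by earlier expansions in this cache
 * line, then branch to the common read (104) or write (105) fault
 * handler.  The __ex_table entries map each 8<n>x access to its fixup.
 */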
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore skip the optimised block that uses dcbz.  This jump
 * is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f	/* Use the normal procedure if r4 is not zero */
EXPORT_SYMBOL(memset)
_GLOBAL(memset_nocache_branch)
	b	2f	/* Skip the optimised block until the cache is enabled */

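/*
 * Zero complete cache lines with dcbz: store words up to the first
 * cache-line boundary, dcbz each full line, then fall through to
 * handle whatever is left at the end.
 */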
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

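/*
 * Generic fill: store the replicated word in r4 a word at a time, then
 * finish with byte stores for any remaining tail (fewer than 4 bytes).
 */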
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore jump to generic_memcpy, which doesn't use dcbz.  This
 * jump is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy
	add	r7,r3,r5	/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4		/* cr0.lt &= cr1.lt */
	blt	generic_memcpy	/* if regions overlap */

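/*
 * Copy byte by byte, then word by word, until the destination is
 * aligned on a cache-line boundary.
 */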
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

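/*
 * generic_memcpy: forward copy that never touches dcbz, used while the
 * cache is disabled and when the source and destination overlap.  It
 * copies 8 bytes per iteration once the destination is word-aligned,
 * then finishes off the remaining word and bytes.
 */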
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

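/*
 * backwards_memcpy: same structure as generic_memcpy but copying from
 * the end of the buffers towards the start, so memmove can handle the
 * case where the destination overlaps and lies above the source.
 */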
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

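/*
 * __copy_tofrom_user: same cache-line copy algorithm as memcpy, but
 * every load and store is covered by an __ex_table entry so that a
 * fault can be recovered from.  Returns in r3 the number of bytes NOT
 * copied (0 on success).
 */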
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

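/*
 * Main cache-line loop: prefetch the source with dcbt, clear each
 * destination line with dcbz, then copy it with the labelled
 * COPY_16_BYTES_WITHEX expansions.
 */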
58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

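/*
 * Copy the remaining words, then the remaining bytes (fewer than a
 * cache line in total), and return 0.
 */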
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
EXPORT_SYMBOL(__copy_tofrom_user)