blob: 6813f80d1eec38e64cc02acf7f1ed7ad57b40afd [file] [log] [blame]
Paul Mackerras14cf11a2005-09-26 16:04:21 +10001/*
2 * Memory copy functions for 32-bit PowerPC.
3 *
4 * Copyright (C) 1996-2005 Paul Mackerras.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
Paul Mackerras14cf11a2005-09-26 16:04:21 +100011#include <asm/processor.h>
12#include <asm/cache.h>
13#include <asm/errno.h>
14#include <asm/ppc_asm.h>
15
/*
 * COPY_16_BYTES: copy four consecutive words from the source to the
 * destination.
 *
 * r4 is the source cursor and r6 the destination cursor; the words are
 * read from 4(r4)..16(r4) and written to 4(r6)..16(r6).  The last load
 * and the last store are update forms (lwzu/stwu), so both cursors end
 * up advanced by 16.  All four loads are issued before the first store.
 * Uses r7-r10 as scratch.
 *
 * This variant carries no exception-table labels.  It is not referenced
 * anywhere in the visible part of this file.
 */
16#define COPY_16_BYTES \
17 lwz r7,4(r4); \
18 lwz r8,8(r4); \
19 lwz r9,12(r4); \
20 lwzu r10,16(r4); \
21 stw r7,4(r6); \
22 stw r8,8(r6); \
23 stw r9,12(r6); \
24 stwu r10,16(r6)
25
/*
 * COPY_16_BYTES_WITHEX(n): same 16-byte copy as COPY_16_BYTES (source
 * cursor r4, destination cursor r6, scratch r7-r10, both cursors
 * advanced by 16), but with a numeric local label in front of every
 * instruction so that each one can be listed in __ex_table.
 *
 * The labels are built by token pasting as 8<n>0 .. 8<n>7:
 *   8<n>0 - 8<n>3  the four loads
 *   8<n>4 - 8<n>7  the four stores
 * COPY_16_BYTES_EXCODE(n) refers to exactly these names, so the two
 * macros must be kept in step and instantiated with the same n.
 */
26#define COPY_16_BYTES_WITHEX(n) \
278 ## n ## 0: \
28 lwz r7,4(r4); \
298 ## n ## 1: \
30 lwz r8,8(r4); \
318 ## n ## 2: \
32 lwz r9,12(r4); \
338 ## n ## 3: \
34 lwzu r10,16(r4); \
358 ## n ## 4: \
36 stw r7,4(r6); \
378 ## n ## 5: \
38 stw r8,8(r6); \
398 ## n ## 6: \
40 stw r9,12(r6); \
418 ## n ## 7: \
42 stwu r10,16(r6)
43
/*
 * COPY_16_BYTES_EXCODE(n): fault fixups for COPY_16_BYTES_WITHEX(n).
 *
 * Emits two fixup stubs and eight __ex_table entries:
 *   9<n>0  target for a fault on any of the loads  (8<n>0 - 8<n>3);
 *          continues at 104 (read fault in cacheline loop)
 *   9<n>1  target for a fault on any of the stores (8<n>4 - 8<n>7);
 *          continues at 105 (write fault in cacheline loop)
 *
 * Both stubs first subtract 16 * n from r5.  The code at 104/105 counts
 * the cache line being copied as completely uncopied, so this removes
 * the n 16-byte groups of that line that were already transferred.
 * A group in which the fault occurred is counted as wholly uncopied.
 *
 * The macro switches to the __ex_table section for the table entries
 * and ends by switching back to .text.
 */
44#define COPY_16_BYTES_EXCODE(n) \
459 ## n ## 0: \
46 addi r5,r5,-(16 * n); \
47 b 104f; \
489 ## n ## 1: \
49 addi r5,r5,-(16 * n); \
50 b 105f; \
51.section __ex_table,"a"; \
52 .align 2; \
53 .long 8 ## n ## 0b,9 ## n ## 0b; \
54 .long 8 ## n ## 1b,9 ## n ## 0b; \
55 .long 8 ## n ## 2b,9 ## n ## 0b; \
56 .long 8 ## n ## 3b,9 ## n ## 0b; \
57 .long 8 ## n ## 4b,9 ## n ## 1b; \
58 .long 8 ## n ## 5b,9 ## n ## 1b; \
59 .long 8 ## n ## 6b,9 ## n ## 1b; \
60 .long 8 ## n ## 7b,9 ## n ## 1b; \
61 .text
62
/*
 * Start of the code section.  The two .stabs directives emit N_SO
 * entries naming the source directory and file; both reference the
 * local label 0 that follows them.
 */
63 .text
64 .stabs "arch/powerpc/lib/",N_SO,0,0,0f
Sean MacLennan025c0182010-09-01 07:21:21 +000065 .stabs "copy_32.S",N_SO,0,0,0f
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000660:
67
/*
 * Assembler symbols for the cache-line geometry, derived from
 * L1_CACHE_BYTES / L1_CACHE_SHIFT (not defined in this file; presumably
 * supplied by <asm/cache.h>):
 *   CACHELINE_BYTES     size of one cache line in bytes
 *   LG_CACHELINE_BYTES  log2 of that size, used as a shift count
 *   CACHELINE_MASK      mask selecting the byte offset within a line
 */
Stephen Rothwell7dffb722005-10-17 11:50:32 +100068CACHELINE_BYTES = L1_CACHE_BYTES
69LG_CACHELINE_BYTES = L1_CACHE_SHIFT
70CACHELINE_MASK = (L1_CACHE_BYTES-1)
Paul Mackerras14cf11a2005-09-26 16:04:21 +100071
/*
 * memset: fill a region of memory with a byte value.
 *
 * In:   r3 = destination, r4 = fill value (low byte is used),
 *       r5 = byte count
 * Out:  r3 is never written, so the destination pointer is returned
 * Uses: r0, r4, r5, r6, ctr, cr0
 *
 * Strategy: replicate the byte across a word, store one (possibly
 * unaligned) word at the very start, then restart from the word
 * boundary at or below the destination with word stores, and finish
 * with up to three byte stores.
 */
Paul Mackerras14cf11a2005-09-26 16:04:21 +100072_GLOBAL(memset)
/* replicate the low byte of r4 into all four bytes of r4 */
73 rlwimi r4,r4,8,16,23
74 rlwimi r4,r4,16,0,15
/* r6 = dest - 4, so the update-form stores below can use offset 4 */
75 addi r6,r3,-4
76 cmplwi 0,r5,4
77 blt 7f /* fewer than 4 bytes: byte loop only */
/* first word, stored at dest regardless of alignment; r6 = dest */
78 stwu r4,4(r6)
79 beqlr /* count was exactly 4 (cr0 still from the cmplwi): done */
/*
 * r0 = misalignment of dest.  Add it to the count and move r6 back to
 * the word boundary, so the word count below is measured from that
 * boundary; the first aligned word is the one already covered above.
 */
80 andi. r0,r6,3
81 add r5,r0,r5
82 subf r6,r0,r6
83 srwi r0,r5,2
84 mtctr r0
85 bdz 6f /* account for the word already stored; skip loop if none left */
861: stwu r4,4(r6)
87 bdnz 1b
886: andi. r5,r5,3 /* bytes left after the last whole word */
897: cmpwi 0,r5,0
90 beqlr
91 mtctr r5
/* bias r6 so that stbu at offset 1 hits the first byte not yet written */
92 addi r6,r6,3
938: stbu r4,1(r6)
94 bdnz 8b
95 blr
96
/*
 * memmove: copy r5 bytes from r4 to r3.
 *
 * If the destination address is above the source address (unsigned
 * compare) the copy is done from the end downwards by
 * backwards_memcpy; otherwise execution falls through into memcpy,
 * which copies upwards.  No registers are modified here, only cr0.
 */
Paul Mackerras14cf11a2005-09-26 16:04:21 +100097_GLOBAL(memmove)
98 cmplw 0,r3,r4
99 bgt backwards_memcpy
100 /* fall through */
101
/*
 * memcpy: copy r5 bytes from r4 to r3, in ascending address order.
 * Also the fall-through target of memmove.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Out:  r3 is never written, so the destination pointer is returned
 * Uses: r0, r4-r8, ctr, cr0
 *
 * r4 and r6 are kept 4 below the next byte to transfer so that the
 * update-form word accesses can use positive offsets.  Only the
 * destination is brought to word alignment; the source may remain
 * unaligned.
 */
102_GLOBAL(memcpy)
103 srwi. r7,r5,3 /* r7 = number of 8-byte groups */
104 addi r6,r3,-4
105 addi r4,r4,-4
106 beq 2f /* if less than 8 bytes to do */
107 andi. r0,r6,3 /* get dest word aligned */
108 mtctr r7
109 bne 5f
/* main loop: two words (8 bytes) per iteration */
1101: lwz r7,4(r4)
111 lwzu r8,8(r4)
112 stw r7,4(r6)
113 stwu r8,8(r6)
114 bdnz 1b
115 andi. r5,r5,7 /* bytes left over after the 8-byte groups */
/* copy one more word if at least 4 bytes remain */
1162: cmplwi 0,r5,4
117 blt 3f
118 lwzu r0,4(r4)
119 addi r5,r5,-4
120 stwu r0,4(r6)
/* trailing 0-3 bytes */
1213: cmpwi 0,r5,0
122 beqlr
123 mtctr r5
/* re-bias both cursors from -4 to -1 for the byte update forms */
124 addi r4,r4,3
125 addi r6,r6,3
1264: lbzu r0,1(r4)
127 stbu r0,1(r6)
128 bdnz 4b
129 blr
/*
 * Destination not word aligned: copy 4 - (dest & 3) single bytes to
 * align it, then recompute the 8-byte group count and rejoin above.
 * This path is only reached with at least 8 bytes to copy.
 */
1305: subfic r0,r0,4
131 mtctr r0
1326: lbz r7,4(r4)
133 addi r4,r4,1
134 stb r7,4(r6)
135 addi r6,r6,1
136 bdnz 6b
137 subf r5,r0,r5 /* count -= bytes just copied */
138 rlwinm. r7,r5,32-3,3,31 /* r7 = r5 >> 3 */
139 beq 2b
140 mtctr r7
141 b 1b
142
/*
 * backwards_memcpy: copy r5 bytes from r4 to r3, starting at the end
 * of both regions and working towards lower addresses.  memmove
 * branches here when the destination lies above the source.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Out:  r3 is never written, so the destination pointer is returned
 * Uses: r0, r4-r8, ctr, cr0
 *
 * r4 and r6 point just past the last byte still to be transferred and
 * are moved down by the update-form accesses.  Only the destination
 * end is brought to word alignment.
 */
143_GLOBAL(backwards_memcpy)
144 rlwinm. r7,r5,32-3,3,31 /* r7 = r5 >> 3 */
145 add r6,r3,r5
146 add r4,r4,r5
147 beq 2f /* fewer than 8 bytes to do */
148 andi. r0,r6,3 /* r0 = bytes by which the dest end is misaligned */
149 mtctr r7
150 bne 5f
/* main loop: two words (8 bytes) per iteration, descending */
1511: lwz r7,-4(r4)
152 lwzu r8,-8(r4)
153 stw r7,-4(r6)
154 stwu r8,-8(r6)
155 bdnz 1b
156 andi. r5,r5,7 /* bytes left over after the 8-byte groups */
/* copy one more word if at least 4 bytes remain */
1572: cmplwi 0,r5,4
158 blt 3f
159 lwzu r0,-4(r4)
160 subi r5,r5,4
161 stwu r0,-4(r6)
/* remaining 0-3 bytes */
1623: cmpwi 0,r5,0
163 beqlr
164 mtctr r5
1654: lbzu r0,-1(r4)
166 stbu r0,-1(r6)
167 bdnz 4b
168 blr
/*
 * Destination end not word aligned: copy r0 single bytes (descending)
 * to align it, then recompute the 8-byte group count and rejoin above.
 * This path is only reached with at least 8 bytes to copy.
 */
1695: mtctr r0
1706: lbzu r7,-1(r4)
171 stbu r7,-1(r6)
172 bdnz 6b
173 subf r5,r0,r5 /* count -= bytes just copied */
174 rlwinm. r7,r5,32-3,3,31 /* r7 = r5 >> 3 */
175 beq 2b
176 mtctr r7
177 b 1b
178
/*
 * __copy_tofrom_user: copy r5 bytes from r4 to r3 where any access may
 * fault.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Out:  r3 = number of bytes NOT copied (0 when everything was copied)
 * Uses: r0, r3-r11, ctr, cr0
 *
 * Phases:
 *   1. bytes, then words, until the destination is cache-line aligned
 *   2. whole cache lines: dcbt on the source, dcbz on the destination
 *      line, then COPY_16_BYTES_WITHEX groups
 *   3. remaining words, then remaining bytes
 *
 * Every instruction that can fault has an __ex_table entry pointing at
 * a fixup further down.  The fixups set r9 (0 = read fault, 1 = write
 * fault) and arrange r5/r3/ctr so that the uncopied byte count is
 * r5 + (ctr << r3).  After a write fault that count is returned at
 * once.  After a read fault the copy is retried one byte at a time and
 * whatever still cannot be read is zero-filled in the destination.
 *
 * r4 and r6 are kept 4 below the next byte to transfer.
 *
 * Numeric labels 111, 112 and 114 are each defined twice in this
 * routine (prefetch set-up and fault fixups); every reference relies on
 * its b/f suffix picking the nearest definition, so take care when
 * moving code around.
 */
179_GLOBAL(__copy_tofrom_user)
180 addi r4,r4,-4
181 addi r6,r3,-4
182 neg r0,r3
183 andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
184 beq 58f
185
186 cmplw 0,r5,r0 /* is this more than total to do? */
187 blt 63f /* if not much to do */
188 andi. r8,r0,3 /* get it word-aligned first */
189 mtctr r8
190 beq+ 61f
19170: lbz r9,4(r4) /* do some bytes */
19271: stb r9,4(r6)
193 addi r4,r4,1
194 addi r6,r6,1
195 bdnz 70b
/* r5 = bytes left once the destination reaches the cache-line boundary */
19661: subf r5,r0,r5
197 srwi. r0,r0,2
198 mtctr r0
199 beq 58f
20072: lwzu r9,4(r4) /* do some words */
20173: stwu r9,4(r6)
202 bdnz 72b
203
204 .section __ex_table,"a"
205 .align 2
206 .long 70b,100f
207 .long 71b,101f
208 .long 72b,102f
209 .long 73b,103f
210 .text
211
21258: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
213 clrlwi r5,r5,32-LG_CACHELINE_BYTES
/* r11 = offset for dcbz, compensating for r6 being 4 below the line */
214 li r11,4
215 beq 63f
216
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000217 /* Here we decide how far ahead to prefetch the source */
/*
 * From here on r3 no longer holds the destination (r6 tracks it); it
 * is the dcbt offset from r4.  r7 = number of lines prefetched ahead.
 */
218 li r3,4
219 cmpwi r0,1
220 li r7,0
221 ble 114f
222 li r7,1
223#if MAX_COPY_PREFETCH > 1
224 /* Heuristically, for large transfers we prefetch
225 MAX_COPY_PREFETCH cachelines ahead. For small transfers
226 we prefetch 1 cacheline ahead. */
227 cmpwi r0,MAX_COPY_PREFETCH
228 ble 112f
229 li r7,MAX_COPY_PREFETCH
230112: mtctr r7
231111: dcbt r3,r4
232 addi r3,r3,CACHELINE_BYTES
233 bdnz 111b
234#else
235 dcbt r3,r4
236 addi r3,r3,CACHELINE_BYTES
237#endif /* MAX_COPY_PREFETCH > 1 */
238
/*
 * Split the work in two passes: this pass copies r8 = total - r7 lines
 * while prefetching r7 lines ahead; r0 keeps the r7 lines that remain
 * for the second pass.
 */
239114: subf r8,r7,r0
240 mr r0,r7
241 mtctr r8
242
24353: dcbt r3,r4
24454: dcbz r11,r6
245 .section __ex_table,"a"
246 .align 2
247 .long 54b,105f
248 .text
249/* the main body of the cacheline loop */
250 COPY_16_BYTES_WITHEX(0)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000251#if L1_CACHE_BYTES >= 32
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000252 COPY_16_BYTES_WITHEX(1)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000253#if L1_CACHE_BYTES >= 64
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000254 COPY_16_BYTES_WITHEX(2)
255 COPY_16_BYTES_WITHEX(3)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000256#if L1_CACHE_BYTES >= 128
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000257 COPY_16_BYTES_WITHEX(4)
258 COPY_16_BYTES_WITHEX(5)
259 COPY_16_BYTES_WITHEX(6)
260 COPY_16_BYTES_WITHEX(7)
261#endif
262#endif
263#endif
264 bdnz 53b
/*
 * If lines were held back for the second pass (r0 != 0), go round
 * again with r7 = 0 so that all of them are copied, with the dcbt
 * offset reset to the current line.
 */
265 cmpwi r0,0
266 li r3,4
267 li r7,0
268 bne 114b
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000269
/* remaining whole words */
27063: srwi. r0,r5,2
271 mtctr r0
272 beq 64f
27330: lwzu r0,4(r4)
27431: stwu r0,4(r6)
275 bdnz 30b
276
/* remaining 0-3 bytes */
27764: andi. r0,r5,3
278 mtctr r0
279 beq+ 65f
28040: lbz r0,4(r4)
28141: stb r0,4(r6)
282 addi r4,r4,1
283 addi r6,r6,1
284 bdnz 40b
/* success: nothing left uncopied */
28565: li r3,0
286 blr
287
288/* read fault, initial single-byte copy */
289100: li r9,0
290 b 90f
291/* write fault, initial single-byte copy */
292101: li r9,1
/* r5 has not yet been reduced here: take off the r8 alignment bytes, ctr adds back those not done */
29390: subf r5,r8,r5
294 li r3,0
295 b 99f
296/* read fault, initial word copy */
297102: li r9,0
298 b 91f
299/* write fault, initial word copy */
300103: li r9,1
30191: li r3,2
302 b 99f
303
304/*
305 * this stuff handles faults in the cacheline loop and branches to either
306 * 104f (if in read part) or 105f (if in write part), after updating r5
307 */
308 COPY_16_BYTES_EXCODE(0)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000309#if L1_CACHE_BYTES >= 32
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000310 COPY_16_BYTES_EXCODE(1)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000311#if L1_CACHE_BYTES >= 64
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000312 COPY_16_BYTES_EXCODE(2)
313 COPY_16_BYTES_EXCODE(3)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000314#if L1_CACHE_BYTES >= 128
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000315 COPY_16_BYTES_EXCODE(4)
316 COPY_16_BYTES_EXCODE(5)
317 COPY_16_BYTES_EXCODE(6)
318 COPY_16_BYTES_EXCODE(7)
319#endif
320#endif
321#endif
322
323/* read fault in cacheline loop */
324104: li r9,0
325 b 92f
326/* fault on dcbz (effectively a write fault) */
327/* or write fault in cacheline loop */
328105: li r9,1
/* lines outstanding = those left in this pass (ctr) + those held back in r0 */
32992: li r3,LG_CACHELINE_BYTES
330 mfctr r8
331 add r0,r0,r8
332 b 106f
333/* read fault in final word loop */
334108: li r9,0
335 b 93f
336/* write fault in final word loop */
337109: li r9,1
33893: andi. r5,r5,3
339 li r3,2
340 b 99f
341/* read fault in final byte loop */
342110: li r9,0
343 b 94f
344/* write fault in final byte loop */
345111: li r9,1
34694: li r5,0
347 li r3,0
348/*
349 * At this stage the number of bytes not copied is
350 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
351 */
35299: mfctr r0
353106: slw r3,r0,r3
354 add. r3,r3,r5
355 beq 120f /* shouldn't happen */
356 cmpwi 0,r9,0
357 bne 120f
358/* for a read fault, first try to continue the copy one byte at a time */
359 mtctr r3
360130: lbz r0,4(r4)
361131: stb r0,4(r6)
362 addi r4,r4,1
363 addi r6,r6,1
364 bdnz 130b
365/* then clear out the destination: r3 bytes starting at 4(r6) */
366132: mfctr r3
367 srwi. r0,r3,2
368 li r9,0
369 mtctr r0
370 beq 113f
371112: stwu r9,4(r6)
372 bdnz 112b
373113: andi. r0,r3,3
374 mtctr r0
375 beq 120f
376114: stb r9,4(r6)
377 addi r6,r6,1
378 bdnz 114b
/* common exit: r3 = number of bytes not copied */
379120: blr
380
/*
 * Exception table for the final word/byte loops and for the fixup code
 * itself.  A read fault during the byte-wise retry (130) goes on to the
 * zero-fill at 132; a fault on the retry store (131) or on either
 * zero-fill store (112, 114) returns straight away via 120.
 */
381 .section __ex_table,"a"
382 .align 2
383 .long 30b,108b
384 .long 31b,109b
385 .long 40b,110b
386 .long 41b,111b
387 .long 130b,132b
388 .long 131b,120b
389 .long 112b,120b
390 .long 114b,120b
391 .text