/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are used to try to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
 * general registers. Unaligned copies are handled either by aligning the
 * destination and then using a shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is able
 * to beat it by 30-40% for aligned copies because of the loop unrolling, but
 * in some cases the glibc version is still slightly faster. This lends more
 * credibility to the claim that gcc can generate very good code as long as we
 * are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

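/* On a fault in one of the copy loops below, the exception fixup records the
 * faulting address in this per-cpu structure; the error handlers use it to
 * work out how many bytes were left uncopied. */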
DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)	do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

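/* get_user_space() picks the space id for user accesses: sr3, which holds
 * the current user space id, unless the caller has selected KERNEL_DS, in
 * which case space 0 (the kernel) is used.  get_kernel_space() is always
 * space 0. */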
#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

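/* MERGE() uses shrpw (shift right pair, word) to funnel-shift the register
 * pair w0:w1 right by sh_2 bits.  With sh_1 + sh_2 == 32 this is roughly
 * (w0 << sh_1) | (w1 >> sh_2): the word starting sh_1/8 bytes into w0 on
 * this big-endian machine. */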
#define MERGE(w0, sh_1, w1, sh_2) ({					\
	unsigned int _r;						\
	asm volatile (							\
	    "mtsar %3\n"						\
	    "shrpw %1, %2, %%sar, %0\n"					\
	    : "=r"(_r)							\
	    : "r"(w0), "r"(w1), "r"(sh_2)				\
	    );								\
	_r;								\
})
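/* Copies shorter than THRESHOLD bytes go straight to the byte-at-a-time
 * loop; they are assumed too short to repay the alignment setup. */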
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__); printk(KERN_DEBUG fmt, ##args); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#ifndef __LP64__
#define EXC_WORD ".word"
#else
#define EXC_WORD ".dword"
#endif

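/* Each macro below emits one load or store through the given space register,
 * plus an __ex_table entry pairing the instruction's address with a local
 * fixup label (e.g. pmc_load_exc).  If the access faults, the fault handler
 * records the faulting address in the per-cpu exception_data and resumes at
 * that label.  The ",ma" (modify-after) completer makes the *ma variants
 * post-increment the address register; r8 is marked clobbered, presumably
 * for use by the fixup code. */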
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

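/* Prefetch by loading into %r0: the loaded value is discarded, but the
 * access pulls the line into the cache. */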
#ifdef CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

extern inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif

/* Copy from a not-aligned src to an aligned dst, using shifts.  Handles 4 words
 * per loop.  This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
					unsigned long len, unsigned long o_dst,
					unsigned long o_src, unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the word-aligned src to make
	   it aligned for the copy.  */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;
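	/* For example, with src % 4 == 1 we get sh_1 = 8 and sh_2 = 24, and
	 * MERGE(a, 8, b, 24) yields the last three bytes of a followed by the
	 * first byte of b, i.e. the four bytes starting one byte into the
	 * (big-endian) word a. */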

	/* Make src aligned by rounding it down.  */
	src &= -sizeof(unsigned int);

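	/* Preload one or two words and jump into the middle of the unrolled
	 * 4-word loop according to len % 4, so the remainder is handled on
	 * the first pass (the classic glibc scheme). */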
	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;	/* No-op. */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}


/* Returns 0 for success, otherwise returns the number of bytes not transferred. */
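/* Strategy: if src and dst share the same alignment modulo a double word,
 * byte-copy up to a double-word boundary and then move words eight (then
 * four) at a time.  If they only share word alignment, do the same with word
 * copies.  Otherwise align the destination and let copy_dstaligned()
 * shift-and-merge source words.  Short copies and any trailing bytes fall
 * through to the byte-at-a-time loop. */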
unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
#endif

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return 0;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination.  */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
			      o_dst, o_src, o_len);
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}

#ifdef __KERNEL__
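/* The user-copy wrappers load the space registers used by the s_space and
 * d_space macros above: sr1 is the source space and sr2 the destination
 * space, so the same pa_memcpy() body handles kernel-to-kernel,
 * user-to-kernel and kernel-to-user copies. */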
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif