| /* |
| * Optimized memory copy routines. |
| * |
| * Copyright (C) 2004 Randolph Chung <tausq@debian.org> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2, or (at your option) |
| * any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| * |
| * Portions derived from the GNU C Library |
| * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc. |
| * |
| * Several strategies are used to get the best performance for various |
| * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using |
| * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using |
| * general registers. Unaligned copies are handled either by aligning the |
| * destination and then using shift-and-write method, or in a few cases by |
| * falling back to a byte-at-a-time copy. |
| * |
| * I chose to implement this in C because it is easier to maintain and debug, |
| * and in my experiments it appears that the C code generated by gcc (3.3/3.4 |
| * at the time of writing) is fairly optimal. Unfortunately some of the |
| * semantics of the copy routine (exception handling) are difficult to express |
| * in C, so we have to play some tricks to get it to work. |
| * |
| * All the loads and stores are done via explicit asm() code in order to use |
| * the right space registers. |
| * |
| * Testing with various alignments and buffer sizes shows that this code is |
| * often >10x faster than a simple byte-at-a-time copy, even for strangely |
| * aligned operands. It is interesting to note that the glibc version |
| * of memcpy (written in C) is actually quite fast already. This routine is |
| * able to beat it by 30-40% for aligned copies because of the loop unrolling, |
| * but in some cases the glibc version is still slightly faster. This lends |
| * more credibility to the idea that gcc can generate very good code as long as |
| * we are careful. |
| * |
| * TODO: |
| * - cache prefetching needs more experimentation to get optimal settings |
| * - try not to use the post-increment address modifiers; they create additional |
| * interlocks |
| * - replace byte-copy loops with stbys sequences |
| */ |
| |
| #ifdef __KERNEL__ |
| #include <linux/config.h> |
| #include <linux/module.h> |
| #include <linux/compiler.h> |
| #include <asm/uaccess.h> |
| #define s_space "%%sr1" |
| #define d_space "%%sr2" |
| #else |
| #include "memcpy.h" |
| #define s_space "%%sr0" |
| #define d_space "%%sr0" |
| #define pa_memcpy new2_copy |
| #endif |
| |
| DECLARE_PER_CPU(struct exception_data, exception_data); /* per-CPU record; the fault handlers in this file read its fault_addr field */ |
| |
| #define preserve_branch(label) do { /* gives the compiler a C-level jump to 'label' */ \ |
| volatile int dummy; /* deliberately left uninitialised; volatile so that both reads in the test below are really performed */ \ |
| /* The following branch is never taken (dummy is compared with itself), it's just here to */ \ |
| /* prevent gcc from optimizing away our exception code, which is otherwise reached only through the asm labels named in __ex_table. */ \ |
| if (unlikely(dummy != dummy)) \ |
| goto label; \ |
| } while (0) |
| |
| #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3)) /* 0 when get_fs() equals KERNEL_DS, otherwise the value of mfsp(3) (presumably the contents of space register 3 -- confirm) */ |
| #define get_kernel_space() (0) /* the kernel space id is the constant 0 */ |
| |
| #define MERGE(w0, sh_1, w1, sh_2) ({ /* builds one destination word out of the two source words w0 and w1 */ \ |
| unsigned int _r; \ |
| asm volatile ( \ |
| "mtsar %3\n" /* load sh_2 into the shift amount register */ \ |
| "shrpw %1, %2, %%sar, %0\n" /* _r = w0:w1 pair shifted right by %sar (see the PA-RISC ISA description of shrpw) */ \ |
| : "=r"(_r) \ |
| : "r"(w0), "r"(w1), "r"(sh_2) /* note: sh_1 is accepted but never referenced */ \ |
| ); \ |
| _r; \ |
| }) |
| #define THRESHOLD 16 /* pa_memcpy() sends copies shorter than this many bytes straight to its byte loop */ |
| |
| /* |
|  * DPRINTF - debug trace, active only when DEBUG_MEMCPY is defined. |
|  * |
|  * The "file:line:function" prefix and the caller's message are emitted by a |
|  * single printk() call, so the line carries exactly one KERN_DEBUG marker and |
|  * cannot be split by output from elsewhere. Because the prefix is pasted onto |
|  * fmt by string concatenation, fmt must be a string literal (every use in |
|  * this file is). Without DEBUG_MEMCPY the macro expands to nothing. |
|  */ |
| #ifdef DEBUG_MEMCPY |
| #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s " fmt, __FILE__, __LINE__, __FUNCTION__ , ##args ); } while (0) |
| #else |
| #define DPRINTF(fmt, args...) |
| #endif |
| |
| #ifndef __LP64__ /* EXC_WORD: assembler directive used to emit each __ex_table entry in the asm macros below */ |
| #define EXC_WORD ".word" |
| #else |
| #define EXC_WORD ".dword" /* chosen when __LP64__ is defined */ |
| #endif |
| |
| #define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) /* one load of _insn with the ",ma" completer: reads through space register _s at base _a into _t, displacement _sz; _tt is the output constraint for _t */ \ |
| __asm__ __volatile__ ( \ |
| "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n" \ |
| "\t.section __ex_table,\"aw\"\n" /* record the pair (address of label 1, _e) in __ex_table so a fault on the load can be redirected to _e; the fixup code itself is not in this file */ \ |
| "\t" EXC_WORD "\t1b\n" \ |
| "\t" EXC_WORD "\t" #_e "\n" \ |
| "\t.previous\n" \ |
| : _tt(_t), "+r"(_a) /* _a is both read and written: the instruction updates the base register */ \ |
| : \ |
| : "r8") /* r8 is declared clobbered -- presumably used by the fault fixup path, confirm */ |
| |
| #define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) /* one store of _insn with the ",ma" completer: writes _t through space register _s at base _a, displacement _sz; _tt is the input constraint for _t */ \ |
| __asm__ __volatile__ ( \ |
| "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n" \ |
| "\t.section __ex_table,\"aw\"\n" /* record the pair (address of label 1, _e) in __ex_table so a fault on the store can be redirected to _e; the fixup code itself is not in this file */ \ |
| "\t" EXC_WORD "\t1b\n" \ |
| "\t" EXC_WORD "\t" #_e "\n" \ |
| "\t.previous\n" \ |
| : "+r"(_a) /* _a is both read and written: the instruction updates the base register */ \ |
| : _tt(_t) \ |
| : "r8") /* r8 is declared clobbered -- presumably used by the fault fixup path, confirm */ |
| |
| #define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e) /* byte load, displacement 1; load wrappers take (space, address, target, fault label) */ |
| #define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e) /* byte store, displacement 1; store wrappers take (space, value, address, fault label) */ |
| #define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e) /* word load, displacement 4 */ |
| #define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e) /* word store, displacement 4 */ |
| #define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e) /* floating-point doubleword load, displacement 8; only used in the #if 0 loop of pa_memcpy() */ |
| #define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e) /* floating-point doubleword store, displacement 8; only used in the #if 0 loop of pa_memcpy() */ |
| |
| #define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) /* one plain load of _insn: reads through space register _s at fixed displacement _o from base _a into _t; _a is an input only and is not modified */ \ |
| __asm__ __volatile__ ( \ |
| "1:\t" #_insn " " #_o "(" _s ",%1), %0\n" \ |
| "\t.section __ex_table,\"aw\"\n" /* record the pair (address of label 1, _e) in __ex_table so a fault on the load can be redirected to _e */ \ |
| "\t" EXC_WORD "\t1b\n" \ |
| "\t" EXC_WORD "\t" #_e "\n" \ |
| "\t.previous\n" \ |
| : _tt(_t) \ |
| : "r"(_a) \ |
| : "r8") /* r8 is declared clobbered -- presumably used by the fault fixup path, confirm */ |
| |
| #define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) /* one plain store of _insn: writes _t through space register _s at fixed displacement _o from base _a; there are no output operands */ \ |
| __asm__ __volatile__ ( \ |
| "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n" \ |
| "\t.section __ex_table,\"aw\"\n" /* record the pair (address of label 1, _e) in __ex_table so a fault on the store can be redirected to _e */ \ |
| "\t" EXC_WORD "\t1b\n" \ |
| "\t" EXC_WORD "\t" #_e "\n" \ |
| "\t.previous\n" \ |
| : \ |
| : _tt(_t), "r"(_a) \ |
| : "r8") /* r8 is declared clobbered -- presumably used by the fault fixup path, confirm */ |
| |
| #define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e) /* word load at displacement _o: (space, offset, address, target, fault label) */ |
| #define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e) /* word store at displacement _o: (space, value, offset, address, fault label) */ |
| |
| #ifdef CONFIG_PREFETCH /* prefetch helpers; every call to them in this file is currently commented out or under #if 0 */ |
| extern inline void prefetch_src(const void *addr) |
| { |
| __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr)); /* word load through the source space with %r0 as target -- presumably acts as a read prefetch, confirm against the ISA */ |
| } |
| |
| extern inline void prefetch_dst(const void *addr) |
| { |
| __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr)); /* doubleword load through the destination space with %r0 as target -- presumably acts as a write prefetch, confirm against the ISA */ |
| } |
| #else |
| #define prefetch_src(addr) /* without CONFIG_PREFETCH both helpers expand to nothing */ |
| #define prefetch_dst(addr) |
| #endif |
| |
| /* |
|  * copy_dstaligned - copy whole words from a not-aligned src to an aligned dst, |
|  * using shifts. Handles 4 words per loop. This code is derived from glibc. |
|  * |
|  * dst:   destination address (pa_memcpy() word-aligns it before calling). |
|  * src:   source address; it is rounded down to a word boundary here, so the |
|  *        first load can start up to 3 bytes before src. |
|  * len:   number of words (not bytes) to copy. |
|  * o_dst, o_src, o_len: |
|  *        destination, source and length IN BYTES of the whole pa_memcpy() |
|  *        request; used only to compute the return value after a fault. |
|  * |
|  * Every destination word is built with MERGE() from two neighbouring source |
|  * words. All loads and stores go through the ldw()/stw() macros so that a |
|  * fault lands on cda_ldw_exc / cda_stw_exc below. |
|  * |
|  * Returns 0 on success. After a fault, returns the number of bytes of the |
|  * original request that were not transferred, derived from the fault address |
|  * recorded in the per-CPU exception_data. |
|  */ |
| static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len, unsigned long o_dst, unsigned long o_src, unsigned long o_len) |
| { |
| 	/* gcc complains that a2 and a3 may be uninitialized, but actually |
| 	 * they cannot be. Initialize a2/a3 to shut gcc up. |
| 	 */ |
| 	register unsigned int a0, a1, a2 = 0, a3 = 0; |
| 	int sh_1, sh_2; |
| 	struct exception_data *d; |
| 	unsigned long left; |
| |
| 	/* prefetch_src((const void *)src); */ |
| |
| 	/* Calculate how to shift a word read at the memory operation |
| 	   aligned srcp to make it aligned for copy. */ |
| 	sh_1 = 8 * (src % sizeof(unsigned int)); |
| 	sh_2 = 8 * sizeof(unsigned int) - sh_1; |
| |
| 	/* Make src aligned by rounding it down. */ |
| 	src &= -sizeof(unsigned int); |
| |
| 	/* |
| 	 * Preload two source words and jump into the unrolled loop at the |
| 	 * entry point that leaves a multiple of four words to go. src and dst |
| 	 * are biased so that the fixed offsets 0/4/8/12 used inside the loop |
| 	 * address the right words for that entry point. |
| 	 */ |
| 	switch (len % 4) |
| 	{ |
| 	case 2: |
| 		/* a1 = ((unsigned int *) src)[0]; |
| 		   a2 = ((unsigned int *) src)[1]; */ |
| 		ldw(s_space, 0, src, a1, cda_ldw_exc); |
| 		ldw(s_space, 4, src, a2, cda_ldw_exc); |
| 		src -= 1 * sizeof(unsigned int); |
| 		dst -= 3 * sizeof(unsigned int); |
| 		len += 2; |
| 		goto do1; |
| 	case 3: |
| 		/* a0 = ((unsigned int *) src)[0]; |
| 		   a1 = ((unsigned int *) src)[1]; */ |
| 		ldw(s_space, 0, src, a0, cda_ldw_exc); |
| 		ldw(s_space, 4, src, a1, cda_ldw_exc); |
| 		src -= 0 * sizeof(unsigned int); |
| 		dst -= 2 * sizeof(unsigned int); |
| 		len += 1; |
| 		goto do2; |
| 	case 0: |
| 		if (len == 0) |
| 			return 0; |
| 		/* a3 = ((unsigned int *) src)[0]; |
| 		   a0 = ((unsigned int *) src)[1]; */ |
| 		ldw(s_space, 0, src, a3, cda_ldw_exc); |
| 		ldw(s_space, 4, src, a0, cda_ldw_exc); |
| 		src -=-1 * sizeof(unsigned int);	/* i.e. src advances by one word */ |
| 		dst -= 1 * sizeof(unsigned int); |
| 		len += 0; |
| 		goto do3; |
| 	case 1: |
| 		/* a2 = ((unsigned int *) src)[0]; |
| 		   a3 = ((unsigned int *) src)[1]; */ |
| 		ldw(s_space, 0, src, a2, cda_ldw_exc); |
| 		ldw(s_space, 4, src, a3, cda_ldw_exc); |
| 		src -=-2 * sizeof(unsigned int);	/* i.e. src advances by two words */ |
| 		dst -= 0 * sizeof(unsigned int); |
| 		len -= 1; |
| 		if (len == 0) |
| 			goto do0; |
| 		goto do4; /* No-op. */ |
| 	} |
| |
| 	do |
| 	{ |
| 		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */ |
| do4: |
| 		/* a0 = ((unsigned int *) src)[0]; */ |
| 		ldw(s_space, 0, src, a0, cda_ldw_exc); |
| 		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ |
| 		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); |
| do3: |
| 		/* a1 = ((unsigned int *) src)[1]; */ |
| 		ldw(s_space, 4, src, a1, cda_ldw_exc); |
| 		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */ |
| 		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc); |
| do2: |
| 		/* a2 = ((unsigned int *) src)[2]; */ |
| 		ldw(s_space, 8, src, a2, cda_ldw_exc); |
| 		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */ |
| 		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc); |
| do1: |
| 		/* a3 = ((unsigned int *) src)[3]; */ |
| 		ldw(s_space, 12, src, a3, cda_ldw_exc); |
| 		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */ |
| 		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc); |
| |
| 		src += 4 * sizeof(unsigned int); |
| 		dst += 4 * sizeof(unsigned int); |
| 		len -= 4; |
| 	} |
| 	while (len != 0); |
| |
| do0: |
| 	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ |
| 	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); |
| |
| 	/* Keep the two handlers below alive; they are otherwise entered only |
| 	 * through the asm labels named in the exception table entries. */ |
| 	preserve_branch(handle_load_error); |
| 	preserve_branch(handle_store_error); |
| |
| 	return 0; |
| |
| handle_load_error: |
| 	__asm__ __volatile__ ("cda_ldw_exc:\n"); |
| 	d = &__get_cpu_var(exception_data); |
| 	/* o_len is already a byte count, so it must not be scaled by the word |
| 	 * size. Because src was rounded down above, the faulting word can |
| 	 * start below o_src; report the whole request as not transferred then |
| 	 * instead of returning more than o_len. */ |
| 	if (d->fault_addr < o_src) |
| 		left = o_len; |
| 	else |
| 		left = o_len - (d->fault_addr - o_src); |
| 	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n", |
| 		o_len, d->fault_addr, o_src, left); |
| 	return left; |
| |
| handle_store_error: |
| 	__asm__ __volatile__ ("cda_stw_exc:\n"); |
| 	d = &__get_cpu_var(exception_data); |
| 	/* Same unit rule as above: o_len is in bytes. */ |
| 	left = o_len - (d->fault_addr - o_dst); |
| 	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n", |
| 		o_len, d->fault_addr, o_dst, left); |
| 	return left; |
| } |
| |
| |
| /* |
|  * pa_memcpy - copy len bytes from srcp to dstp, reporting faults. |
|  * |
|  * All loads go through s_space and all stores through d_space (see the |
|  * defines at the top of this file), so the caller selects the address spaces. |
|  * |
|  * Strategy: |
|  *  - len < THRESHOLD: plain byte loop. |
|  *  - src and dst agree modulo sizeof(double): byte-copy up to the next |
|  *    8-byte boundary, then the 32-byte and 16-byte word loops, then a byte |
|  *    tail. (The 64-byte fp-register loop is compiled out with #if 0.) |
|  *  - src and dst agree only modulo sizeof(unsigned int): byte-copy up to the |
|  *    next word boundary, then the same word loops. |
|  *  - otherwise: byte-copy until dst is word-aligned, pass the whole words to |
|  *    copy_dstaligned(), then a byte tail. |
|  * |
|  * Returns 0 for success, otherwise, returns number of bytes not transferred |
|  * (computed from the fault address recorded in the per-CPU exception_data). |
|  */ |
| unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) |
| { |
| 	register unsigned long src, dst, t1, t2, t3; |
| 	register unsigned char *pcs, *pcd; |
| 	register unsigned int *pws, *pwd; |
| 	register double *pds, *pdd; |
| 	unsigned long ret = 0; |
| 	unsigned long o_dst, o_src, o_len; |
| 	struct exception_data *d; |
| |
| 	src = (unsigned long)srcp; |
| 	dst = (unsigned long)dstp; |
| 	pcs = (unsigned char *)srcp; |
| 	pcd = (unsigned char *)dstp; |
| |
| 	/* Remember the original request for the fault handlers. */ |
| 	o_dst = dst; o_src = src; o_len = len; |
| |
| 	/* prefetch_src((const void *)srcp); */ |
| |
| 	if (len < THRESHOLD) |
| 		goto byte_copy; |
| |
| 	/* Check alignment */ |
| 	t1 = (src ^ dst); |
| 	if (unlikely(t1 & (sizeof(double)-1))) |
| 		goto unaligned_copy; |
| |
| 	/* src and dst have same alignment. */ |
| |
| 	/* Copy bytes till we are double-aligned. */ |
| 	t2 = src & (sizeof(double) - 1); |
| 	if (unlikely(t2 != 0)) { |
| 		t2 = sizeof(double) - t2; |
| 		while (t2 && len) { |
| 			/* *pcd++ = *pcs++; */ |
| 			ldbma(s_space, pcs, t3, pmc_load_exc); |
| 			len--; |
| 			stbma(d_space, t3, pcd, pmc_store_exc); |
| 			t2--; |
| 		} |
| 	} |
| |
| 	pds = (double *)pcs; |
| 	pdd = (double *)pcd; |
| |
| #if 0 |
| 	/* Copy 8 doubles at a time */ |
| 	while (len >= 8*sizeof(double)) { |
| 		register double r1, r2, r3, r4, r5, r6, r7, r8; |
| 		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */ |
| 		flddma(s_space, pds, r1, pmc_load_exc); |
| 		flddma(s_space, pds, r2, pmc_load_exc); |
| 		flddma(s_space, pds, r3, pmc_load_exc); |
| 		flddma(s_space, pds, r4, pmc_load_exc); |
| 		fstdma(d_space, r1, pdd, pmc_store_exc); |
| 		fstdma(d_space, r2, pdd, pmc_store_exc); |
| 		fstdma(d_space, r3, pdd, pmc_store_exc); |
| 		fstdma(d_space, r4, pdd, pmc_store_exc); |
| |
| #if 0 |
| 		if (L1_CACHE_BYTES <= 32) |
| 			prefetch_src((char *)pds + L1_CACHE_BYTES); |
| #endif |
| 		flddma(s_space, pds, r5, pmc_load_exc); |
| 		flddma(s_space, pds, r6, pmc_load_exc); |
| 		flddma(s_space, pds, r7, pmc_load_exc); |
| 		flddma(s_space, pds, r8, pmc_load_exc); |
| 		fstdma(d_space, r5, pdd, pmc_store_exc); |
| 		fstdma(d_space, r6, pdd, pmc_store_exc); |
| 		fstdma(d_space, r7, pdd, pmc_store_exc); |
| 		fstdma(d_space, r8, pdd, pmc_store_exc); |
| 		len -= 8*sizeof(double); |
| 	} |
| #endif |
| |
| 	pws = (unsigned int *)pds; |
| 	pwd = (unsigned int *)pdd; |
| |
| word_copy: |
| 	/* 8 words (32 bytes) per iteration. */ |
| 	while (len >= 8*sizeof(unsigned int)) { |
| 		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8; |
| 		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */ |
| 		ldwma(s_space, pws, r1, pmc_load_exc); |
| 		ldwma(s_space, pws, r2, pmc_load_exc); |
| 		ldwma(s_space, pws, r3, pmc_load_exc); |
| 		ldwma(s_space, pws, r4, pmc_load_exc); |
| 		stwma(d_space, r1, pwd, pmc_store_exc); |
| 		stwma(d_space, r2, pwd, pmc_store_exc); |
| 		stwma(d_space, r3, pwd, pmc_store_exc); |
| 		stwma(d_space, r4, pwd, pmc_store_exc); |
| |
| 		ldwma(s_space, pws, r5, pmc_load_exc); |
| 		ldwma(s_space, pws, r6, pmc_load_exc); |
| 		ldwma(s_space, pws, r7, pmc_load_exc); |
| 		ldwma(s_space, pws, r8, pmc_load_exc); |
| 		stwma(d_space, r5, pwd, pmc_store_exc); |
| 		stwma(d_space, r6, pwd, pmc_store_exc); |
| 		stwma(d_space, r7, pwd, pmc_store_exc); |
| 		stwma(d_space, r8, pwd, pmc_store_exc); |
| 		len -= 8*sizeof(unsigned int); |
| 	} |
| |
| 	/* 4 words (16 bytes) per iteration. */ |
| 	while (len >= 4*sizeof(unsigned int)) { |
| 		register unsigned int r1,r2,r3,r4; |
| 		ldwma(s_space, pws, r1, pmc_load_exc); |
| 		ldwma(s_space, pws, r2, pmc_load_exc); |
| 		ldwma(s_space, pws, r3, pmc_load_exc); |
| 		ldwma(s_space, pws, r4, pmc_load_exc); |
| 		stwma(d_space, r1, pwd, pmc_store_exc); |
| 		stwma(d_space, r2, pwd, pmc_store_exc); |
| 		stwma(d_space, r3, pwd, pmc_store_exc); |
| 		stwma(d_space, r4, pwd, pmc_store_exc); |
| 		len -= 4*sizeof(unsigned int); |
| 	} |
| |
| 	pcs = (unsigned char *)pws; |
| 	pcd = (unsigned char *)pwd; |
| |
| byte_copy: |
| 	while (len) { |
| 		/* *pcd++ = *pcs++; */ |
| 		ldbma(s_space, pcs, t3, pmc_load_exc); |
| 		stbma(d_space, t3, pcd, pmc_store_exc); |
| 		len--; |
| 	} |
| |
| 	return 0; |
| |
| unaligned_copy: |
| 	/* possibly we are aligned on a word, but not on a double... */ |
| 	/* The hint covers the whole comparison: src and dst agreeing modulo |
| 	 * the word size is the expected case. */ |
| 	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) { |
| 		t2 = src & (sizeof(unsigned int) - 1); |
| |
| 		if (unlikely(t2 != 0)) { |
| 			t2 = sizeof(unsigned int) - t2; |
| 			/* len >= THRESHOLD here, so it cannot run out. */ |
| 			while (t2) { |
| 				/* *pcd++ = *pcs++; */ |
| 				ldbma(s_space, pcs, t3, pmc_load_exc); |
| 				stbma(d_space, t3, pcd, pmc_store_exc); |
| 				len--; |
| 				t2--; |
| 			} |
| 		} |
| |
| 		pws = (unsigned int *)pcs; |
| 		pwd = (unsigned int *)pcd; |
| 		goto word_copy; |
| 	} |
| |
| 	/* Align the destination. */ |
| 	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) { |
| 		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1)); |
| 		while (t2) { |
| 			/* *pcd++ = *pcs++; */ |
| 			ldbma(s_space, pcs, t3, pmc_load_exc); |
| 			stbma(d_space, t3, pcd, pmc_store_exc); |
| 			len--; |
| 			t2--; |
| 		} |
| 		dst = (unsigned long)pcd; |
| 		src = (unsigned long)pcs; |
| 	} |
| |
| 	/* copy_dstaligned() takes a word count and already returns the number |
| 	 * of bytes not transferred when it hits a fault. */ |
| 	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int), |
| 		o_dst, o_src, o_len); |
| 	if (ret) |
| 		return ret; |
| |
| 	/* Skip the whole words just copied; the remainder goes to byte_copy. */ |
| 	pcs += (len & -sizeof(unsigned int)); |
| 	pcd += (len & -sizeof(unsigned int)); |
| 	len %= sizeof(unsigned int); |
| |
| 	/* Keep the two handlers below alive; they are otherwise entered only |
| 	 * through the asm labels named in the exception table entries. */ |
| 	preserve_branch(handle_load_error); |
| 	preserve_branch(handle_store_error); |
| |
| 	goto byte_copy; |
| |
| handle_load_error: |
| 	__asm__ __volatile__ ("pmc_load_exc:\n"); |
| 	d = &__get_cpu_var(exception_data); |
| 	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n", |
| 		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src); |
| 	return o_len - d->fault_addr + o_src; |
| |
| handle_store_error: |
| 	__asm__ __volatile__ ("pmc_store_exc:\n"); |
| 	d = &__get_cpu_var(exception_data); |
| 	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n", |
| 		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst); |
| 	return o_len - d->fault_addr + o_dst; |
| } |
| |
| #ifdef __KERNEL__ |
| /* |
|  * copy_to_user - copy len bytes from kernel memory at src to user memory at dst. |
|  * |
|  * In kernel builds pa_memcpy() loads through %sr1 (s_space) and stores |
|  * through %sr2 (d_space); mtsp(value, n) is presumably what loads space |
|  * register n, so 1 gets the kernel space id and 2 the user space id. |
|  * |
|  * Returns pa_memcpy()'s result: 0 on success, otherwise the number of bytes |
|  * not transferred. |
|  */ |
| unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len) |
| { |
| 	unsigned long not_copied; |
| |
| 	mtsp(get_kernel_space(), 1);	/* source side */ |
| 	mtsp(get_user_space(), 2);	/* destination side */ |
| 	not_copied = pa_memcpy((void __force *)dst, src, len); |
| |
| 	return not_copied; |
| } |
| |
| /* |
|  * copy_from_user - copy len bytes from user memory at src to kernel memory at dst. |
|  * |
|  * In kernel builds pa_memcpy() loads through %sr1 (s_space) and stores |
|  * through %sr2 (d_space); mtsp(value, n) is presumably what loads space |
|  * register n, so 1 gets the user space id and 2 the kernel space id. |
|  * |
|  * Returns pa_memcpy()'s result: 0 on success, otherwise the number of bytes |
|  * not transferred. |
|  */ |
| unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len) |
| { |
| 	unsigned long not_copied; |
| |
| 	mtsp(get_user_space(), 1);	/* source side */ |
| 	mtsp(get_kernel_space(), 2);	/* destination side */ |
| 	not_copied = pa_memcpy(dst, (void __force *)src, len); |
| |
| 	return not_copied; |
| } |
| |
| /* |
|  * copy_in_user - copy len bytes between two user-memory buffers. |
|  * |
|  * In kernel builds pa_memcpy() loads through %sr1 (s_space) and stores |
|  * through %sr2 (d_space); both are given the user space id here (mtsp(value, n) |
|  * is presumably what loads space register n). |
|  * |
|  * Returns pa_memcpy()'s result: 0 on success, otherwise the number of bytes |
|  * not transferred. |
|  */ |
| unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len) |
| { |
| 	unsigned long not_copied; |
| |
| 	mtsp(get_user_space(), 1);	/* source side */ |
| 	mtsp(get_user_space(), 2);	/* destination side */ |
| 	not_copied = pa_memcpy((void __force *)dst, (void __force *)src, len); |
| |
| 	return not_copied; |
| } |
| |
| |
| /* |
|  * memcpy - kernel-to-kernel copy of count bytes from src to dst. |
|  * |
|  * Both space registers used by pa_memcpy() (1 for loads, 2 for stores) are |
|  * given the kernel space id. pa_memcpy()'s "bytes not transferred" result is |
|  * intentionally dropped; the function always returns dst. |
|  */ |
| void * memcpy(void * dst,const void *src, size_t count) |
| { |
| 	mtsp(get_kernel_space(), 1);	/* source side */ |
| 	mtsp(get_kernel_space(), 2);	/* destination side */ |
| |
| 	(void)pa_memcpy(dst, src, count); |
| |
| 	return dst; |
| } |
| |
| EXPORT_SYMBOL(copy_to_user); /* export the four copy routines defined above (kernel builds only: this list sits inside #ifdef __KERNEL__) */ |
| EXPORT_SYMBOL(copy_from_user); |
| EXPORT_SYMBOL(copy_in_user); |
| EXPORT_SYMBOL(memcpy); |
| #endif |